From d13933747e947cd31ff1284c56a46df9afec5075 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Fri, 26 Jun 2026 15:28:37 +0100
Subject: [PATCH 01/33] pose/: downstream PnP + cross-run alignment prototype,
 verified vs upstream

A self-contained, pure-Python prototype DOWNSTREAM of the engine seam
([N,H,W,23]); deliberately OUTSIDE the validated src/ and NOT wired into
CMake/ctest. It prototypes live, accumulating reconstruction from a moving
camera: recover each view's camera (PnP) and align successive runs.

- focal.py / pnp.py: faithful port of FreeSplatter's scene estimate_poses --
  Weiszfeld shared focal (view 0 only, all pixels; use_first_focal), integer
  pixels, cv2.solvePnPRansac(SQPNP, reprojErr=5, iters=10), cam2world=inv(w2c),
  and the runner's 1/baseline camera rescale. numpy DLT+RANSAC fallback runs
  without cv2.
- align.py: Umeyama similarity fit + RANSAC + a residual ladder (diagnose)
  that detects whether cross-run mismatch is a uniform-scale similarity or a
  nonlinear warp; plus similarity chaining and loop-closure metrics.

Verification (all green):
- check_cv2_parity.py  -- numpy solver == exact cv2.solvePnPRansac on synthetic
  ground truth (~1e-7 clean).
- check_upstream_parity.py -- our WHOLE orchestration vs upstream estimate_poses
  on REAL engine output. Caught and fixed five divergences (the costly one:
  focal averaged over a low-overlap 2nd view gave 507 vs the correct 596, ~15%
  -> 1.35 deg pose error). Now bit-exact (0.00 deg) on 2 scene + 2 object dumps;
  the only residual is upstream's float32 K vs our float64 (a RANSAC inlier-
  boundary precision effect, <=0.5 deg on near-degenerate object data, 0 on
  scene), root-caused and documented, not papered over.
- test_pose.py -- 28 asset-free golden tests (no model/fixtures/cv2).

Empirical finding: cross-run mismatch is a uniform-scale similarity (~11% scene
scale drift), no nonlinear warp -- a 7-DoF similarity is the right alignment
model; diagnose() flags it if that ever changes.

flake.nix: add opencv4 (cv2) for the exact-upstream PnP path; numpy-only
fallback otherwise. Not needed by the engine build/test.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 flake.nix                     |   5 +-
 pose/README.md                |  95 +++++++++++++
 pose/_upstream.py             | 107 +++++++++++++++
 pose/align.py                 | 214 +++++++++++++++++++++++++++++
 pose/check_cv2_parity.py      |  73 ++++++++++
 pose/check_upstream_parity.py | 154 +++++++++++++++++++++
 pose/empirical.py             |  83 ++++++++++++
 pose/focal.py                 |  48 +++++++
 pose/pnp.py                   | 245 ++++++++++++++++++++++++++++++++++
 pose/test_pose.py             | 235 ++++++++++++++++++++++++++++++++
 10 files changed, 1258 insertions(+), 1 deletion(-)
 create mode 100644 pose/README.md
 create mode 100644 pose/_upstream.py
 create mode 100644 pose/align.py
 create mode 100644 pose/check_cv2_parity.py
 create mode 100644 pose/check_upstream_parity.py
 create mode 100644 pose/empirical.py
 create mode 100644 pose/focal.py
 create mode 100644 pose/pnp.py
 create mode 100644 pose/test_pose.py

diff --git a/flake.nix b/flake.nix
index 7976a20..5a252a5 100644
--- a/flake.nix
+++ b/flake.nix
@@ -11,7 +11,10 @@
       # hf_dump.py) run in docker/Dockerfile.cuda alongside the GPU model. The
       # M0 gate synthesizes its test GGUF in C++ (tests/test_loader.cpp), so the
       # build + fast test tier need no Python at all.
-      pyEnv = pkgs.python3.withPackages (ps: with ps; [ numpy ]);
+      # opencv4 (cv2): the downstream pose/ prototype mirrors FreeSplatter's
+      # cv2.solvePnPRansac for the exact-upstream PnP path; numpy-only fallback
+      # otherwise. Not needed by the engine build/test.
+      pyEnv = pkgs.python3.withPackages (ps: with ps; [ numpy opencv4 ]);
     in
     {
       devShells.${system} = {
diff --git a/pose/README.md b/pose/README.md
new file mode 100644
index 0000000..aa048d9
--- /dev/null
+++ b/pose/README.md
@@ -0,0 +1,95 @@
+# pose/ — camera recovery + cross-run alignment (downstream prototype)
+
+**Status: research prototype, deliberately OUTSIDE the validated engine.**
+
+The free-splatter.cpp engine is pieces 1–3 only; CLAUDE.md keeps the PnP pose
+solver out of `src/` on purpose. This directory is the *downstream consumer* of
+the engine's `[N, H, W, 23]` output — a self-contained, pure-Python prototype,
+**not wired into the CMake build or `ctest`**. It exists to prototype live,
+accumulating reconstruction from a moving-camera feed.
+
+## Why it's needed
+
+Each `free_splatter_run` over a different photo pair produces gaussians in its
+**own** coordinate system: positions are predicted in the first view's camera
+frame, then the whole scene is rescaled by `s = 1/mean(‖point‖)` (see upstream
+`estimate_poses`). To stitch successive splats into one world you need:
+
+1. **PnP** — recover each view's camera from the per-pixel 2D↔3D correspondences
+   the gaussians hand you for free.
+2. **Cross-run alignment** — because two runs normalize by a *different* `1/d`,
+   the shared content comes out at a different **scale**. Aligning run B into run
+   A is therefore a **similarity** (rotation + translation + one uniform scale,
+   7-DoF), not a rigid transform.
+
+## Upstream recipe being mirrored (TencentARC/FreeSplatter)
+
+`freesplatter/models/model.py::estimate_poses` → DUSt3R-derived `fast_pnp`
+(`utils/recon_util.py`):
+
+Verified against the cached upstream source (`.cache/upstream/{model,runner}.py`)
+and the fetched `recon_util.py`, the **scene** recipe (the live use case) is:
+
+| step | upstream (scene path) | here |
+|---|---|---|
+| correspondences | pixel ↔ gaussian center `X` (ch 0:3) | `pnp.pixel_grid` (integer px) + `points` |
+| validity mask (PnP) | `sigmoid(opacity) > 0.05` | `estimate_poses(opacity_threshold=)` (engine output is already activated) |
+| focal | Weiszfeld, **view 0 only** (`use_first_focal=True`), **all pixels** (`masks=None`) | `estimate_poses(use_first_focal=True)` → `focal.estimate_focal` |
+| pose | `cv2.solvePnPRansac(reprojErr=5, SOLVEPNP_SQPNP, iters=10)`, `cam2world=inv(world2cam)` | `pnp.solve_pnp_cv2` (exact) / `solve_pnp_numpy` (reference) |
+| anchor | none — view 0 is ~identity by construction (gaussians live in view 0's frame) | `estimate_poses` returns raw poses |
+| scene scale | runner rescales **camera centers** so the view0→view1 **baseline = 1**: `s = 1/(‖t₁−t₀‖+1e-2)` | `pnp.rescale_to_baseline` / `estimate_poses(normalize=True)` |
+
+> An earlier draft had three of these wrong (focal averaged over all views, an
+> `inv(c2w[0])` re-anchor, and a `1/mean‖pts‖` scene scale). Driving the *actual*
+> upstream `estimate_poses` on real engine output (see `check_upstream_parity.py`)
+> surfaced and fixed all three — the focal one mattered: averaging a low-overlap
+> second view dragged focal 507 vs the correct 596, a 15% error → 1.35° pose error.
+
+`solve_pnp_cv2` is the byte-for-byte upstream call (needs `opencv`, in the CUDA
+container or the nix shell). `solve_pnp_numpy` is a DLT + RANSAC reference with no
+third-party deps, verified against analytic ground truth so the pipeline runs
+anywhere numpy is.
+
+## Files
+
+- `focal.py` — Weiszfeld shared-focal estimation (mirrors upstream `estimate_focal`).
+- `pnp.py` — DLT/RANSAC + cv2 PnP backends, `estimate_poses` orchestration, and
+  `rescale_to_baseline` (the runner's scene normalization).
+- `align.py` — Umeyama similarity fit, RANSAC variant, the **residual ladder**
+  (`diagnose`: rigid→similarity→affine + depth-correlation) that detects whether a
+  single uniform scale suffices or the mismatch is non-uniform, and similarity
+  chaining / loop-closure metrics.
+- `_upstream.py` — vendored upstream kernels (`fast_pnp` verbatim, `estimate_focal`
+  numpy-transcribed) used ONLY by the parity check; not our code.
+- `test_pose.py` — asset-free golden tests (no model, no fixtures, no cv2).
+- `check_cv2_parity.py` — `solve_pnp_numpy` vs the exact `cv2.solvePnPRansac` on
+  synthetic ground truth (needs cv2).
+- `check_upstream_parity.py` — our whole orchestration vs the upstream recipe on
+  REAL engine output (needs cv2 + a dumped `[N,H,W,23]` `.f32`).
+- `empirical.py` — cross-run residual-ladder on real engine output (the scale test).
+
+## Run the tests
+
+```sh
+nix develop -c python3 pose/test_pose.py                      # golden, numpy only
+nix develop -c python3 pose/check_cv2_parity.py               # solver vs cv2 (needs cv2)
+EMP_DIR=/path/to/dumps nix develop -c python3 \
+    pose/check_upstream_parity.py A_scn.f32                   # orchestration vs upstream
+```
+
+## Status
+
+- ✅ **Solver parity** — `solve_pnp_numpy` ≡ `cv2.solvePnPRansac` (~1e-7 clean).
+- ✅ **Orchestration parity** — our `estimate_poses` is **bit-exact** to upstream's
+  scene recipe on real engine output (focal/PnP/relative-pose all 0.00° at matched
+  precision; the only residual is upstream's float32-K vs our float64, a RANSAC
+  inlier-boundary effect of ≤0.5° on near-degenerate object data, 0° on scene).
+- ✅ **Empirical scale test** — cross-run mismatch is a uniform-scale similarity
+  (scene ~11% scale drift), no nonlinear warp; `diagnose` classifies it.
+
+## Not done yet (honest)
+
+- A **dense/known-good control** sequence (other system or GT poses) before
+  trusting the live path — the baseline-sweet-spot experiment.
+- The live pipeline itself (sliding window, sim3 pose-graph to bound drift) and
+  consensus fusion (the floater-removal step) — separate, later.
diff --git a/pose/_upstream.py b/pose/_upstream.py
new file mode 100644
index 0000000..97f7199
--- /dev/null
+++ b/pose/_upstream.py
@@ -0,0 +1,107 @@
+"""Vendored upstream reference for the parity check -- NOT our code.
+
+These are the two numerical kernels FreeSplatter's `estimate_poses` calls, copied
+from TencentARC/FreeSplatter `freesplatter/utils/recon_util.py`
+(https://raw.githubusercontent.com/TencentARC/FreeSplatter/main/freesplatter/utils/recon_util.py,
+fetched 2026-06-26). They exist here ONLY so `check_upstream_parity.py` can run
+the original algorithm against our `focal.py`/`pnp.py` on real engine output.
+
+  * `fast_pnp`  -- copied VERBATIM (it is cv2 + numpy only; runs as-is in nix).
+  * `estimate_focal_np` -- a faithful numpy transcription of upstream
+    `estimate_focal` (the original is torch; the math is identical -- integer
+    `xy_grid` pixels, nan_to_num, mean/mean init, 10 IRLS reweights, the
+    `focal_base` clip which is a no-op at the default min/max). Kept line-by-line
+    so any divergence is a real algorithmic difference, not a re-derivation.
+
+Do not "improve" or refactor this file; its value is being unmodified upstream.
+"""
+import cv2
+import numpy as np
+
+
+def xy_grid(W, H):
+    """Upstream xy_grid (numpy branch): (H,W,2) int, [...,0]=col(x), [...,1]=row(y)."""
+    tw, th = np.arange(W), np.arange(H)
+    gx, gy = np.meshgrid(tw, th, indexing="xy")     # both (H,W)
+    return np.stack([gx, gy], axis=-1)
+
+
+def estimate_focal_np(pts3d, pp=None, mask=None, min_focal=0.0, max_focal=np.inf):
+    """Faithful numpy transcription of upstream recon_util.estimate_focal."""
+    H, W, THREE = pts3d.shape
+    assert THREE == 3
+    if pp is None:
+        pp = np.array([W / 2.0, H / 2.0], dtype=float)
+    pp = np.asarray(pp, float)
+
+    pixels = xy_grid(W, H).reshape(-1, 2).astype(float) - pp.reshape(1, 2)   # (HW,2)
+    pts = pts3d.reshape(H * W, 3).astype(float)
+
+    if mask is not None:
+        m = np.asarray(mask).ravel().astype(bool)
+        assert len(m) == pts.shape[0]
+        pts = pts[m]
+        pixels = pixels[m]
+
+    xy_over_z = pts[:, :2] / pts[:, 2:3]
+    xy_over_z = np.nan_to_num(xy_over_z, posinf=0.0, neginf=0.0)
+
+    dot_xy_px = (xy_over_z * pixels).sum(axis=-1)
+    dot_xy_xy = (xy_over_z ** 2).sum(axis=-1)
+    focal = dot_xy_px.mean() / dot_xy_xy.mean()
+
+    for _ in range(10):
+        dis = np.linalg.norm(pixels - focal * xy_over_z, axis=-1)
+        w = 1.0 / np.clip(dis, 1e-8, None)
+        focal = (w * dot_xy_px).mean() / (w * dot_xy_xy).mean()
+
+    focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2))
+    focal = float(np.clip(focal, min_focal * focal_base, max_focal * focal_base))
+    return focal
+
+
+def fast_pnp(pts3d, mask, focal=None, pp=None, niter_PnP=10, k_dtype=np.float32):
+    """VERBATIM copy of upstream recon_util.fast_pnp (cv2 + numpy).
+
+    The default `k_dtype=np.float32` reproduces upstream byte-for-byte (upstream
+    writes `K = np.float32(...)`). `check_upstream_parity.py` passes float64 to
+    isolate the one precision choice that differs from our wrapper: on near-
+    threshold-dense object data the float32 K nudges a few hundred points across
+    the 5px inlier boundary, shifting the pose ~0.5deg. Matched precision -> exact."""
+    H, W, _ = pts3d.shape
+    pixels = np.mgrid[:W, :H].T.astype(float)
+
+    if focal is None:
+        S = max(W, H)
+        tentative_focals = np.geomspace(S / 2, S * 3, 21)
+    else:
+        tentative_focals = [focal]
+
+    if pp is None:
+        pp = (W / 2, H / 2)
+
+    best = 0,
+    for focal in tentative_focals:
+        K = np.array([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)], dtype=k_dtype)
+
+        success, R, T, inliers = cv2.solvePnPRansac(
+            pts3d[mask], pixels[mask], K, None,
+            iterationsCount=niter_PnP, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP)
+        if not success:
+            continue
+
+        score = len(inliers)
+        if success and score > best[0]:
+            best = score, R, T, focal
+
+    if not best[0]:
+        return None
+
+    _, R, T, best_focal = best
+    R = cv2.Rodrigues(R)[0]      # world to cam
+    world2cam = np.eye(4).astype(float)
+    world2cam[:3, :3] = R
+    world2cam[:3, 3] = T.reshape(3)
+    cam2world = np.linalg.inv(world2cam)
+
+    return best_focal, cam2world
diff --git a/pose/align.py b/pose/align.py
new file mode 100644
index 0000000..55d8652
--- /dev/null
+++ b/pose/align.py
@@ -0,0 +1,214 @@
+"""Coordinate-system normalization: align one inference's world into another's.
+
+DOWNSTREAM of the engine seam. Each free_splatter_run over a different photo
+pair invents its OWN coordinate system (gaussians in the first view's camera
+frame, then rescaled by s = 1/mean-point-distance -- see FreeSplatter
+estimate_poses). Two runs that share a photo therefore describe the SAME surface
+at a DIFFERENT scale, because each run's 1/d normalization is computed over a
+different point support. So aligning run B into run A is a SIMILARITY transform
+(rotation + translation + ONE uniform scale, 7 DoF), not a rigid one.
+
+This module fits that similarity in closed form (Umeyama 1991), provides a
+RANSAC variant for floaters, and -- crucially -- a residual ladder that *detects*
+whether a single uniform scale actually suffices or whether the mismatch is
+non-uniform (focal error / affine depth ambiguity), which is the open question.
+
+Pure numpy. A similarity is represented as the tuple T = (s, R, t) acting on
+column-stacked points by  x -> s * R @ x + t.
+"""
+import numpy as np
+
+
+def apply_sim(T, X):
+    """Apply T=(s,R,t) to X (N,3) -> (N,3)."""
+    s, R, t = T
+    return (s * (R @ X.T)).T + t
+
+
+def _rms(resid):
+    """Root-mean-square point error of an (N,3) residual array."""
+    return float(np.sqrt((resid ** 2).sum(axis=1).mean()))
+
+
+def fit_similarity(X, Y, with_scale=True):
+    """Umeyama: best (s,R,t) minimizing ||s R X + t - Y||^2 in closed form.
+
+    with_scale=False degenerates to rigid Kabsch (s fixed at 1). Reflections are
+    rejected (proper rotation, det R = +1) exactly as Umeyama prescribes.
+    """
+    X = np.asarray(X, float)
+    Y = np.asarray(Y, float)
+    n = X.shape[0]
+    mx = X.mean(0)
+    my = Y.mean(0)
+    Xc = X - mx
+    Yc = Y - my
+    Sigma = (Yc.T @ Xc) / n               # 3x3 cross-covariance, maps X -> Y
+    U, D, Vt = np.linalg.svd(Sigma)
+    S = np.eye(3)
+    if np.linalg.det(U) * np.linalg.det(Vt) < 0:   # reflection guard
+        S[-1, -1] = -1.0
+    R = U @ S @ Vt
+    if with_scale:
+        var_x = (Xc ** 2).sum() / n
+        s = float((D * np.diag(S)).sum() / var_x) if var_x > 0 else 1.0
+    else:
+        s = 1.0
+    t = my - s * (R @ mx)
+    return s, R, t
+
+
+def fit_rigid(X, Y):
+    """Rigid (6-DoF) fit -- similarity with the scale DoF removed."""
+    return fit_similarity(X, Y, with_scale=False)
+
+
+def fit_affine(X, Y):
+    """Full affine (12-DoF) fit: Y ~= A X + b. Returns ((A,b), prediction).
+
+    Used only as the next rung of the residual ladder: how much of the leftover
+    error after a similarity is explained by *non-uniform* (shear/anisotropic)
+    deformation -- the signature of focal error or affine-invariant depth.
+    """
+    n = X.shape[0]
+    Xh = np.hstack([X, np.ones((n, 1))])          # (n,4)
+    M, *_ = np.linalg.lstsq(Xh, Y, rcond=None)    # (4,3)
+    pred = Xh @ M
+    A = M[:3].T
+    b = M[3]
+    return (A, b), pred
+
+
+def fit_similarity_ransac(X, Y, thresh, iters=300, with_scale=True, seed=0):
+    """RANSAC similarity: robust to gross outliers (floaters in the overlap).
+
+    Minimal sample is 3 point pairs (a similarity has 7 DoF; 3 pairs give 9
+    constraints). Returns (s,R,t, inlier_mask).
+    """
+    rng = np.random.default_rng(seed)
+    n = len(X)
+    best = np.zeros(n, bool)
+    for _ in range(iters):
+        idx = rng.choice(n, 3, replace=False)
+        try:
+            T = fit_similarity(X[idx], Y[idx], with_scale)
+        except np.linalg.LinAlgError:
+            continue
+        res = np.linalg.norm(apply_sim(T, X) - Y, axis=1)
+        inl = res < thresh
+        if inl.sum() > best.sum():
+            best = inl
+    if best.sum() < 3:                            # fall back to plain fit
+        best = np.ones(n, bool)
+    T = fit_similarity(X[best], Y[best], with_scale)
+    return (*T, best)
+
+
+def residual_ladder(X, Y):
+    """Fit rigid -> similarity -> affine and report each RMS residual.
+
+    The DROP from rigid->similarity is how much was pure uniform scale; the drop
+    from similarity->affine is how much non-uniform warp remains. depth_corr is
+    the Pearson correlation between the per-point similarity residual magnitude
+    and the point's distance from the centroid -- a positive value means the
+    leftover error GROWS with depth, the fingerprint of focal/projective error.
+    All residuals are absolute (same units as Y); compare against `scene`.
+    """
+    scene = _rms(Y - Y.mean(0))                   # overall extent of the target
+
+    Tr = fit_rigid(X, Y)
+    r_rigid = _rms(apply_sim(Tr, X) - Y)
+
+    Ts = fit_similarity(X, Y, with_scale=True)
+    res_sim = apply_sim(Ts, X) - Y
+    r_sim = _rms(res_sim)
+
+    _, pred_aff = fit_affine(X, Y)
+    r_aff = _rms(pred_aff - Y)
+
+    depth = np.linalg.norm(X - X.mean(0), axis=1)
+    rmag = np.linalg.norm(res_sim, axis=1)
+    depth_corr = float(np.corrcoef(depth, rmag)[0, 1]) if rmag.std() > 1e-12 else 0.0
+
+    return {
+        "scene": scene,
+        "rigid": r_rigid,
+        "similarity": r_sim,
+        "affine": r_aff,
+        "scale": float(Ts[0]),
+        "depth_corr": depth_corr,
+    }
+
+
+def diagnose(X, Y, tol=1e-3, corr_tol=0.3):
+    """Classify the A<->B mismatch from the residual ladder.
+
+    Residuals are judged relative to scene extent. Verdicts:
+      rigid_ok    : a rigid transform already fits -- same scale (rare across runs)
+      needs_scale : a SIMILARITY fits but rigid does not -- the expected case
+      needs_affine: even similarity fails but affine fits -- non-uniform warp
+      nonlinear   : affine fails too -- genuinely nonlinear (e.g. quadratic depth)
+    `depth_corr` is reported as corroborating evidence of structure.
+    """
+    L = residual_ladder(X, Y)
+    sc = L["scene"] or 1.0
+    rr, rs, ra = L["rigid"] / sc, L["similarity"] / sc, L["affine"] / sc
+    # Is the leftover after a similarity STRUCTURED (a real warp) or just noise?
+    # Two independent signatures: affine meaningfully beats similarity, OR the
+    # residual correlates with depth. Neither => unstructured noise floor.
+    aff_gain = (L["similarity"] - L["affine"]) / L["similarity"] if L["similarity"] > 0 else 0.0
+    structured = abs(L["depth_corr"]) > corr_tol or aff_gain > 0.25
+    if rr < tol:
+        v = "rigid_ok"                       # same scale AND pose: rare across runs
+    elif rs < tol:
+        v = "needs_scale"                    # a clean similarity fits: the expected case
+    elif ra < tol:
+        v = "needs_affine"                   # similarity fails, affine fits: non-uniform
+    elif structured:
+        v = "nonlinear"                      # even affine fails AND residual is structured
+    else:
+        v = "similarity_plus_noise"          # similarity is the model; rest is noise floor
+    L["verdict"] = v
+    L["structured"] = bool(structured)
+    L["aff_gain"] = float(aff_gain)
+    return L
+
+
+# --- chaining similarities along the photo stream --------------------------
+
+def compose(T2, T1):
+    """Return T2 . T1 (apply T1 then T2)."""
+    s1, R1, t1 = T1
+    s2, R2, t2 = T2
+    return (s2 * s1, R2 @ R1, s2 * (R2 @ t1) + t2)
+
+
+def invert(T):
+    """Inverse similarity."""
+    s, R, t = T
+    si = 1.0 / s
+    Ri = R.T
+    return (si, Ri, -si * (Ri @ t))
+
+
+def identity():
+    return (1.0, np.eye(3), np.zeros(3))
+
+
+def loop_closure_error(transforms):
+    """Compose a closed loop of similarities and measure deviation from identity.
+
+    transforms: list [T_{0->1}, T_{1->2}, ..., T_{k->0}]. With drift-free links
+    the product is identity; the returned dict quantifies residual scale, rotation
+    (deg) and translation drift -- a direct scale-drift / loop-closure metric.
+    """
+    T = identity()
+    for Ti in transforms:
+        T = compose(Ti, T)
+    s, R, t = T
+    ang = np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1)))
+    return {
+        "scale_err": abs(np.log(s)),          # 0 == no scale drift
+        "rot_deg": float(ang),
+        "trans": float(np.linalg.norm(t)),
+    }
diff --git a/pose/check_cv2_parity.py b/pose/check_cv2_parity.py
new file mode 100644
index 0000000..4f9e8f2
--- /dev/null
+++ b/pose/check_cv2_parity.py
@@ -0,0 +1,73 @@
+"""Verify the numpy DLT/RANSAC PnP against the EXACT upstream solver.
+
+FreeSplatter uses cv2.solvePnPRansac(reprojErr=5, SOLVEPNP_SQPNP). This compares
+our dependency-free reference backend to that exact call on synthetic scenes with
+known ground truth -- both must recover the true pose AND agree with each other.
+Needs cv2 (nix develop, after opencv was added to the flake).
+
+    nix develop -c python3 pose/check_cv2_parity.py
+"""
+import numpy as np
+import pnp
+
+RNG = np.random.default_rng(7)
+fails = []
+
+
+def rot_angle(R):
+    return np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1)))
+
+
+def rand_small_pose(rng):
+    ax = rng.normal(size=3)
+    ax /= np.linalg.norm(ax)
+    ang = np.radians(rng.uniform(3, 20))
+    Kx = np.array([[0, -ax[2], ax[1]], [ax[2], 0, -ax[0]], [-ax[1], ax[0], 0]])
+    R = np.eye(3) + np.sin(ang) * Kx + (1 - np.cos(ang)) * Kx @ Kx
+    t = rng.uniform(-0.6, 0.6, size=3)
+    return R, t
+
+
+def run(label, with_outliers):
+    f = 400.0
+    K = np.array([[f, 0, 256.0], [0, f, 256.0], [0, 0, 1.0]])
+    worst = {"np_R": 0, "np_t": 0, "cv_R": 0, "cv_t": 0, "agree_R": 0, "agree_t": 0}
+    for _ in range(10):
+        Pw = RNG.uniform([-2, -2, 4], [2, 2, 8], size=(500, 3))
+        R, t = rand_small_pose(RNG)
+        Xc = (R @ Pw.T).T + t
+        if np.any(Xc[:, 2] <= 0):
+            continue
+        px = (K @ Xc.T).T
+        px = px[:, :2] / px[:, 2:3]
+        if with_outliers:
+            oi = RNG.choice(len(px), int(0.2 * len(px)), replace=False)
+            px[oi] += RNG.uniform(-120, 120, size=(len(oi), 2))
+
+        c2w_np, _ = pnp.solve_pnp_numpy(Pw, px, K, thresh_px=4.0, iters=300)
+        c2w_cv, _ = pnp.solve_pnp_cv2(Pw, px, K)
+        w_np, w_cv = np.linalg.inv(c2w_np), np.linalg.inv(c2w_cv)
+        worst["np_R"] = max(worst["np_R"], rot_angle(w_np[:3, :3] @ R.T))
+        worst["np_t"] = max(worst["np_t"], np.linalg.norm(w_np[:3, 3] - t))
+        worst["cv_R"] = max(worst["cv_R"], rot_angle(w_cv[:3, :3] @ R.T))
+        worst["cv_t"] = max(worst["cv_t"], np.linalg.norm(w_cv[:3, 3] - t))
+        worst["agree_R"] = max(worst["agree_R"], rot_angle(w_np[:3, :3] @ w_cv[:3, :3].T))
+        worst["agree_t"] = max(worst["agree_t"], np.linalg.norm(w_np[:3, 3] - w_cv[:3, 3]))
+
+    tolR, tolt = (0.5, 0.05) if with_outliers else (1e-3, 1e-4)
+    print(f"{label}: worst-of-10  "
+          f"numpy(R={worst['np_R']:.2e}deg t={worst['np_t']:.2e})  "
+          f"cv2(R={worst['cv_R']:.2e}deg t={worst['cv_t']:.2e})  "
+          f"agree(R={worst['agree_R']:.2e}deg t={worst['agree_t']:.2e})")
+    ok = (worst["np_R"] < tolR and worst["cv_R"] < tolR and
+          worst["agree_R"] < tolR and worst["agree_t"] < tolt)
+    if not ok:
+        fails.append(label)
+    return ok
+
+
+print("PnP backend parity: numpy DLT/RANSAC  vs  upstream cv2.solvePnPRansac")
+run("clean   ", with_outliers=False)
+run("outliers", with_outliers=True)
+print("PARITY OK" if not fails else f"PARITY FAILED: {fails}")
+raise SystemExit(1 if fails else 0)
diff --git a/pose/check_upstream_parity.py b/pose/check_upstream_parity.py
new file mode 100644
index 0000000..6fceb65
--- /dev/null
+++ b/pose/check_upstream_parity.py
@@ -0,0 +1,154 @@
+"""Verify our WHOLE pose orchestration against upstream, on REAL engine output.
+
+`check_cv2_parity.py` already pinned our solver wrapper to cv2.solvePnPRansac on
+synthetic ground truth. This goes one level up: it drives the *entire* upstream
+`estimate_poses` recipe (FreeSplatter scene path: use_first_focal, opacity mask
+for PnP, no focal mask, pnp_iter=10 -- see .cache/upstream/{model,runner}.py)
+using the vendored-verbatim upstream kernels in `_upstream.py`, and compares it
+to our `focal.py`/`pnp.py` on the gaussians the C++ engine actually produced.
+
+Three levels, each isolating one stage so a divergence points at one file:
+  1. focal kernel   -- focal.estimate_focal           vs _upstream.estimate_focal_np
+  2. PnP solver     -- pnp.solve_pnp_cv2              vs _upstream.fast_pnp   (shared focal)
+  3. full pipeline  -- pnp.estimate_poses             vs the upstream recipe replica
+Plus the scene baseline-rescale (runner.run_freesplatter_scene) for reference.
+
+    nix develop -c python3 pose/check_upstream_parity.py A_scn.f32   # scene (live case)
+
+Defaults to the scratchpad dumps if no path is given. Opacity in the engine
+output is ALREADY sigmoid-activated, so mask = (opacity > thr) reproduces
+upstream's sigmoid(raw) > thr exactly.
+"""
+import os
+import sys
+import numpy as np
+import cv2
+
+import focal
+import pnp
+import _upstream as up
+
+C, OPACITY = 23, 15
+THR = 0.05
+SEED = 0           # cv2.solvePnPRansac draws from the shared global cv::theRNG();
+                   # reseed identically before each call so RANSAC picks the SAME
+                   # minimal samples -> isolates algorithmic parity from RNG state.
+SCRATCH = os.environ.get("EMP_DIR", "").rstrip("/")
+_fails = []
+
+
+def rot_angle(R):
+    return float(np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1))))
+
+
+def load(path, H=512, W=512):
+    a = np.fromfile(path, np.float32)
+    N = a.size // (H * W * C)
+    a = a.reshape(N, H, W, C)
+    pts = [a[i, ..., 0:3].astype(np.float64) for i in range(N)]      # (H,W,3) each
+    op = [a[i, ..., OPACITY].astype(np.float64) for i in range(N)]   # (H,W) each
+    return N, H, W, pts, op
+
+
+def check(name, cond, detail=""):
+    tag = "PASS" if cond else "FAIL"
+    print(f"  [{tag}] {name}" + (f"   ({detail})" if detail else ""))
+    if not cond:
+        _fails.append(name)
+
+
+def upstream_estimate_poses(pts, op, H, W, pnp_iter=10):
+    """Replicate FreeSplatter scene estimate_poses with vendored kernels.
+
+    use_first_focal=True, masks=None: focal from view0 over ALL pixels; per-view
+    PnP masked by opacity>THR. Returns (c2ws list, focal)."""
+    pp = np.array([W / 2.0, H / 2.0])
+    f = up.estimate_focal_np(pts[0], pp=pp, mask=None)              # view0, all pixels
+    c2ws = []
+    cv2.setRNGSeed(SEED)
+    for i in range(len(pts)):
+        mask = op[i] > THR                                          # (H,W) bool
+        res = up.fast_pnp(pts[i], mask, focal=f, niter_PnP=pnp_iter,
+                          k_dtype=np.float64)   # matched precision (our wrapper is f64)
+        c2ws.append(res[1])
+    return c2ws, f
+
+
+def main():
+    path = sys.argv[1] if len(sys.argv) > 1 else os.path.join(SCRATCH, "A_scn.f32")
+    if not os.path.isabs(path) and SCRATCH and not os.path.exists(path):
+        path = os.path.join(SCRATCH, path)
+    N, H, W, pts, op = load(path)
+    pp = np.array([W / 2.0, H / 2.0])
+    print(f"=== upstream-parity on {os.path.basename(path)}  ({N} views) ===")
+    for i in range(N):
+        print(f"  view{i}: valid(opacity>{THR}) = {(op[i] > THR).mean()*100:.1f}%")
+
+    # ---- Level 1: focal kernel (view0, all pixels) ----
+    print("\n[1] focal kernel  focal.estimate_focal  vs  upstream.estimate_focal_np")
+    grid = pnp.pixel_grid(H, W)                                     # our pixel convention
+    f_up = up.estimate_focal_np(pts[0], pp=pp, mask=None)
+    f_ours = focal.estimate_focal(pts[0].reshape(-1, 3), grid, pp)
+    rel = abs(f_ours - f_up) / f_up
+    check("focal matches upstream (all pixels, view0)", rel < 1e-6,
+          f"ours {f_ours:.4f} vs up {f_up:.4f}  rel {rel:.2e}")
+
+    # ---- Level 2: PnP solver, shared upstream focal ----
+    print("\n[2] PnP solver  pnp.solve_pnp_cv2  vs  upstream.fast_pnp  (focal fixed)")
+    K = np.array([[f_up, 0, pp[0]], [0, f_up, pp[1]], [0, 0, 1.0]])
+    for i in range(N):
+        mask = op[i] > THR
+        Xw = pts[i].reshape(-1, 3)[mask.ravel()]
+        px = grid[mask.ravel()]
+        cv2.setRNGSeed(SEED)
+        c2w_up = up.fast_pnp(pts[i], mask, focal=f_up, niter_PnP=10,
+                             k_dtype=np.float64)[1]      # matched precision
+        cv2.setRNGSeed(SEED)
+        c2w_ours, _ = pnp.solve_pnp_cv2(Xw, px, K, iters=10, reproj=5.0)
+        dR = rot_angle(c2w_ours[:3, :3] @ c2w_up[:3, :3].T)
+        dt = float(np.linalg.norm(c2w_ours[:3, 3] - c2w_up[:3, 3]))
+        check(f"view{i} cam2world matches upstream", dR < 1e-3 and dt < 1e-4,
+              f"dR {dR:.2e} deg  dt {dt:.2e}")
+        # diagnostic: upstream's native float32 K (precision-only effect)
+        cv2.setRNGSeed(SEED)
+        c2w_f32 = up.fast_pnp(pts[i], mask, focal=f_up, niter_PnP=10)[1]
+        dR32 = rot_angle(c2w_ours[:3, :3] @ c2w_f32[:3, :3].T)
+        print(f"        (upstream native float32 K: dR {dR32:.2e} deg -- "
+              f"RANSAC inlier-boundary precision, not orchestration)")
+
+    # ---- Level 3: full pipeline ----
+    print("\n[3] full pipeline  pnp.estimate_poses  vs  upstream recipe")
+    up_c2ws, f_up2 = upstream_estimate_poses(pts, op, H, W, pnp_iter=10)
+    cv2.setRNGSeed(SEED)
+    out = pnp.estimate_poses(pts, op, opacity_threshold=THR, backend="cv2",
+                             normalize=False, cam_frame_points=pts)
+    our_c2ws, f_our2 = out["cam2world"], out["focal"]
+    relf = abs(f_our2 - f_up2) / f_up2
+    check("pipeline focal matches", relf < 1e-6, f"ours {f_our2:.4f} vs up {f_up2:.4f}")
+    # anchor-invariant relative pose view0->view1 (handles any re-anchor convention)
+    def rel_pose(c):
+        return np.linalg.inv(c[0]) @ c[1]
+    Rup, Rours = rel_pose(up_c2ws), rel_pose(our_c2ws)
+    dR = rot_angle(Rours[:3, :3] @ Rup[:3, :3].T)
+    # compare translation DIRECTION (scale/anchor independent)
+    tup, tour = Rup[:3, 3], Rours[:3, 3]
+    cos_t = float(tup @ tour / (np.linalg.norm(tup) * np.linalg.norm(tour) + 1e-12))
+    check("relative-pose rotation matches", dR < 1e-2, f"dR {dR:.2e} deg")
+    check("relative-pose translation direction matches", cos_t > 1 - 1e-6,
+          f"cos {cos_t:.8f}")
+
+    # ---- scene baseline rescale (runner), reference only ----
+    baseline = float(np.linalg.norm(up_c2ws[1][:3, 3] - up_c2ws[0][:3, 3])) + 1e-2
+    print(f"\n[ref] scene baseline rescale: 1/baseline = {1.0/baseline:.4f} "
+          f"(runner applies this to camera centers; our estimate_poses returns raw "
+          f"poses, normalize=False)")
+
+    print()
+    if _fails:
+        print(f"UPSTREAM PARITY FAILED ({len(_fails)}): " + ", ".join(_fails))
+        raise SystemExit(1)
+    print("UPSTREAM PARITY OK")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pose/empirical.py b/pose/empirical.py
new file mode 100644
index 0000000..bd91498
--- /dev/null
+++ b/pose/empirical.py
@@ -0,0 +1,83 @@
+"""Empirical cross-run test on REAL engine output.
+
+A photo that appears in two separate inferences (an overlapping pair sharing it)
+is reconstructed twice, in two different coordinate systems. Its per-pixel
+gaussian centers from the two runs are the SAME physical points described two
+ways -- exact correspondences. Fitting a similarity between them and reading the
+residual ladder answers the open question on real data: is the cross-run mismatch
+a clean uniform-scale similarity, or does it carry nonlinear warp?
+
+Usage:
+    nix develop -c python3 pose/empirical.py A.f32 viewA B.f32 viewB [label]
+
+A.f32/B.f32 are raw engine outputs [N,H,W,23]; viewA/viewB pick the shared photo
+within each (e.g. it's view 1 of pair (000,004) and view 0 of pair (004,013)).
+"""
+import sys
+import numpy as np
+import align
+
+C = 23
+OPACITY = 15
+
+
+def load_view(path, view, H=512, W=512):
+    a = np.fromfile(path, np.float32)
+    N = a.size // (H * W * C)
+    a = a.reshape(N, H, W, C)
+    g = a[view]
+    return g[..., 0:3].reshape(-1, 3).astype(np.float64), g[..., OPACITY].reshape(-1)
+
+
+def main():
+    pA, vA, pB, vB = sys.argv[1], int(sys.argv[2]), sys.argv[3], int(sys.argv[4])
+    label = sys.argv[5] if len(sys.argv) > 5 else ""
+    XA, oA = load_view(pA, vA)
+    XB, oB = load_view(pB, vB)
+
+    print(f"=== {label} ===")
+    print(f"opacity A: [{oA.min():.3f},{oA.max():.3f}] mean {oA.mean():.3f} | "
+          f"B: [{oB.min():.3f},{oB.max():.3f}] mean {oB.mean():.3f}")
+
+    thr = 0.05
+    mask = (oA > thr) & (oB > thr)
+    A, B = XA[mask], XB[mask]
+    print(f"valid (opacity>{thr} in both): {mask.sum()}/{mask.size} "
+          f"({100*mask.mean():.1f}%)")
+    if mask.sum() < 100:
+        print("too few valid correspondences"); return
+
+    scene = align._rms(A - A.mean(0))
+    # robust fit B -> A; threshold = 2% of scene extent
+    s, R, t, inl = align.fit_similarity_ransac(B, A, thresh=0.02 * scene, iters=400)
+    print(f"scene extent (rms): {scene:.4f}")
+    print(f"robust similarity B->A:  scale={s:.4f}  inliers={inl.sum()}/{len(A)} "
+          f"({100*inl.mean():.1f}%)")
+
+    # how cross-run-consistent is the WHOLE valid set under the robust fit?
+    res_all = np.linalg.norm(align.apply_sim((s, R, t), B) - A, axis=1)
+    print("cross-run consistency (all valid pixels, under robust similarity):")
+    for q in (0.01, 0.02, 0.05, 0.10):
+        print(f"   within {100*q:4.0f}% of scene extent: "
+              f"{100*(res_all < q*scene).mean():5.1f}% of pixels")
+
+    # residual ladder on the inlier (well-corresponded) surface
+    L = align.diagnose(B[inl], A[inl])
+    sc = L["scene"]
+    print("residual ladder (inliers, RMS  |  % of scene extent):")
+    for k in ("rigid", "similarity", "affine"):
+        print(f"   {k:11s} {L[k]:.5f}   {100*L[k]/sc:6.2f}%")
+    print(f"   recovered scale  {L['scale']:.4f}")
+    print(f"   depth_corr       {L['depth_corr']:+.3f}  (structured={L['structured']})")
+    print(f"   VERDICT          {L['verdict']}")
+
+    # interpretation
+    drop_scale = (L["rigid"] - L["similarity"]) / sc
+    drop_aff = (L["similarity"] - L["affine"]) / sc
+    print(f"interpretation: rigid->similarity drop {100*drop_scale:.2f}% "
+          f"(=how much was uniform scale); similarity->affine drop "
+          f"{100*drop_aff:.2f}% (=non-uniform warp).")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pose/focal.py b/pose/focal.py
new file mode 100644
index 0000000..1e2ac27
--- /dev/null
+++ b/pose/focal.py
@@ -0,0 +1,48 @@
+"""Shared focal-length estimation (Weiszfeld), faithful to FreeSplatter.
+
+Upstream (freesplatter/models/model.py::estimate_focal, via DUSt3R) assumes a
+single shared focal across views, centered principal point, square pixels. Per
+view it solves
+
+    f* = argmin_f  sum_ij || (i',j') - f * (X_ij0/X_ij2, X_ij1/X_ij2) ||
+
+where (i',j') are pixel coords relative to the principal point and X is that
+view's camera-frame point map (gaussian xyz). The robust minimizer is found by
+the Weiszfeld algorithm (iteratively reweighted least squares, ~L1). Per-view
+focals are then averaged into one shared value.
+
+Pure numpy -- no cv2 needed for this stage.
+"""
+import numpy as np
+
+
+def estimate_focal(pts3d_cam, pixels, pp, weiszfeld_iters=10):
+    """Weiszfeld focal from camera-frame points and their pixels.
+
+    pts3d_cam : (...,3) points in the view's OWN camera frame (Z>0 in front).
+    pixels    : (...,2) pixel coordinates, same leading shape.
+    pp        : (2,) principal point (image center).
+    """
+    X = np.asarray(pts3d_cam, float).reshape(-1, 3)
+    P = np.asarray(pixels, float).reshape(-1, 2) - np.asarray(pp, float)
+    # ray directions; Z==0 -> 0 exactly as upstream estimate_focal (nan_to_num)
+    u = np.nan_to_num(X[:, :2] / X[:, 2:3], posinf=0.0, neginf=0.0)   # (N,2)
+
+    pu = (P * u).sum(1)                            # per-point p . u
+    uu = (u * u).sum(1)                            # per-point ||u||^2
+    f = pu.sum() / uu.sum()                        # least-squares init
+    for _ in range(weiszfeld_iters):
+        r = np.linalg.norm(P - f * u, axis=1)
+        w = 1.0 / np.maximum(r, 1e-8)
+        f = (w * pu).sum() / (w * uu).sum()
+    return float(f)
+
+
+def estimate_shared_focal(views, pp, weiszfeld_iters=10):
+    """Average per-view focals into one shared focal.
+
+    views: list of (pts3d_cam, pixels) pairs. Mirrors upstream's
+    `focals = focals.mean().repeat(N)`.
+    """
+    fs = [estimate_focal(p3, px, pp, weiszfeld_iters) for (p3, px) in views]
+    return float(np.mean(fs)), fs
diff --git a/pose/pnp.py b/pose/pnp.py
new file mode 100644
index 0000000..85f3dd8
--- /dev/null
+++ b/pose/pnp.py
@@ -0,0 +1,245 @@
+"""PnP camera recovery from the engine's per-pixel gaussians.
+
+Mirrors FreeSplatter's estimate_poses (freesplatter/models/model.py, which calls
+the DUSt3R-derived fast_pnp in utils/recon_util.py):
+
+  * correspondences : every pixel -> its predicted 3D gaussian center X (channels
+    0:3) <-> its 2D pixel coordinate.
+  * validity mask   : opacity > 0.05 (drops floaters before PnP).
+  * focal           : Weiszfeld per view, averaged into one shared focal (focal.py).
+  * pose            : cv2.solvePnPRansac(reprojectionError=5, SOLVEPNP_SQPNP,
+                      iterationsCount=10); world2cam, then cam2world = inv(.);
+                      OpenCV convention; view 0 comes out ~identity by construction
+                      (gaussians are predicted in view 0's frame), no re-anchor.
+  * scene scale     : upstream's `estimate_poses` does NOT normalize; the runner
+                      (run_freesplatter_scene) rescales camera centers so the
+                      view0->view1 baseline is 1.0 (s = 1/(||t1-t0||+1e-2)). See
+                      `rescale_to_baseline`. (An earlier 1/mean||pts|| guess was
+                      wrong -- verified against .cache/upstream/{model,runner}.py.)
+
+Two solver backends, selected by `backend`:
+  - "cv2"  : the exact upstream call (needs opencv; available in the CUDA
+             container or after adding opencv to the nix shell). Use this for
+             numerical parity against the original.
+  - "numpy": a dependency-free DLT + RANSAC reference solver, so the pipeline and
+             its golden tests run anywhere numpy is present. Verified against
+             analytic ground truth in test_pose.py.
+  - "auto" : cv2 if importable, else numpy.
+
+Gaussian channel layout (scene, 23 ch): xyz[0:3] SH[3:15] opacity[15] scale[16:19]
+rotation[19:23]; opacity is assumed already activated (sigmoid) in [0,1].
+"""
+import numpy as np
+
+OPACITY_CHANNEL = 15
+
+
+# --- numpy reference PnP (known intrinsics) --------------------------------
+
+def _dlt(Xw, xn):
+    """Direct Linear Transform for world2cam [R|t] from normalized image coords.
+
+    Xw : (N,3) world points. xn : (N,2) normalized coords ( K^-1 [u v 1] ).
+    Returns the raw 3x4 matrix p (defined up to sign/scale); decode with _decode.
+    """
+    N = len(Xw)
+    A = np.zeros((2 * N, 12))
+    Xh = np.hstack([Xw, np.ones((N, 1))])         # (N,4)
+    A[0::2, 0:4] = Xh
+    A[0::2, 8:12] = -xn[:, 0:1] * Xh
+    A[1::2, 4:8] = Xh
+    A[1::2, 8:12] = -xn[:, 1:2] * Xh
+    _, _, Vt = np.linalg.svd(A)
+    return Vt[-1].reshape(3, 4)
+
+
+def _decode(p, Xw, xn):
+    """Turn a DLT matrix into a proper (R,t) world2cam, resolving sign by
+    reprojection error and cheirality (points must sit in front of the camera)."""
+    best = None
+    for sgn in (1.0, -1.0):
+        M = sgn * p[:, :3]
+        tau = sgn * p[:, 3]
+        U, Sv, Vt = np.linalg.svd(M)
+        d = np.linalg.det(U @ Vt)
+        R = U @ np.diag([1, 1, d]) @ Vt           # nearest proper rotation
+        lam = Sv.mean()                            # M ~= lam * R
+        if lam < 1e-12:
+            continue
+        t = tau / lam
+        Xc = (R @ Xw.T).T + t
+        if np.any(Xc[:, 2] <= 0):
+            err = np.inf
+        else:
+            proj = Xc[:, :2] / Xc[:, 2:3]
+            err = float(np.sqrt(((proj - xn) ** 2).sum(1)).mean())
+        if best is None or err < best[0]:
+            best = (err, R, t)
+    if best is None:
+        raise np.linalg.LinAlgError("PnP decode failed (all-behind camera)")
+    return best[1], best[2], best[0]
+
+
+def solve_pnp_numpy(Xw, pixels, K, thresh_px=5.0, iters=100, seed=0):
+    """RANSAC DLT PnP. Returns (cam2world 4x4, inlier_mask)."""
+    Kinv = np.linalg.inv(K)
+    uv1 = np.hstack([pixels, np.ones((len(pixels), 1))])
+    xn = (Kinv @ uv1.T).T[:, :2]                  # normalized image coords
+    thresh = thresh_px / K[0, 0]                   # px tol -> normalized tol
+
+    rng = np.random.default_rng(seed)
+    n = len(Xw)
+    best = np.zeros(n, bool)
+    for _ in range(iters):
+        idx = rng.choice(n, 6, replace=False)
+        try:
+            R, t, _ = _decode(_dlt(Xw[idx], xn[idx]), Xw[idx], xn[idx])
+        except np.linalg.LinAlgError:
+            continue
+        Xc = (R @ Xw.T).T + t
+        good = Xc[:, 2] > 0
+        proj = np.full((n, 2), 1e9)
+        proj[good] = Xc[good, :2] / Xc[good, 2:3]
+        inl = np.linalg.norm(proj - xn, axis=1) < thresh
+        if inl.sum() > best.sum():
+            best = inl
+    if best.sum() < 6:
+        best = np.ones(n, bool)
+    R, t, _ = _decode(_dlt(Xw[best], xn[best]), Xw[best], xn[best])
+    world2cam = np.eye(4)
+    world2cam[:3, :3] = R
+    world2cam[:3, 3] = t
+    return np.linalg.inv(world2cam), best
+
+
+def solve_pnp_cv2(Xw, pixels, K, iters=20, reproj=5.0):
+    """Exact upstream call (needs cv2). Returns (cam2world, inlier_mask)."""
+    import cv2
+    ok, rvec, tvec, inliers = cv2.solvePnPRansac(
+        Xw.astype(np.float64), pixels.astype(np.float64), K.astype(np.float64),
+        None, iterationsCount=iters, reprojectionError=reproj,
+        flags=cv2.SOLVEPNP_SQPNP)
+    if not ok:
+        raise RuntimeError("cv2.solvePnPRansac failed")
+    R, _ = cv2.Rodrigues(rvec)
+    world2cam = np.eye(4)
+    world2cam[:3, :3] = R
+    world2cam[:3, 3] = tvec[:, 0]
+    mask = np.zeros(len(Xw), bool)
+    if inliers is not None:
+        mask[inliers[:, 0]] = True
+    return np.linalg.inv(world2cam), mask
+
+
+def _have_cv2():
+    try:
+        import cv2  # noqa: F401
+        return True
+    except Exception:
+        return False
+
+
+def solve_pnp(Xw, pixels, K, backend="auto", **kw):
+    if backend == "auto":
+        backend = "cv2" if _have_cv2() else "numpy"
+    if backend == "cv2":
+        return solve_pnp_cv2(Xw, pixels, K, **kw)
+    return solve_pnp_numpy(Xw, pixels, K, **kw)
+
+
+# --- orchestration ---------------------------------------------------------
+
+def pixel_grid(H, W):
+    """(H*W, 2) INTEGER pixel coords [x=col, y=row], row-major -- matches upstream
+    xy_grid / np.mgrid[:W,:H].T exactly (NO half-pixel offset). Row-major so it
+    lines up with a C-order (H,W,3) gaussian map flattened by reshape(-1,3)."""
+    xs, ys = np.meshgrid(np.arange(W), np.arange(H))   # both (H,W): xs=col, ys=row
+    return np.stack([xs.ravel(), ys.ravel()], axis=1).astype(float)
+
+
+def rescale_to_baseline(cam2world):
+    """Scene normalization from runner.run_freesplatter_scene: rescale camera
+    centers so the view0->view(last) baseline is 1.0. Returns (list, scale_factor).
+
+    Mirrors upstream exactly: baseline = ||t_last - t_0|| + 1e-2; s = 1/baseline;
+    every camera translation *= s (rotations untouched)."""
+    t0 = cam2world[0][:3, 3]
+    baseline = float(np.linalg.norm(cam2world[-1][:3, 3] - t0)) + 1e-2
+    s = 1.0 / baseline
+    out = []
+    for c in cam2world:
+        c = c.copy()
+        c[:3, 3] *= s
+        out.append(c)
+    return out, s
+
+
+def estimate_poses(points, opacities=None, focal=None, pp=None,
+                   opacity_threshold=0.05, backend="auto", cam_frame_points=None,
+                   use_first_focal=True, focal_use_opacity_mask=False,
+                   pnp_iter=10, normalize=False):
+    """Faithful port of FreeSplatter's scene estimate_poses (.cache/upstream/model.py)
+    plus the runner's optional baseline rescale.
+
+    Defaults mirror the scene recipe: focal from view 0 over ALL pixels
+    (use_first_focal=True, masks=None), per-view PnP masked by opacity>thr,
+    pnp_iter=10, cam2world=inv(world2cam). estimate_poses returns the RAW poses
+    (view 0 comes out ~identity because the gaussians live in view 0's frame -- no
+    explicit re-anchor, matching upstream). Set normalize=True for the runner's
+    1/baseline camera rescale (the scene viewer normalization).
+
+    points          : list of (H,W,3) gaussian-center maps (view 0 frame).
+    opacities       : list of (H,W) activated opacities for the PnP mask (or None).
+    cam_frame_points: per-view points in each view's OWN frame for focal. With
+                      use_first_focal only view 0 is used, and view 0's frame IS the
+                      world frame, so points[0] suffices; pass None to use `points`.
+    focal_use_opacity_mask: scene=False (all pixels, faithful); object would pass an
+                      alpha mask -- approximated here by the opacity mask if True.
+    Returns dict: cam2world (list of 4x4), focal, scale.
+    """
+    from focal import estimate_focal
+
+    backend_cv2 = backend == "cv2" or (backend == "auto" and _have_cv2())
+
+    N = len(points)
+    H, W = points[0].shape[:2]
+    if pp is None:
+        pp = np.array([W / 2.0, H / 2.0])
+    grid = pixel_grid(H, W)
+
+    masks = []
+    for i in range(N):
+        if opacities is None:
+            masks.append(np.ones(H * W, bool))
+        else:
+            masks.append(opacities[i].ravel() > opacity_threshold)
+
+    # focal: mirror estimate_focals -- view 0 only (use_first_focal), all pixels
+    # unless an opacity mask is explicitly requested.
+    if focal is None:
+        cfp = cam_frame_points if cam_frame_points is not None else points
+        n_focal = 1 if use_first_focal else N
+        fs = []
+        for i in range(n_focal):
+            m = masks[i] if (focal_use_opacity_mask and opacities is not None) \
+                else np.ones(H * W, bool)
+            fs.append(estimate_focal(cfp[i].reshape(-1, 3)[m], grid[m], pp))
+        focal = float(np.mean(fs))
+
+    K = np.array([[focal, 0, pp[0]], [0, focal, pp[1]], [0, 0, 1.0]])
+
+    cam2world = []
+    for i in range(N):
+        Xw = points[i].reshape(-1, 3)[masks[i]]
+        px = grid[masks[i]]
+        if backend_cv2:
+            c2w, _ = solve_pnp_cv2(Xw, px, K, iters=pnp_iter, reproj=5.0)
+        else:
+            c2w, _ = solve_pnp_numpy(Xw, px, K)
+        cam2world.append(c2w)
+
+    scale = 1.0
+    if normalize:
+        cam2world, scale = rescale_to_baseline(cam2world)
+
+    return {"cam2world": cam2world, "focal": focal, "scale": scale}
diff --git a/pose/test_pose.py b/pose/test_pose.py
new file mode 100644
index 0000000..aec4af7
--- /dev/null
+++ b/pose/test_pose.py
@@ -0,0 +1,235 @@
+"""Asset-free golden tests for the pose / coordinate-normalization prototype.
+
+No model, no fixtures, no cv2: synthetic geometry with KNOWN ground truth, the
+way CLAUDE.md wants new ops verified ("pinned to hand-computed references... a
+wrong op should fail with zero fixtures"). Run:
+
+    nix develop -c python3 pose/test_pose.py     # from repo root
+    cd pose && nix develop -c python3 test_pose.py
+
+Each test prints PASS/FAIL with the numbers it checked; exit code is nonzero on
+any failure.
+"""
+import numpy as np
+
+import align
+import focal
+import pnp
+
+RNG = np.random.default_rng(12345)
+_fails = []
+
+
+def check(name, cond, detail: object = ""):
+    tag = "PASS" if cond else "FAIL"
+    print(f"  [{tag}] {name}" + (f"  ({detail})" if detail else ""))
+    if not cond:
+        _fails.append(name)
+
+
+def rand_rotation(rng):
+    Q, _ = np.linalg.qr(rng.normal(size=(3, 3)))
+    if np.linalg.det(Q) < 0:
+        Q[:, 0] = -Q[:, 0]
+    return Q
+
+
+def rot_angle(R):
+    return np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1)))
+
+
+# ===========================================================================
+# Coordinate-system normalization (the scale question)
+# ===========================================================================
+
+def test_similarity_roundtrip():
+    """Recover a KNOWN (s,R,t) exactly -- the fundamental correctness golden."""
+    print("test_similarity_roundtrip")
+    X = RNG.normal(size=(200, 3)) * np.array([3, 2, 5])
+    s_t, R_t, t_t = 1.7, rand_rotation(RNG), np.array([4.0, -1.0, 2.0])
+    Y = align.apply_sim((s_t, R_t, t_t), X)
+    s, R, t = align.fit_similarity(X, Y)
+    check("scale", abs(s - s_t) < 1e-9, f"{s:.12f} vs {s_t}")
+    check("rotation", rot_angle(R @ R_t.T) < 1e-7, f"{rot_angle(R @ R_t.T):.2e} deg")
+    check("translation", np.linalg.norm(t - t_t) < 1e-7)
+    check("residual~0", align._rms(align.apply_sim((s, R, t), X) - Y) < 1e-9)
+
+
+def test_scale_detection():
+    """B differs from A by PURE uniform scale: rigid fails, similarity nails it.
+    This is the detector for 'you need the scale DoF' -- the expected cross-run case."""
+    print("test_scale_detection")
+    X = RNG.normal(size=(300, 3)) * 2.0
+    s_t = 2.5
+    Y = align.apply_sim((s_t, rand_rotation(RNG), np.array([1.0, 2.0, -3.0])), X)
+    L = align.diagnose(X, Y)
+    check("recovered scale", abs(L["scale"] - s_t) < 1e-8, f"{L['scale']:.10f}")
+    check("rigid residual large", L["rigid"] / L["scene"] > 0.1, f"{L['rigid']/L['scene']:.3f}")
+    check("similarity residual ~0", L["similarity"] / L["scene"] < 1e-9)
+    check("verdict needs_scale", L["verdict"] == "needs_scale", L["verdict"])
+
+
+def test_nonlinear_detection():
+    """Affine and genuinely-nonlinear warps are distinguished from pure scale."""
+    print("test_nonlinear_detection")
+    X = RNG.normal(size=(400, 3)) * 2.0 + np.array([0, 0, 6.0])
+
+    # (a) anisotropic (affine) warp: stretch axes unequally -> not a similarity,
+    #     but affine fits exactly.
+    A = np.diag([1.0, 1.6, 0.7]) @ rand_rotation(RNG)
+    Yaff = (A @ X.T).T + np.array([1.0, 0.0, 2.0])
+    L = align.diagnose(X, Yaff)
+    check("affine: similarity fails", L["similarity"] / L["scene"] > 1e-3,
+          f"{L['similarity']/L['scene']:.3f}")
+    check("affine: affine fits", L["affine"] / L["scene"] < 1e-9)
+    check("affine: verdict needs_affine", L["verdict"] == "needs_affine", L["verdict"])
+
+    # (b) nonlinear warp: depth-quadratic stretch (focal-error fingerprint) ->
+    #     even affine cannot remove it, and the residual correlates with depth.
+    Z = X[:, 2]
+    Ynl = X.copy()
+    Ynl[:, 0] *= (1.0 + 0.15 * (Z - Z.mean()) ** 2 / Z.var())
+    L2 = align.diagnose(X, Ynl)
+    check("nonlinear: affine fails", L2["affine"] / L2["scene"] > 1e-3,
+          f"{L2['affine']/L2['scene']:.3f}")
+    check("nonlinear: verdict nonlinear", L2["verdict"] == "nonlinear", L2["verdict"])
+    check("nonlinear: depth-correlated residual", L2["structured"],
+          f"corr={L2['depth_corr']:.2f}")
+
+
+def test_outlier_robustness():
+    """RANSAC similarity survives 30% gross floaters in the overlap."""
+    print("test_outlier_robustness")
+    n = 400
+    X = RNG.normal(size=(n, 3)) * 2.0
+    s_t, R_t, t_t = 1.3, rand_rotation(RNG), np.array([0.5, -2.0, 1.0])
+    Y = align.apply_sim((s_t, R_t, t_t), X)
+    n_out = int(0.3 * n)
+    out_idx = RNG.choice(n, n_out, replace=False)
+    Y[out_idx] += RNG.normal(size=(n_out, 3)) * 10.0           # floaters
+    s, R, t, inl = align.fit_similarity_ransac(X, Y, thresh=0.05)
+    check("scale recovered", abs(s - s_t) < 1e-3, f"{s:.6f} vs {s_t}")
+    check("rotation recovered", rot_angle(R @ R_t.T) < 0.1)
+    check("translation recovered", np.linalg.norm(t - t_t) < 1e-2)
+    check("outliers excluded", (~inl[out_idx]).mean() > 0.9,
+          f"{(~inl[out_idx]).mean()*100:.0f}% of outliers rejected")
+
+
+def test_loop_closure():
+    """A closed loop of consistent links ~= identity; injected scale drift shows."""
+    print("test_loop_closure")
+    links = []
+    for _ in range(4):
+        links.append((RNG.uniform(0.8, 1.2), rand_rotation(RNG), RNG.normal(size=3)))
+    # close the loop exactly: last link = inverse of the composed first three
+    T = align.identity()
+    for Ti in links[:3]:
+        T = align.compose(Ti, T)
+    links_closed = links[:3] + [align.invert(T)]
+    e = align.loop_closure_error(links_closed)
+    check("closed loop ~ identity", e["scale_err"] < 1e-9 and e["rot_deg"] < 1e-6
+          and e["trans"] < 1e-9, f"scale_err={e['scale_err']:.1e}")
+    # now inject a 12% scale error into one link
+    bad = links_closed[:]
+    bad[0] = (bad[0][0] * 1.12, bad[0][1], bad[0][2])
+    e2 = align.loop_closure_error(bad)
+    check("scale drift detected", abs(e2["scale_err"] - np.log(1.12)) < 1e-9,
+          f"scale_err={e2['scale_err']:.4f} (expected {np.log(1.12):.4f})")
+
+
+# ===========================================================================
+# PnP (verified against analytic ground truth, numpy backend)
+# ===========================================================================
+
+def make_intrinsics(f=400.0, H=512, W=512):
+    pp = np.array([W / 2.0, H / 2.0])
+    K = np.array([[f, 0, pp[0]], [0, f, pp[1]], [0, 0, 1.0]])
+    return K, pp, f, H, W
+
+
+def project(Pw, R, t, K):
+    Xc = (R @ Pw.T).T + t
+    proj = (K @ Xc.T).T
+    return proj[:, :2] / proj[:, 2:3], Xc[:, 2]
+
+
+def test_focal_recovery():
+    """Weiszfeld recovers a known focal from exact (point, pixel) pairs."""
+    print("test_focal_recovery")
+    K, pp, f, H, W = make_intrinsics(f=437.0)
+    Xc = RNG.uniform([-2, -2, 4], [2, 2, 9], size=(500, 3))   # camera-frame, Z>0
+    px = f * Xc[:, :2] / Xc[:, 2:3] + pp
+    f_est = focal.estimate_focal(Xc, px, pp)
+    check("exact focal", abs(f_est - f) < 1e-6, f"{f_est:.9f} vs {f}")
+    px_noisy = px + RNG.normal(size=px.shape) * 0.5
+    f_n = focal.estimate_focal(Xc, px_noisy, pp)
+    check("focal under noise", abs(f_n - f) / f < 0.02, f"{f_n:.2f} vs {f}")
+
+
+def test_pnp_recovery():
+    """numpy DLT+RANSAC PnP recovers known per-view extrinsics (cheirality ok)."""
+    print("test_pnp_recovery")
+    K, pp, f, H, W = make_intrinsics()
+    Pw = RNG.uniform([-2, -2, 4], [2, 2, 8], size=(500, 3))   # world == view-0 cam frame
+    worst_R, worst_t = 0.0, 0.0
+    for _ in range(8):
+        # small motion so all points stay in front of the moved camera
+        ax = RNG.normal(size=3)
+        ax /= np.linalg.norm(ax)
+        ang = np.radians(RNG.uniform(3, 20))
+        K_ = np.array([[0, -ax[2], ax[1]], [ax[2], 0, -ax[0]], [-ax[1], ax[0], 0]])
+        R = np.eye(3) + np.sin(ang) * K_ + (1 - np.cos(ang)) * K_ @ K_
+        t = RNG.uniform(-0.6, 0.6, size=3)
+        px, z = project(Pw, R, t, K)
+        if np.any(z <= 0):
+            continue
+        c2w, inl = pnp.solve_pnp_numpy(Pw, px, K)
+        w2c = np.linalg.inv(c2w)
+        worst_R = max(worst_R, rot_angle(w2c[:3, :3] @ R.T))
+        worst_t = max(worst_t, np.linalg.norm(w2c[:3, 3] - t))
+    check("rotation recovered", worst_R < 1e-4, f"worst {worst_R:.2e} deg")
+    check("translation recovered", worst_t < 1e-5, f"worst {worst_t:.2e}")
+
+
+def test_pnp_outliers():
+    """PnP RANSAC tolerates corrupted correspondences (floaters)."""
+    print("test_pnp_outliers")
+    K, pp, f, H, W = make_intrinsics()
+    Pw = RNG.uniform([-2, -2, 4], [2, 2, 8], size=(600, 3))
+    R = rand_rotation(RNG)
+    # keep rotation modest so points stay in front
+    R = np.eye(3) + 0.15 * (R - R.T)
+    R, _ = np.linalg.qr(R)
+    t = np.array([0.3, -0.2, 0.4])
+    px, z = project(Pw, R, t, K)
+    keep = z > 0
+    Pw, px = Pw[keep], px[keep]
+    n_out = int(0.25 * len(px))
+    oi = RNG.choice(len(px), n_out, replace=False)
+    px[oi] += RNG.uniform(-150, 150, size=(n_out, 2))          # corrupt pixels
+    c2w, inl = pnp.solve_pnp_numpy(Pw, px, K, thresh_px=2.0, iters=300)
+    w2c = np.linalg.inv(c2w)
+    check("rotation under outliers", rot_angle(w2c[:3, :3] @ R.T) < 0.2,
+          f"{rot_angle(w2c[:3, :3] @ R.T):.3f} deg")
+    check("translation under outliers", np.linalg.norm(w2c[:3, 3] - t) < 0.02)
+    check("outliers rejected", (~inl[oi]).mean() > 0.85,
+          f"{(~inl[oi]).mean()*100:.0f}% rejected")
+
+
+def main():
+    tests = [
+        test_similarity_roundtrip, test_scale_detection, test_nonlinear_detection,
+        test_outlier_robustness, test_loop_closure,
+        test_focal_recovery, test_pnp_recovery, test_pnp_outliers,
+    ]
+    for t in tests:
+        t()
+    print()
+    if _fails:
+        print(f"FAILED ({len(_fails)}): " + ", ".join(_fails))
+        raise SystemExit(1)
+    print("ALL POSE TESTS OK")
+
+
+if __name__ == "__main__":
+    main()

From 2dfe8f38fee35a63ad1571981c941bcefe89ef3e Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Fri, 26 Jun 2026 16:10:49 +0100
Subject: [PATCH 02/33] pose/: dense GT-posed control (Tanks&Temples OOD,
 RealEstate10K validates pipeline)

A known-good, ground-truth-posed control so a live-path failure can be attributed
to the data/model vs our code -- and a reusable engine-vs-GT harness.

- tt_control.py / tt_experiment.py: Tanks-and-Temples (NSVF) loader (OpenGL->OpenCV
  poses, baseline-vs-stride report) + engine-vs-GT check. Verdict: T&T is OUT OF
  DISTRIBUTION for FreeSplatter-scene (narrow-FOV object orbits) -- opacity confident
  on only 8-17% of pixels, pose error 28-145deg. Kept as harness + negative result.
- re10k_control.py / re10k_fetch.py / re10k_experiment.py: RealEstate10K loader
  (parser + GT geometry, GT focal_512 = fy*512), yt-dlp/ffmpeg frame fetch with a
  dead-video skip, and the engine-vs-GT check. IN DISTRIBUTION -> the control works:
  relative pose recovered to 0.4-1.5deg vs INDEPENDENT GT, opacity confident on
  68-75% of pixels. Validates our PnP beyond the upstream parity, and confirms the
  re10k camera convention.

Findings: (1) the model has a CONSTANT wide-FOV focal bias (recovers ~274 vs GT
~439, ~37%) -- benign for relative accumulation (consistent across runs), off for
metric scale; (2) pose error scales ~linearly (~25% of baseline rotation), no
degeneracy even at ~1.4deg baselines on texture-rich interiors.

Data (T&T ~1.5GB, re10k poses ~720MB, fetched frames) stays under .cache/ and the
scratchpad -- gitignored, never committed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pose/README.md           |  25 +++++++-
 pose/re10k_control.py    | 105 +++++++++++++++++++++++++++++++++
 pose/re10k_experiment.py |  76 ++++++++++++++++++++++++
 pose/re10k_fetch.py      |  84 ++++++++++++++++++++++++++
 pose/tt_control.py       | 124 +++++++++++++++++++++++++++++++++++++++
 pose/tt_experiment.py    |  79 +++++++++++++++++++++++++
 6 files changed, 491 insertions(+), 2 deletions(-)
 create mode 100644 pose/re10k_control.py
 create mode 100644 pose/re10k_experiment.py
 create mode 100644 pose/re10k_fetch.py
 create mode 100644 pose/tt_control.py
 create mode 100644 pose/tt_experiment.py

diff --git a/pose/README.md b/pose/README.md
index aa048d9..431fe59 100644
--- a/pose/README.md
+++ b/pose/README.md
@@ -68,6 +68,17 @@ anywhere numpy is.
   REAL engine output (needs cv2 + a dumped `[N,H,W,23]` `.f32`).
 - `empirical.py` — cross-run residual-ladder on real engine output (the scale test).
 
+### Dense GT-posed control (is it us, or the data/model?)
+
+- `tt_control.py` / `tt_experiment.py` — Tanks-and-Temples (NSVF) loader + engine
+  vs GT-pose check. **Verdict: T&T is OUT OF DISTRIBUTION** for FreeSplatter-scene
+  (narrow-FOV object orbits): opacity confident on only 8–17% of pixels, pose error
+  28–145°. Kept as the harness + the negative result.
+- `re10k_control.py` / `re10k_fetch.py` / `re10k_experiment.py` — RealEstate10K
+  loader (parser + GT geometry), yt-dlp/ffmpeg frame fetch, and the engine vs
+  GT-pose check. **In distribution → the control works**: relative pose recovered
+  to **0.4–1.5°** vs GT, opacity confident on 68–75% of pixels.
+
 ## Run the tests
 
 ```sh
@@ -75,6 +86,10 @@ nix develop -c python3 pose/test_pose.py                      # golden, numpy on
 nix develop -c python3 pose/check_cv2_parity.py               # solver vs cv2 (needs cv2)
 EMP_DIR=/path/to/dumps nix develop -c python3 \
     pose/check_upstream_parity.py A_scn.f32                   # orchestration vs upstream
+# dense control (needs the engine + a GGUF + network for re10k frames):
+nix develop -c python3 pose/re10k_fetch.py CLIP.txt OUTDIR 10
+FS_DEVICE=cpu nix develop -c python3 \
+    pose/re10k_experiment.py CLIP.txt OUTDIR 40 20,40,80,160
 ```
 
 ## Status
@@ -86,10 +101,16 @@ EMP_DIR=/path/to/dumps nix develop -c python3 \
   inlier-boundary effect of ≤0.5° on near-degenerate object data, 0° on scene).
 - ✅ **Empirical scale test** — cross-run mismatch is a uniform-scale similarity
   (scene ~11% scale drift), no nonlinear warp; `diagnose` classifies it.
+- ✅ **Dense GT-posed control** — pipeline validated against INDEPENDENT GT on
+  in-distribution re10k (relative pose to 0.4–1.5°); T&T confirmed OOD. Finding:
+  the model has a constant wide-FOV focal bias (recovers ~274 vs GT ~439) — benign
+  for relative accumulation, off for metric scale.
 
 ## Not done yet (honest)
 
-- A **dense/known-good control** sequence (other system or GT poses) before
-  trusting the live path — the baseline-sweet-spot experiment.
+- **Cross-run consistency / accumulation test** on in-distribution data (the live
+  question: clean accumulation vs needing consensus fusion).
+- A **higher-motion clip** for the wide-baseline sweet-spot sweep (the sample clip
+  is a slow pan, max ~6° over 160 frames).
 - The live pipeline itself (sliding window, sim3 pose-graph to bound drift) and
   consensus fusion (the floater-removal step) — separate, later.
diff --git a/pose/re10k_control.py b/pose/re10k_control.py
new file mode 100644
index 0000000..fe389ea
--- /dev/null
+++ b/pose/re10k_control.py
@@ -0,0 +1,105 @@
+"""RealEstate10K loader -- the dense, GT-posed, IN-DISTRIBUTION control.
+
+FreeSplatter-scene is trained on RealEstate10K-style wide-FOV walkthroughs, so
+(unlike Tanks-and-Temples, which is out of distribution) the model is known-good
+here. Each clip .txt is:
+
+    line 1 : YouTube URL
+    line k : timestamp(us)  fx fy cx cy  0 0  m00 m01 m02 m03 m10 ... m23
+             ^int            ^normalized intrinsics  ^3x4 world->camera, row-major
+
+Intrinsics are resolution-independent (fx normalized by width, fy by height).
+Frames come from the YouTube video at the given timestamps (yt-dlp + ffmpeg).
+
+Camera convention: the pose is world->camera extrinsic P=[R|t] (KP maps a world
+point to the image), OpenCV-style. c2w = inv([[R,t],[0,0,0,1]]). We VALIDATE this
+convention empirically by checking that GT relative rotation agrees with the pose
+FreeSplatter recovers on the same pair (the model is accurate in-distribution).
+
+GT focal after our 512x512 preprocessing (center-crop to height, resize): because
+fy is normalized by height and we crop to the full height then scale to 512,
+    focal_512 = fy * 512    (independent of source resolution).
+
+Pure numpy for parsing; frame extraction shells out to yt-dlp/ffmpeg.
+"""
+import os
+import subprocess
+import numpy as np
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+RE10K = os.environ.get("RE10K_ROOT", os.path.join(HERE, "..", ".cache", "re10k", "RealEstate10K"))
+
+
+def parse_clip(path):
+    """Return (url, frames). frames: list of dicts {ts, fx,fy,cx,cy, w2c(3x4), c2w(4x4)}."""
+    with open(path) as f:
+        lines = [ln.strip() for ln in f if ln.strip()]
+    url = lines[0]
+    frames = []
+    for ln in lines[1:]:
+        v = ln.split()
+        ts = int(v[0])
+        fx, fy, cx, cy = (float(x) for x in v[1:5])
+        # v[5], v[6] are zeros (distortion); v[7:19] are the 3x4 world->cam
+        P = np.array([float(x) for x in v[7:19]], float).reshape(3, 4)
+        w2c = np.eye(4)
+        w2c[:3, :4] = P
+        c2w = np.linalg.inv(w2c)
+        frames.append({"ts": ts, "fx": fx, "fy": fy, "cx": cx, "cy": cy,
+                       "w2c": w2c, "c2w": c2w})
+    return url, frames
+
+
+def gt_focal_512(frame):
+    """GT focal in the engine's 512x512 input (center-crop to height + resize)."""
+    return frame["fy"] * 512.0
+
+
+def rel_pose(c2w_a, c2w_b):
+    return np.linalg.inv(c2w_a) @ c2w_b
+
+
+def rot_deg(R):
+    return float(np.degrees(np.arccos(np.clip((np.trace(R[:3, :3]) - 1) / 2, -1, 1))))
+
+
+def list_clips(split="test"):
+    d = os.path.join(RE10K, split)
+    return sorted(os.path.join(d, f) for f in os.listdir(d) if f.endswith(".txt"))
+
+
+def clip_video_id(url):
+    """YouTube id from the clip header URL."""
+    for key in ("watch?v=", "youtu.be/", "/embed/"):
+        if key in url:
+            return url.split(key)[1][:11]
+    return url.rsplit("/", 1)[-1][:11]
+
+
+def baseline_report(clip_path):
+    url, fr = parse_clip(clip_path)
+    n = len(fr)
+    cen = np.array([f["c2w"][:3, 3] for f in fr])
+    span = float(np.linalg.norm(cen - cen.mean(0), axis=1).mean())
+    print(f"=== {os.path.basename(clip_path)}  ({n} frames)  {url} ===")
+    print(f"video id: {clip_video_id(url)}   duration: "
+          f"{(fr[-1]['ts']-fr[0]['ts'])/1e6:.1f}s   GT focal(512)~{gt_focal_512(fr[0]):.1f}")
+    print(f"camera-center spread (mean dist to centroid): {span:.4f}")
+    print(f"  {'stride':>6} {'pairs':>6} {'rot(deg) med':>13} {'p10..p90':>16} "
+          f"{'baseline med':>13}")
+    for s in (1, 2, 5, 10, 20, 40):
+        rots, bl = [], []
+        for i in range(0, n - s):
+            T = rel_pose(fr[i]["c2w"], fr[i + s]["c2w"])
+            rots.append(rot_deg(T)); bl.append(np.linalg.norm(T[:3, 3]))
+        if not rots:
+            continue
+        rots, bl = np.array(rots), np.array(bl)
+        print(f"  {s:>6} {len(rots):>6} {np.median(rots):>13.2f} "
+              f"{np.percentile(rots,10):>7.2f}..{np.percentile(rots,90):<7.2f} "
+              f"{np.median(bl):>13.4f}")
+
+
+if __name__ == "__main__":
+    import sys
+    baseline_report(sys.argv[1])
diff --git a/pose/re10k_experiment.py b/pose/re10k_experiment.py
new file mode 100644
index 0000000..a2712d3
--- /dev/null
+++ b/pose/re10k_experiment.py
@@ -0,0 +1,76 @@
+"""Run FreeSplatter-scene on RealEstate10K pairs (IN-distribution) and check the
+recovered relative pose + focal against ground truth, vs frame stride (baseline).
+
+Unlike Tanks-and-Temples (out of distribution -> garbage), the model is known-good
+here, so this is a real control: low poseErr confirms our PnP against an INDEPENDENT
+GT (re10k poses), and agreement validates the re10k camera convention.
+
+Frames must already be fetched into frame_dir as fNNNN.png (re10k_fetch.py); both
+anchor and anchor+stride must be among the fetched indices.
+
+    FS_DEVICE=cpu nix develop -c python3 pose/re10k_experiment.py CLIP.txt FRAME_DIR 40 20,40,80,160
+"""
+import os
+import sys
+import subprocess
+import numpy as np
+
+import re10k_control as rc
+import pnp
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+GGUF = os.environ.get("FS_GGUF", os.path.join(HERE, "..", ".cache", "freesplatter-scene-f16.gguf"))
+CLI = os.environ.get("FS_CLI", os.path.join(HERE, "..", "build", "release", "bin", "free_splatter-cli"))
+DEVICE = os.environ.get("FS_DEVICE", "cpu")
+SCRATCH = os.environ.get("SCRATCH", "/tmp")
+
+
+def run_engine(img0, img1, out):
+    r = subprocess.run([CLI, "--device", DEVICE, "--out", out, GGUF, img0, img1],
+                       capture_output=True, text=True)
+    if r.returncode != 0:
+        raise RuntimeError(f"CLI failed: {r.stderr[-500:]}")
+    a = np.fromfile(out, np.float32).reshape(2, 512, 512, 23)
+    pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)]
+    op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)]
+    return pts, op
+
+
+def cos(a, b):
+    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))
+
+
+def experiment(clip, frame_dir, anchor, strides):
+    url, frames = rc.parse_clip(clip)
+    f_gt = rc.gt_focal_512(frames[0])
+    print(f"=== {os.path.basename(clip)}  anchor #{anchor}  GT focal(512)={f_gt:.1f}  "
+          f"device={DEVICE} ===\n{url}")
+    print(f"  {'stride':>6} {'GTrot':>6} {'valid%':>6} {'poseErr':>8} "
+          f"{'transDir':>8} {'scaleRat':>8} {'focal':>7} {'fErr%':>6}")
+    for s in strides:
+        j = anchor + s
+        img0 = os.path.join(frame_dir, f"f{anchor:04d}.png")
+        img1 = os.path.join(frame_dir, f"f{j:04d}.png")
+        if not (os.path.exists(img0) and os.path.exists(img1)):
+            print(f"  {s:>6}  (missing frame {anchor} or {j})"); continue
+        out = os.path.join(SCRATCH, f"re_{anchor}_{s}.f32")
+        pts, op = run_engine(img0, img1, out)
+        res = pnp.estimate_poses(pts, op, normalize=False)
+        c2w, f_est = res["cam2world"], res["focal"]
+        rel = np.linalg.inv(c2w[0]) @ c2w[1]
+        G = rc.rel_pose(frames[anchor]["c2w"], frames[j]["c2w"])
+        poseErr = rc.rot_deg(rel[:3, :3] @ G[:3, :3].T)
+        tdir = cos(rel[:3, 3], G[:3, 3])
+        srat = np.linalg.norm(rel[:3, 3]) / (np.linalg.norm(G[:3, 3]) + 1e-12)
+        validp = min((op[0] > 0.05).mean(), (op[1] > 0.05).mean()) * 100
+        ferr = abs(f_est - f_gt) / f_gt * 100
+        print(f"  {s:>6} {rc.rot_deg(G):>6.2f} {validp:>6.1f} {poseErr:>8.2f} "
+              f"{tdir:>8.4f} {srat:>8.3f} {f_est:>7.1f} {ferr:>6.1f}")
+
+
+if __name__ == "__main__":
+    clip, frame_dir = sys.argv[1], sys.argv[2]
+    anchor = int(sys.argv[3]) if len(sys.argv) > 3 else 40
+    strides = tuple(int(x) for x in sys.argv[4].split(",")) if len(sys.argv) > 4 \
+        else (20, 40, 80, 160)
+    experiment(clip, frame_dir, anchor, strides)
diff --git a/pose/re10k_fetch.py b/pose/re10k_fetch.py
new file mode 100644
index 0000000..94b1770
--- /dev/null
+++ b/pose/re10k_fetch.py
@@ -0,0 +1,84 @@
+"""Fetch a RealEstate10K clip's frames via yt-dlp + ffmpeg -- the standard re10k
+workflow, since the dataset ships only poses + YouTube URLs (frames can't be
+redistributed). Gives us a dense, in-distribution, GT-posed sequence to drive the
+control. Frames are written as fNNNN.png (NNNN = pose-line index) so they pair 1:1
+with re10k_control.parse_clip()'s frames.
+
+    nix develop -c python3 pose/re10k_fetch.py CLIP.txt OUT_DIR [step]
+
+yt-dlp comes from `nix run nixpkgs#yt-dlp`; ffmpeg is on PATH. Dead/blocked videos
+raise -- use find_live_clip to skip them.
+"""
+import os
+import subprocess
+import re10k_control as rc
+
+
+def _ytdlp(*a):
+    return ["nix", "run", "nixpkgs#yt-dlp", "--", *a]
+
+
+def is_live(url, timeout=60):
+    r = subprocess.run(_ytdlp("--no-warnings", "--simulate", "--get-duration", url),
+                       capture_output=True, text=True, timeout=timeout)
+    return r.returncode == 0
+
+
+def download_video(url, out_dir, height=720, timeout=900):
+    os.makedirs(out_dir, exist_ok=True)
+    vid = os.path.join(out_dir, "video.mp4")
+    if os.path.exists(vid) and os.path.getsize(vid) > 0:
+        return vid
+    fmt = (f"bv*[height<={height}][ext=mp4]/bv*[height<={height}]/"
+           f"b[height<={height}]/best")
+    r = subprocess.run(_ytdlp("-f", fmt, "--no-warnings", "--no-playlist",
+                              "-o", vid, url),
+                       capture_output=True, text=True, timeout=timeout)
+    if not (os.path.exists(vid) and os.path.getsize(vid) > 0):
+        raise RuntimeError("yt-dlp failed:\n" + r.stderr[-600:])
+    return vid
+
+
+def extract_frames(vid, frames, indices, out_dir):
+    imgs = {}
+    for i in indices:
+        ts = frames[i]["ts"] / 1e6                     # absolute video time (s)
+        p = os.path.join(out_dir, f"f{i:04d}.png")
+        if not os.path.exists(p):
+            subprocess.run(["ffmpeg", "-nostdin", "-loglevel", "error",
+                            "-ss", f"{ts:.6f}", "-i", vid, "-frames:v", "1",
+                            "-q:v", "2", "-y", p], check=True, timeout=120)
+        imgs[i] = p
+    return imgs
+
+
+def fetch(clip_path, out_dir, indices=None, step=10, height=720):
+    url, frames = rc.parse_clip(clip_path)
+    if indices is None:
+        indices = list(range(0, len(frames), step))
+    vid = download_video(url, out_dir, height)
+    imgs = extract_frames(vid, frames, indices, out_dir)
+    return url, frames, imgs
+
+
+def find_live_clip(split="test", start=0, limit=300, min_frames=120):
+    """First clip in [start, start+limit) that has >= min_frames AND a live video."""
+    for cp in rc.list_clips(split)[start:start + limit]:
+        _, fr = rc.parse_clip(cp)
+        if len(fr) < min_frames:
+            continue
+        try:
+            if is_live(rc.parse_clip(cp)[0]):
+                return cp
+        except subprocess.TimeoutExpired:
+            continue
+    return None
+
+
+if __name__ == "__main__":
+    import sys
+    clip, out_dir = sys.argv[1], sys.argv[2]
+    step = int(sys.argv[3]) if len(sys.argv) > 3 else 10
+    url, frames, imgs = fetch(clip, out_dir, step=step)
+    print(f"got {len(imgs)} frames from {url}")
+    print("indices:", sorted(imgs)[:6], "...", sorted(imgs)[-3:])
diff --git a/pose/tt_control.py b/pose/tt_control.py
new file mode 100644
index 0000000..fdd402f
--- /dev/null
+++ b/pose/tt_control.py
@@ -0,0 +1,124 @@
+"""Tanks-and-Temples (NSVF release) loader -- the dense, GT-posed control.
+
+A known-good, ground-truth-posed sequence so that when the live FreeSplatter path
+misbehaves we can tell whether it's the data/model or our code. NSVF layout per
+scene (.cache/tt/TanksAndTemple/<Scene>/):
+    rgb/{split}_{idx}_{origframe}.png   1920x1080
+    pose/{split}_{idx}_{origframe}.txt  4x4 camera-to-world, OpenGL convention
+    intrinsics.txt                       4x4 (fx 0 cx 0 / 0 fy cy 0 / ...)
+split 0 = train, 1 = test (held-out orbit views). Frames sorted by origframe give
+true capture order.
+
+What this gives the control:
+  * GT relative pose between any two frames  -> check our PnP relative pose.
+  * GT focal after 512x512 preprocessing     -> check FreeSplatter's recovered focal.
+  * GT baseline / parallax per frame-stride   -> drive the baseline sweep.
+
+OpenGL->OpenCV: negate the camera y,z axes (columns 1,2 of the c2w rotation),
+matching the runner's `c2ws[:, :, 1:3] *= -1`. cam2world, OpenCV (x right, y down,
+z forward) -- the same convention our pnp.estimate_poses returns.
+
+Usage:
+    nix develop -c python3 pose/tt_control.py [Scene]      # default Truck
+"""
+import os
+import re
+import sys
+import numpy as np
+
+TT_ROOT = os.environ.get(
+    "TT_ROOT",
+    os.path.join(os.path.dirname(__file__), "..", ".cache", "tt", "TanksAndTemple"))
+
+_GL2CV = np.diag([1.0, -1.0, -1.0, 1.0])      # negate camera y,z axes (right-mul)
+
+
+def gl_to_cv(c2w):
+    """OpenGL camera-to-world -> OpenCV camera-to-world."""
+    return c2w @ _GL2CV
+
+
+def load_intrinsics(scene_dir):
+    M = np.loadtxt(os.path.join(scene_dir, "intrinsics.txt"))
+    return {"fx": float(M[0, 0]), "fy": float(M[1, 1]),
+            "cx": float(M[0, 2]), "cy": float(M[1, 2])}
+
+
+def _origframe(name):
+    # {split}_{idx}_{origframe}.ext -> int(origframe)
+    return int(re.split(r"[_.]", name)[2])
+
+
+def load_scene(scene, split=None):
+    """Return (frames, intr). frames: list of dicts {name, idx, orig, img, c2w(OpenCV)}
+    sorted by capture order. split=0/1 to filter train/test, None for all."""
+    scene_dir = scene if os.path.isdir(scene) else os.path.join(TT_ROOT, scene)
+    intr = load_intrinsics(scene_dir)
+    pose_dir, rgb_dir = os.path.join(scene_dir, "pose"), os.path.join(scene_dir, "rgb")
+    frames = []
+    for pf in os.listdir(pose_dir):
+        if not pf.endswith(".txt"):
+            continue
+        sp = int(pf.split("_")[0])
+        if split is not None and sp != split:
+            continue
+        c2w_gl = np.loadtxt(os.path.join(pose_dir, pf))
+        img = os.path.join(rgb_dir, pf[:-4] + ".png")
+        frames.append({"name": pf[:-4], "split": sp, "orig": _origframe(pf),
+                       "img": img, "c2w": gl_to_cv(c2w_gl)})
+    frames.sort(key=lambda f: f["orig"])
+    return frames, intr
+
+
+def rel_pose(c2w_a, c2w_b):
+    """Camera b expressed in camera a's frame: inv(c2w_a) @ c2w_b (OpenCV)."""
+    return np.linalg.inv(c2w_a) @ c2w_b
+
+
+def rot_deg(R):
+    return float(np.degrees(np.arccos(np.clip((np.trace(R[:3, :3]) - 1) / 2, -1, 1))))
+
+
+def scene_radius(frames):
+    """Mean camera-center distance to the centroid of all camera centers."""
+    cen = np.array([f["c2w"][:3, 3] for f in frames])
+    return float(np.linalg.norm(cen - cen.mean(0), axis=1).mean()), cen.mean(0)
+
+
+def baseline_report(scene="Truck"):
+    frames, intr = load_scene(scene)
+    n = len(frames)
+    radius, _ = scene_radius(frames)
+    f512 = intr["fx"] * 512.0 / 1080.0            # after center-crop 1080 + resize 512
+    print(f"=== {scene}: {n} frames "
+          f"(train {sum(f['split']==0 for f in frames)}, "
+          f"test {sum(f['split']==1 for f in frames)}) ===")
+    print(f"intrinsics 1920x1080: fx={intr['fx']:.1f} fy={intr['fy']:.1f} "
+          f"cx={intr['cx']:.1f} cy={intr['cy']:.1f}")
+    print(f"GT focal after 512x512 preprocess (center-crop 1080 -> resize): "
+          f"{f512:.1f} px  (pp=256)")
+    print(f"orbit radius (mean cam dist to center): {radius:.4f}")
+    print("\nframe-stride sweep -- viewpoint change between frame i and i+stride:")
+    print(f"  {'stride':>6} {'pairs':>6} {'rot(deg) med':>13} {'p10..p90':>16} "
+          f"{'baseline/r med':>15} {'parallax(deg)':>14}")
+    for s in (1, 2, 3, 5, 10, 20, 40):
+        rots, bl = [], []
+        for i in range(0, n - s):
+            T = rel_pose(frames[i]["c2w"], frames[i + s]["c2w"])
+            rots.append(rot_deg(T))
+            bl.append(np.linalg.norm(T[:3, 3]))
+        if not rots:
+            continue
+        rots, bl = np.array(rots), np.array(bl)
+        blr = bl / radius
+        # small-angle parallax ~ baseline / radius (rad) -> deg
+        par = np.degrees(np.median(blr))
+        print(f"  {s:>6} {len(rots):>6} {np.median(rots):>13.2f} "
+              f"{np.percentile(rots,10):>7.2f}..{np.percentile(rots,90):<7.2f} "
+              f"{np.median(blr):>15.3f} {par:>14.2f}")
+    print("\nUse this to pick pair strides for the sweet-spot sweep: too small a "
+          "stride -> ~0 parallax (degenerate depth); too large -> low overlap.")
+
+
+if __name__ == "__main__":
+    baseline_report(sys.argv[1] if len(sys.argv) > 1 else "Truck")
diff --git a/pose/tt_experiment.py b/pose/tt_experiment.py
new file mode 100644
index 0000000..8f9449a
--- /dev/null
+++ b/pose/tt_experiment.py
@@ -0,0 +1,79 @@
+"""Run FreeSplatter-scene on Tanks-and-Temples pairs; check recovered pose + focal
+against GROUND TRUTH as a function of frame stride (baseline). The control: GT
+poses are known-good, so error here is OUR pipeline or the model being out of
+distribution -- not unknown data.
+
+Metrics per pair (view0 -> view1):
+  GTrot    : GT relative rotation (deg) = the viewpoint change we asked for.
+  valid%   : min over views of opacity>0.05 fraction (PnP support).
+  poseErr  : angle between our recovered relative rotation and GT (deg) -- the
+             scale-free "is the pose right" check.
+  transDir : cos angle between our relative-translation direction and GT (1=perfect;
+             FreeSplatter's metric scale is arbitrary, so only direction is checked).
+  scaleRat : ||our t|| / ||GT t||  -- the monocular scale FreeSplatter chose vs metric.
+  focalErr%: |recovered focal - GT focal(512)| / GT.
+
+    FS_DEVICE=cpu nix develop -c python3 pose/tt_experiment.py Truck 120 2,5,10,20
+"""
+import os
+import sys
+import subprocess
+import numpy as np
+
+import tt_control as tt
+import pnp
+
+HERE = os.path.dirname(os.path.abspath(__file__))
+GGUF = os.environ.get("FS_GGUF", os.path.join(HERE, "..", ".cache", "freesplatter-scene-f16.gguf"))
+CLI = os.environ.get("FS_CLI", os.path.join(HERE, "..", "build", "release", "bin", "free_splatter-cli"))
+DEVICE = os.environ.get("FS_DEVICE", "cpu")
+SCRATCH = os.environ.get("SCRATCH", "/tmp")
+
+
+def run_engine(img0, img1, out):
+    r = subprocess.run([CLI, "--device", DEVICE, "--out", out, GGUF, img0, img1],
+                       capture_output=True, text=True)
+    if r.returncode != 0:
+        raise RuntimeError(f"CLI failed: {r.stderr[-500:]}")
+    a = np.fromfile(out, np.float32).reshape(2, 512, 512, 23)
+    pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)]
+    op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)]
+    return pts, op
+
+
+def cos(a, b):
+    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))
+
+
+def experiment(scene="Truck", anchor=120, strides=(2, 5, 10, 20)):
+    frames, intr = tt.load_scene(scene)
+    f_gt = intr["fx"] * 512.0 / 1080.0
+    print(f"=== {scene}  anchor #{anchor} ({frames[anchor]['name']})  "
+          f"GT focal(512)={f_gt:.1f}  device={DEVICE} ===")
+    print(f"  {'stride':>6} {'GTrot':>6} {'valid%':>6} {'poseErr':>8} "
+          f"{'transDir':>8} {'scaleRat':>8} {'focal':>7} {'fErr%':>6}")
+    for s in strides:
+        j = anchor + s
+        if j >= len(frames):
+            continue
+        out = os.path.join(SCRATCH, f"tt_{scene}_{anchor}_{s}.f32")
+        pts, op = run_engine(frames[anchor]["img"], frames[j]["img"], out)
+        res = pnp.estimate_poses(pts, op, normalize=False)     # scene recipe
+        c2w, f_est = res["cam2world"], res["focal"]
+        rel = np.linalg.inv(c2w[0]) @ c2w[1]
+        G = tt.rel_pose(frames[anchor]["c2w"], frames[j]["c2w"])
+        poseErr = tt.rot_deg(rel[:3, :3] @ G[:3, :3].T)
+        tdir = cos(rel[:3, 3], G[:3, 3])
+        srat = np.linalg.norm(rel[:3, 3]) / (np.linalg.norm(G[:3, 3]) + 1e-12)
+        validp = min((op[0] > 0.05).mean(), (op[1] > 0.05).mean()) * 100
+        ferr = abs(f_est - f_gt) / f_gt * 100
+        print(f"  {s:>6} {tt.rot_deg(G):>6.1f} {validp:>6.1f} {poseErr:>8.2f} "
+              f"{tdir:>8.4f} {srat:>8.3f} {f_est:>7.1f} {ferr:>6.1f}")
+
+
+if __name__ == "__main__":
+    scene = sys.argv[1] if len(sys.argv) > 1 else "Truck"
+    anchor = int(sys.argv[2]) if len(sys.argv) > 2 else 120
+    strides = tuple(int(x) for x in sys.argv[3].split(",")) if len(sys.argv) > 3 \
+        else (2, 5, 10, 20)
+    experiment(scene, anchor, strides)

From 59aaea8b61cadd9e267626cb2ed8913637e24153 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Fri, 26 Jun 2026 16:17:19 +0100
Subject: [PATCH 03/33] pose/: cross-run consistency on in-distribution re10k
 (accumulation is clean)

re10k_crossrun.py: a frame shared by two overlapping pairs (m-s,m) and (m,m+s) is
reconstructed twice in two coordinate systems; fit a robust similarity between the
two and read consistency + the residual ladder, swept over baseline stride. Mirrors
empirical.py but on in-distribution data where geometry is good.

Result (shared frame 120): cross-run consistency is HIGH and best at small
baseline -- stride 20 (0.41deg): 65% of pixels agree within 2% of scene extent,
98% within 10%; stride 80 (6deg): 46% / 95%. Versus the OOD doll's 7% / 21%. The
mismatch is a clean uniform-scale similarity everywhere (sim->affine buys ~0.1%,
verdict similarity_plus_noise); per-step scale drift is small and grows with
baseline (1.7% at 0.4deg, 11% at 6deg).

Takeaways for the live path: (1) the small-baseline hypothesis holds -- the sliding
window's small steps land in the high-consistency regime, so accumulation is mostly
clean; (2) consensus fusion is a polish for the residual ~2% floaters, not a
prerequisite; (3) registration is a 7-DoF similarity (no nonlinear warp), with slow
scale drift a sim3 pose-graph can bound.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pose/re10k_crossrun.py | 69 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 pose/re10k_crossrun.py

diff --git a/pose/re10k_crossrun.py b/pose/re10k_crossrun.py
new file mode 100644
index 0000000..422c1d5
--- /dev/null
+++ b/pose/re10k_crossrun.py
@@ -0,0 +1,69 @@
+"""Cross-run consistency on IN-DISTRIBUTION re10k -- the accumulation question.
+
+A shared frame m is reconstructed by two overlapping pairs:
+    run A = (m-s, m)  -> frame m is view 1 (gaussians in frame (m-s)'s world)
+    run B = (m, m+s)  -> frame m is view 0 (gaussians in frame m's world)
+Frame m's per-pixel gaussian centers from A and B are the SAME physical points in
+two coordinate systems -- exact correspondences. Fit a robust similarity B->A and
+read the residual ladder + how many pixels actually agree. High agreement => clean
+accumulation; low => floaters dominate and we need consensus fusion.
+
+This mirrors empirical.py but on data the model handles well (re10k), and sweeps
+the baseline stride s so we can see whether a sweet spot maximizes consistency.
+
+    FS_DEVICE=cpu nix develop -c python3 pose/re10k_crossrun.py CLIP.txt FRAME_DIR 120 20,40,80
+"""
+import os
+import sys
+import numpy as np
+
+import re10k_control as rc
+import re10k_experiment as ex          # run_engine
+import align
+
+SCRATCH = os.environ.get("SCRATCH", "/tmp")
+THR = 0.05
+
+
+def crossrun(clip, frame_dir, m, strides):
+    _, frames = rc.parse_clip(clip)
+    print(f"=== cross-run consistency: {os.path.basename(clip)}  shared frame #{m} ===")
+    print(f"  {'stride':>6} {'GTrot(i-j)':>10} {'valid%':>6} {'scale':>6} "
+          f"{'<1%':>5} {'<2%':>5} {'<5%':>5} {'<10%':>6} {'rigid%':>7} {'sim%':>6} "
+          f"{'aff%':>6} {'verdict':>20}")
+    fp = lambda i: os.path.join(frame_dir, f"f{i:04d}.png")
+    for s in strides:
+        i, j = m - s, m + s
+        if not all(os.path.exists(fp(k)) for k in (i, m, j)):
+            print(f"  {s:>6}  (missing a frame among {i},{m},{j})"); continue
+        ptsA, opA = ex.run_engine(fp(i), fp(m), os.path.join(SCRATCH, f"xrA_{m}_{s}.f32"))
+        ptsB, opB = ex.run_engine(fp(m), fp(j), os.path.join(SCRATCH, f"xrB_{m}_{s}.f32"))
+        XA, oA = ptsA[1].reshape(-1, 3), opA[1].reshape(-1)    # m as view1 of A
+        XB, oB = ptsB[0].reshape(-1, 3), opB[0].reshape(-1)    # m as view0 of B
+        mask = (oA > THR) & (oB > THR)
+        A, B = XA[mask], XB[mask]
+        if mask.sum() < 200:
+            print(f"  {s:>6}  (too few valid: {mask.sum()})"); continue
+        scene = align._rms(A - A.mean(0))
+        sc, R, t, inl = align.fit_similarity_ransac(B, A, thresh=0.02 * scene, iters=400)
+        res = np.linalg.norm(align.apply_sim((sc, R, t), B) - A, axis=1)
+        within = {q: 100 * (res < q * scene).mean() for q in (0.01, 0.02, 0.05, 0.10)}
+        L = align.diagnose(B[inl], A[inl])
+        gtrot = rc.rot_deg(rc.rel_pose(frames[i]["c2w"], frames[j]["c2w"]))
+        print(f"  {s:>6} {gtrot:>10.2f} {100*mask.mean():>6.1f} {sc:>6.3f} "
+              f"{within[0.01]:>5.0f} {within[0.02]:>5.0f} {within[0.05]:>5.0f} "
+              f"{within[0.10]:>6.0f} {100*L['rigid']/L['scene']:>7.1f} "
+              f"{100*L['similarity']/L['scene']:>6.1f} {100*L['affine']/L['scene']:>6.1f} "
+              f"{L['verdict']:>20}")
+    print("\n<q% = fraction of shared-frame pixels whose two reconstructions agree "
+          "within q% of scene extent under the best similarity. High => accumulation\n"
+          "is clean; low => most per-pixel gaussians are partner-dependent floaters "
+          "(need consensus fusion).")
+
+
+if __name__ == "__main__":
+    clip, frame_dir = sys.argv[1], sys.argv[2]
+    m = int(sys.argv[3]) if len(sys.argv) > 3 else 120
+    strides = tuple(int(x) for x in sys.argv[4].split(",")) if len(sys.argv) > 4 \
+        else (20, 40, 80)
+    crossrun(clip, frame_dir, m, strides)

From a3a6641ee883da367cf8ed6c281189248be6ed57 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Fri, 26 Jun 2026 21:33:56 +0100
Subject: [PATCH 04/33] CLAUDE.md: PnP now in scope; C++/Go-only policy,
 no-Python runtime

Policy update directed by the project owner:
- PnP pose recovery and cross-run Sim(3) alignment / accumulation are now IN
  scope (previously out of scope), alongside the engine.
- Everything ships in C++; Go only for the demo web server (purego -> C API ->
  Vulkan + WebGL). The CLI and C API must have NO Python dependency at runtime.
- Python is confined to (1) dev-time reference/conversion/validation in the CUDA
  docker (hf_dump/convert/compare_taps), never a runtime dep, and (2) the pose/
  research prototype TEMPORARILY -- continued in Python only until the approach is
  proven, then rewritten in C++ and the Python deleted.
- Per-component discipline now lists the C++ `pose` component, inheriting the
  parity discipline the Python prototype established (bit-exact to upstream
  estimate_poses; validated vs independent GT poses).

pose/README.md updated to match (temporary prototype, C++ port pending).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md      | 54 ++++++++++++++++++++++++++++++++++++++------------
 pose/README.md | 18 ++++++++++-------
 2 files changed, 52 insertions(+), 20 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 1d1ec3e..d4e3a71 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -2,19 +2,43 @@
 
 ## What this project is
 
-A GGML/C++ inference engine for the **neural-network front half** of
-TencentARC/FreeSplatter: image patch tokenizer → multi-view self-attention
-transformer → per-pixel 3D-Gaussian parameter head. Given N uncalibrated views
-it returns, per input pixel, the activated Gaussian parameters.
-
-**Scope:** pieces 1–3 only. The **rasterizer and the PnP pose solver are OUT OF
-SCOPE** — they are a downstream consumer of the `[N, H, W, gaussian_channels]`
-tensor this engine produces. Do not add them here.
+A GGML/C++ engine for the **neural-network front half** of
+TencentARC/FreeSplatter (image patch tokenizer → multi-view self-attention
+transformer → per-pixel 3D-Gaussian parameter head) **plus the downstream
+camera-pose recovery and cross-run registration that consume its output**. Given
+N uncalibrated views it returns, per input pixel, the activated Gaussian
+parameters; **PnP** then recovers each view's camera, and successive runs are
+aligned into one accumulating world — the path toward live reconstruction from a
+moving camera.
+
+**Scope (updated):** the engine (pieces 1–3), **PnP pose recovery (now IN
+scope)**, and the cross-run Sim(3) alignment / accumulation. Keep the seam at the
+`[N, H, W, gaussian_channels]` tensor clean — it is the contract between the
+engine and the pose consumer. Rendering itself stays in the demo viewer
+(Vulkan/WebGL), not the engine.
 
 Target checkpoint first: **`freesplatter-scene`** (`gaussian_channels=23`,
 `sh_residual=true`, black background). The transformer backbone is identical
 across all three variants, so object/object-2dgs are cheap follow-ons.
 
+## Language & dependency policy
+
+- **Everything ships in C++** (the engine, PnP, and the alignment/accumulation
+  once proven). **Go only for the demo web server** — the purego layer that drives
+  the C API → Vulkan + WebGL viewer.
+- **The CLI and the C API must have NO Python dependency at runtime.** Every piece
+  of current and future functionality is reachable from `free_splatter-cli` and
+  `include/free_splatter.h` without invoking Python.
+- **Python is confined to two places, neither shipped:**
+  1. **Dev-time reference / conversion / validation** that runs in
+     `docker/Dockerfile.cuda` (`scripts/hf_dump.py`, `convert.py`,
+     `compare_taps.py`, …) — the only place torch runs; never a runtime dependency.
+  2. **The `pose/` research prototype, TEMPORARILY.** It may continue in Python
+     (numpy + cv2) **only until the accumulating-reconstruction approach is
+     proven**. Once proven it is **rewritten in C++**, exposed via the CLI + C API,
+     and **the Python is deleted.** `pose/` is not wired into CMake/ctest and is a
+     throwaway prototype, not a parallel implementation to maintain.
+
 ## Validation is the backbone (non-negotiable)
 
 This is a numerical port. Correctness means matching the PyTorch reference
@@ -47,11 +71,15 @@ This is a numerical port. Correctness means matching the PyTorch reference
 
 ## Per-component discipline
 
-Each component (`image`, `gguf_loader`, `backend`, `model`, the head) has its own
-unit test and is made independently green **before** cross-component parity.
-Component boundaries match the file layout. Keep the seam at the
-`[N,H,W,gaussian_channels]` tensor clean — that is the contract with any
-rasterizer.
+Each component (`image`, `gguf_loader`, `backend`, `model`, the head, and — once
+ported to C++ — **`pose` = PnP + focal + Sim(3) alignment**) has its own unit test
+and is made independently green **before** cross-component parity. Component
+boundaries match the file layout. Keep the seam at the
+`[N,H,W,gaussian_channels]` tensor clean — that is the contract between the engine
+and the pose/rendering consumers. The C++ `pose` component inherits the parity
+discipline the Python prototype already established: **bit-exact to upstream
+`estimate_poses`** (see `pose/check_upstream_parity.py`) and **validated against
+independent ground-truth poses** (`pose/re10k_experiment.py`).
 
 ## Debugging philosophy (mandatory sequence)
 
diff --git a/pose/README.md b/pose/README.md
index 431fe59..05f66f5 100644
--- a/pose/README.md
+++ b/pose/README.md
@@ -1,12 +1,16 @@
 # pose/ — camera recovery + cross-run alignment (downstream prototype)
 
-**Status: research prototype, deliberately OUTSIDE the validated engine.**
-
-The free-splatter.cpp engine is pieces 1–3 only; CLAUDE.md keeps the PnP pose
-solver out of `src/` on purpose. This directory is the *downstream consumer* of
-the engine's `[N, H, W, 23]` output — a self-contained, pure-Python prototype,
-**not wired into the CMake build or `ctest`**. It exists to prototype live,
-accumulating reconstruction from a moving-camera feed.
+**Status: TEMPORARY Python prototype — to be rewritten in C++ once proven, then
+deleted.**
+
+PnP pose recovery and cross-run alignment are now **in scope** (see CLAUDE.md).
+The shipped implementation will be **C++**, reachable from `free_splatter-cli` and
+the C API with **no Python dependency**. This directory is the throwaway research
+prototype that proves the accumulating-reconstruction approach first: a
+self-contained numpy + cv2 consumer of the engine's `[N, H, W, 23]` output,
+**not wired into the CMake build or `ctest`**. Once the approach is proven it is
+ported to C++ and this Python is removed — it is not a parallel implementation to
+maintain.
 
 ## Why it's needed
 

From 0ecadeea7afb1bd400b2ad028dddf34bdd84f0f3 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Fri, 26 Jun 2026 21:46:30 +0100
Subject: [PATCH 05/33] pose/: sliding-window accumulation prototype (the live
 idea, proven end-to-end)

accumulate.py assembles the validated pieces into the live pipeline (minus
realtime): slide a window over a re10k clip, recover each pair's camera (PnP), fit
a Sim(3) between consecutive runs from their SHARED-frame per-pixel correspondences
and compose into a per-run global transform (align.compose), drop every frame's
gaussians into one world (frame f_0's), and measure the recovered camera trajectory
vs ground truth. Engine dumps are cached so analysis re-runs skip inference.
render_ply.py projects the colored cloud through a pinhole camera to a PNG.

Result (13 pairs, stride 20, frames 0..260):
- per-link Sim(3) registration clean: residual ~1.0-1.4% of scene extent.
- scale drift accumulates monotonically (forward pan, no revisit): 0.755 over 12
  links, ~2.3%/link -- the monocular 1/d drift compounding.
- recovered camera trajectory tracks GT to ATE ~11% of extent (single global
  Sim(3) align); drift grows 7%->13% first->second half, worst at the endpoint --
  exactly what a Sim(3) pose-graph + loop closure bounds.
- the 2.6M-point accumulated cloud renders from camera 0 as a COHERENT room that
  matches the input frame (wider FOV due to the model's ~274-vs-439 focal bias).

The accumulating-reconstruction idea is proven; per CLAUDE.md the next
implementation step is the C++ port (CLI + C API), after which this Python goes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pose/README.md     |  27 ++++++--
 pose/accumulate.py | 157 +++++++++++++++++++++++++++++++++++++++++++++
 pose/render_ply.py |  54 ++++++++++++++++
 3 files changed, 234 insertions(+), 4 deletions(-)
 create mode 100644 pose/accumulate.py
 create mode 100644 pose/render_ply.py

diff --git a/pose/README.md b/pose/README.md
index 05f66f5..d0322e6 100644
--- a/pose/README.md
+++ b/pose/README.md
@@ -82,6 +82,21 @@ anywhere numpy is.
   loader (parser + GT geometry), yt-dlp/ffmpeg frame fetch, and the engine vs
   GT-pose check. **In distribution → the control works**: relative pose recovered
   to **0.4–1.5°** vs GT, opacity confident on 68–75% of pixels.
+- `re10k_crossrun.py` — cross-run consistency vs baseline (the accumulation
+  question): a shared frame's two reconstructions agree on **98% of pixels within
+  10%** of scene extent at small baseline.
+
+### Accumulation prototype (the live idea, assembled)
+
+- `accumulate.py` — slide a window over a clip, recover each pair (PnP), chain a
+  Sim(3) between consecutive runs via their shared-frame correspondences
+  (`align.compose`), drop every frame's gaussians into one world, and measure the
+  recovered camera trajectory vs GT. `render_ply.py` — pinhole projection of the
+  colored cloud to a PNG (visual coherence check).
+- **Result (13 pairs, stride 20):** per-link Sim(3) residual ~1.0–1.4%; trajectory
+  ATE **11%** of extent with drift growing 7%→13% (monotone monocular scale drift,
+  0.755 over 12 links); the 2.6M-point cloud renders as a **coherent room** matching
+  the input. The accumulating-reconstruction idea is **proven** end-to-end.
 
 ## Run the tests
 
@@ -109,12 +124,16 @@ FS_DEVICE=cpu nix develop -c python3 \
   in-distribution re10k (relative pose to 0.4–1.5°); T&T confirmed OOD. Finding:
   the model has a constant wide-FOV focal bias (recovers ~274 vs GT ~439) — benign
   for relative accumulation, off for metric scale.
+- ✅ **Accumulation prototype** — sliding-window Sim(3) chaining over a clip builds
+  one coherent world; trajectory tracks GT to ~11% ATE with bounded-able drift. The
+  idea is proven; per CLAUDE.md the next implementation step is the **C++ port**.
 
 ## Not done yet (honest)
 
-- **Cross-run consistency / accumulation test** on in-distribution data (the live
-  question: clean accumulation vs needing consensus fusion).
+- **Loop closure / Sim(3) pose-graph** to bound the monotone scale + pose drift
+  (the prototype chains open-loop; a forward pan never revisits).
+- **Consensus fusion** to remove the residual ~2% floaters during accumulation.
 - A **higher-motion clip** for the wide-baseline sweet-spot sweep (the sample clip
   is a slow pan, max ~6° over 160 frames).
-- The live pipeline itself (sliding window, sim3 pose-graph to bound drift) and
-  consensus fusion (the floater-removal step) — separate, later.
+- The **C++ port** (CLI + C API, no Python) once the design is locked — then this
+  prototype is deleted.
diff --git a/pose/accumulate.py b/pose/accumulate.py
new file mode 100644
index 0000000..decab3a
--- /dev/null
+++ b/pose/accumulate.py
@@ -0,0 +1,157 @@
+"""Sliding-window accumulation prototype -- assemble the validated pieces.
+
+Slide a window of overlapping pairs across a re10k clip:
+    pair k = (f_k, f_{k+1})   ->  gaussians in f_k's camera frame (run k's world)
+Consecutive runs share a frame (f_{k+1} is view1 of pair k and view0 of pair k+1),
+so their per-pixel gaussians for that frame are exact correspondences. Fit a Sim(3)
+S_k mapping run k's frame into run k-1's frame, and COMPOSE into a global transform
+    T_0 = I ,  T_k = T_{k-1} . S_k
+that drops every run's gaussians into ONE world (frame f_0's). Accumulate them,
+and measure the recovered camera trajectory against ground truth (drift).
+
+This is the live pipeline minus realtime: PnP (pnp.py) + Sim(3) align (align.py) +
+chaining (align.compose) over a real sequence, justified by re10k_crossrun.py.
+
+    FS_DEVICE=cpu nix develop -c python3 pose/accumulate.py CLIP.txt FRAME_DIR \\
+        --start 0 --stride 20 --count 13 --cache DIR --ply out.ply
+
+Engine dumps are cached (pair_*.f32); re-runs skip inference.
+"""
+import os
+import argparse
+import numpy as np
+
+import re10k_control as rc
+import re10k_fetch as rf
+import re10k_experiment as ex          # run_engine
+import pnp
+import align
+
+C0 = 0.28209479177387814
+THR = 0.05
+
+
+def load_dump(path):
+    a = np.fromfile(path, np.float32).reshape(2, 512, 512, 23)
+    pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)]
+    op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)]
+    rgb = [np.clip(a[v, ..., 3:6] * C0 + 0.5, 0, 1) for v in (0, 1)]
+    return pts, op, rgb
+
+
+def write_ply(path, xyz, rgb):
+    n = len(xyz)
+    hdr = ("ply\nformat binary_little_endian 1.0\n"
+           f"element vertex {n}\n"
+           "property float x\nproperty float y\nproperty float z\n"
+           "property uchar red\nproperty uchar green\nproperty uchar blue\n"
+           "end_header\n")
+    rec = np.zeros(n, dtype=[("x", "<f4"), ("y", "<f4"), ("z", "<f4"),
+                             ("r", "u1"), ("g", "u1"), ("b", "u1")])
+    rec["x"], rec["y"], rec["z"] = xyz[:, 0], xyz[:, 1], xyz[:, 2]
+    c = (np.clip(rgb, 0, 1) * 255).astype(np.uint8)
+    rec["r"], rec["g"], rec["b"] = c[:, 0], c[:, 1], c[:, 2]
+    with open(path, "wb") as f:
+        f.write(hdr.encode())
+        f.write(rec.tobytes())
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("clip"); ap.add_argument("frame_dir")
+    ap.add_argument("--start", type=int, default=0)
+    ap.add_argument("--stride", type=int, default=20)
+    ap.add_argument("--count", type=int, default=13)        # number of pairs
+    ap.add_argument("--cache", default=os.environ.get("SCRATCH", "/tmp"))
+    ap.add_argument("--ply", default=None)
+    ap.add_argument("--max-points", type=int, default=2_000_000)
+    a = ap.parse_args()
+
+    url, frames = rc.parse_clip(a.clip)
+    idx = [a.start + k * a.stride for k in range(a.count + 1)]      # n+1 frame indices
+    assert idx[-1] < len(frames), f"frame {idx[-1]} beyond clip ({len(frames)})"
+    # make sure the frames exist (fetch on demand)
+    missing = [i for i in idx if not os.path.exists(os.path.join(a.frame_dir, f"f{i:04d}.png"))]
+    if missing:
+        rf.fetch(a.clip, a.frame_dir, indices=idx)
+    os.makedirs(a.cache, exist_ok=True)
+    fp = lambda i: os.path.join(a.frame_dir, f"f{i:04d}.png")
+
+    print(f"=== accumulate: {os.path.basename(a.clip)}  frames {idx[0]}..{idx[-1]} "
+          f"stride {a.stride}  ({a.count} pairs)  device={ex.DEVICE} ===\n{url}")
+
+    # --- run / load each pair, recover its relative pose ---
+    runs = []           # per pair: dict(pts, op, rgb, c2w)
+    for k in range(a.count):
+        dump = os.path.join(a.cache, f"pair_{idx[k]}_{idx[k+1]}.f32")
+        if not os.path.exists(dump):
+            ex.run_engine(fp(idx[k]), fp(idx[k + 1]), dump)
+            print(f"  ran pair {k}: ({idx[k]},{idx[k+1]})")
+        pts, op, rgb = load_dump(dump)
+        res = pnp.estimate_poses(pts, op, normalize=False)
+        runs.append({"pts": pts, "op": op, "rgb": rgb, "c2w": res["cam2world"]})
+
+    # --- chain Sim(3): T_k maps run k's frame -> global (frame f_0) ---
+    T = [align.identity()]
+    chain_info = []
+    for k in range(1, a.count):
+        XA, oA = runs[k]["pts"][0].reshape(-1, 3), runs[k]["op"][0].reshape(-1)     # f_k as view0 of run k
+        XB, oB = runs[k-1]["pts"][1].reshape(-1, 3), runs[k-1]["op"][1].reshape(-1) # f_k as view1 of run k-1
+        m = (oA > THR) & (oB > THR)
+        scene = align._rms(XB[m] - XB[m].mean(0))
+        s, R, t, inl = align.fit_similarity_ransac(XA[m], XB[m], thresh=0.02 * scene, iters=300)
+        S_k = (s, R, t)
+        T.append(align.compose(T[k-1], S_k))
+        resid = align._rms(align.apply_sim(S_k, XA[m][inl]) - XB[m][inl]) / scene
+        chain_info.append((k, s, 100*inl.mean(), 100*m.mean(), 100*resid))
+
+    # --- accumulate gaussians into the global frame (each frame once) ---
+    XYZ, RGB = [], []
+    def add(run_idx, view, Tk):
+        pts = runs[run_idx]["pts"][view].reshape(-1, 3)
+        op = runs[run_idx]["op"][view].reshape(-1)
+        rgb = runs[run_idx]["rgb"][view].reshape(-1, 3)
+        keep = op > THR
+        XYZ.append(align.apply_sim(Tk, pts[keep])); RGB.append(rgb[keep])
+    add(0, 0, T[0])                                  # f_0 (view0 of pair 0)
+    for k in range(a.count):                         # f_{k+1} = view1 of pair k
+        add(k, 1, T[k])
+    XYZ = np.concatenate(XYZ); RGB = np.concatenate(RGB)
+
+    # --- recovered camera trajectory (global) vs GT ---
+    rec = [align.apply_sim(T[k], runs[k]["c2w"][0][:3, 3]) for k in range(a.count)]
+    rec.append(align.apply_sim(T[a.count-1], runs[a.count-1]["c2w"][1][:3, 3]))
+    rec = np.array(rec)
+    gt = np.array([frames[i]["c2w"][:3, 3] for i in idx])
+    sA, RA, tA = align.fit_similarity(rec, gt)        # align recovered traj -> GT
+    rec_al = align.apply_sim((sA, RA, tA), rec)
+    perr = np.linalg.norm(rec_al - gt, axis=1)
+    gt_ext = align._rms(gt - gt.mean(0))
+
+    # --- report ---
+    print("\nchain (per shared frame): step  scale  inlier%  valid%  resid%scene")
+    for k, s, inlp, vp, rp in chain_info:
+        print(f"   {k:>3}   {s:>6.3f}  {inlp:>6.1f}  {vp:>6.1f}  {rp:>7.2f}")
+    total_scale = np.prod([ci[1] for ci in chain_info]) if chain_info else 1.0
+    print(f"cumulative scale drift over {a.count-1} links: {total_scale:.3f} "
+          f"(per-link mean {total_scale**(1/max(a.count-1,1)):.4f})")
+
+    print(f"\ntrajectory vs GT ({len(idx)} frames, after global Sim(3) align):")
+    print(f"   ATE rms = {align._rms(rec_al-gt):.4f}  ({100*align._rms(rec_al-gt)/gt_ext:.1f}% "
+          f"of GT extent {gt_ext:.4f})")
+    print(f"   per-frame error % of extent: " +
+          " ".join(f"{100*e/gt_ext:.0f}" for e in perr))
+    print(f"   drift: first-half {100*perr[:len(perr)//2].mean()/gt_ext:.1f}%  "
+          f"second-half {100*perr[len(perr)//2:].mean()/gt_ext:.1f}%")
+
+    print(f"\naccumulated cloud: {len(XYZ):,} points (opacity>{THR})")
+    if a.ply:
+        if len(XYZ) > a.max_points:
+            sel = np.random.default_rng(0).choice(len(XYZ), a.max_points, replace=False)
+            XYZ, RGB = XYZ[sel], RGB[sel]
+        write_ply(a.ply, XYZ, RGB)
+        print(f"   wrote {a.ply} ({len(XYZ):,} points)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pose/render_ply.py b/pose/render_ply.py
new file mode 100644
index 0000000..1cde6c7
--- /dev/null
+++ b/pose/render_ply.py
@@ -0,0 +1,54 @@
+"""Quick pinhole projection of a colored PLY to a PNG (z-buffered point splat) --
+a visual coherence check for the accumulated cloud, no 3D viewer needed.
+
+    nix develop -c python3 pose/render_ply.py world.ply out.png [focal] [radius]
+"""
+import sys
+import numpy as np
+import cv2
+
+
+def read_ply(path):
+    with open(path, "rb") as f:
+        n = 0
+        while True:
+            l = f.readline()
+            if l.startswith(b"element vertex"):
+                n = int(l.split()[-1])
+            if l.strip() == b"end_header":
+                break
+        rec = np.frombuffer(f.read(n * 15),
+                            dtype=[("x", "<f4"), ("y", "<f4"), ("z", "<f4"),
+                                   ("r", "u1"), ("g", "u1"), ("b", "u1")])
+    xyz = np.stack([rec["x"], rec["y"], rec["z"]], 1).astype(np.float64)
+    rgb = np.stack([rec["r"], rec["g"], rec["b"]], 1)
+    return xyz, rgb
+
+
+def render(xyz, rgb, focal, H=512, W=512, c2w=None, radius=1):
+    w2c = np.eye(4) if c2w is None else np.linalg.inv(c2w)
+    Xc = (w2c[:3, :3] @ xyz.T).T + w2c[:3, 3]
+    good = Xc[:, 2] > 1e-3
+    Xc, col = Xc[good], rgb[good]
+    u = focal * Xc[:, 0] / Xc[:, 2] + W / 2
+    v = focal * Xc[:, 1] / Xc[:, 2] + H / 2
+    ui, vi = np.round(u).astype(int), np.round(v).astype(int)
+    img = np.zeros((H, W, 3), np.uint8)
+    order = np.argsort(-Xc[:, 2])                       # far first; near overwrites
+    ui, vi, col = ui[order], vi[order], col[order]
+    for dy in range(-radius, radius + 1):
+        for dx in range(-radius, radius + 1):
+            uu, vv = ui + dx, vi + dy
+            inb = (uu >= 0) & (uu < W) & (vv >= 0) & (vv < H)
+            img[vv[inb], uu[inb]] = col[inb]
+    return img
+
+
+if __name__ == "__main__":
+    ply, out = sys.argv[1], sys.argv[2]
+    focal = float(sys.argv[3]) if len(sys.argv) > 3 else 274.0
+    radius = int(sys.argv[4]) if len(sys.argv) > 4 else 1
+    xyz, rgb = read_ply(ply)
+    img = render(xyz, rgb, focal, radius=radius)
+    cv2.imwrite(out, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
+    print(f"rendered {len(xyz):,} pts -> {out}  ({(img.any(2)).mean()*100:.0f}% filled)")

From 17105fdc00a9aecc3635389f715a53cccbc93be8 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Fri, 26 Jun 2026 22:30:08 +0100
Subject: [PATCH 06/33] pose/: loop closure -- machinery verified; real loop
 reveals it's not the lever

find_loop.py searches re10k poses for a clip that revisits its start; loop_closure.py
chains open-loop, measures the loop error via a closing pair (f_0,f_n), distributes it
by even Sim(3) relaxation (D^(k/n)), and reports ATE before/after. Sim(3) 4x4 helpers
(sim_matrix, sim_frac_power) promoted into align.py; golden test added.

Honest result on a real loop clip (camera out to 2.29 and back to 0.23):
- the open-loop chain ALREADY closes the loop (loop error 4.4deg / scale 1.12 / 8%
  trans) -- there is almost no accumulated drift to distribute.
- the dominant ~34% ATE is per-link ODOMETRY NOISE (Sim(3) inlier% as low as 17-24%
  on the fast outbound leg) plus the model's focal-bias warp: self-consistent (the
  loop closes) but distorted vs GT. Loop closure can't fix that; naive distribution
  slightly hurts.
- diagnosis confirmed not a bug: the correction recovers SYNTHETIC uniform
  accumulated drift to ~0 (test_pose.py::test_loop_correction, ATE 1e-15).

Lesson: loop closure pays off on LONG trajectories with consistent accumulated drift
(cf. the forward clip's monotone 7%->13%); for short loops the lever is better
odometry -- smaller baselines, consensus fusion, and the focal bias.

Golden suite green (test_loop_correction added).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pose/README.md       |  25 ++++++--
 pose/align.py        |  20 ++++++
 pose/find_loop.py    |  70 ++++++++++++++++++++
 pose/loop_closure.py | 150 +++++++++++++++++++++++++++++++++++++++++++
 pose/test_pose.py    |  25 +++++++-
 5 files changed, 284 insertions(+), 6 deletions(-)
 create mode 100644 pose/find_loop.py
 create mode 100644 pose/loop_closure.py

diff --git a/pose/README.md b/pose/README.md
index d0322e6..ded0297 100644
--- a/pose/README.md
+++ b/pose/README.md
@@ -97,6 +97,16 @@ anywhere numpy is.
   ATE **11%** of extent with drift growing 7%→13% (monotone monocular scale drift,
   0.755 over 12 links); the 2.6M-point cloud renders as a **coherent room** matching
   the input. The accumulating-reconstruction idea is **proven** end-to-end.
+- `find_loop.py` — search re10k poses for a clip that revisits its start (the loop
+  substrate). `loop_closure.py` — chain open-loop, measure the loop error via a
+  closing pair, then distribute it (even Sim(3) relaxation, `align.sim_frac_power`)
+  and report ATE before/after. **Finding:** loop closure helps only when drift
+  *accumulates* into a large loop error. On the real loop clip tested it did NOT
+  help — the open-loop chain already closes (loop error 4.4° / scale 1.12 / 8%
+  trans), so the ~34% ATE is per-link **odometry noise** (inlier% as low as 17%
+  on fast legs) + the focal-bias warp, *self-consistent but distorted vs GT*, which
+  loop closure can't fix. The correction machinery itself is verified to recover
+  synthetic accumulated drift to ~0 (`test_pose.py::test_loop_correction`).
 
 ## Run the tests
 
@@ -127,13 +137,18 @@ FS_DEVICE=cpu nix develop -c python3 \
 - ✅ **Accumulation prototype** — sliding-window Sim(3) chaining over a clip builds
   one coherent world; trajectory tracks GT to ~11% ATE with bounded-able drift. The
   idea is proven; per CLAUDE.md the next implementation step is the **C++ port**.
+- ✅ **Loop closure** — implemented + machinery verified on synthetic drift (recovers
+  to ~0). On a real loop clip it did NOT help: the chain already closes, so the error
+  is odometry noise + focal warp, not accumulated drift. Lesson: better odometry
+  (smaller baselines / fusion / focal) is the lever for short loops; loop closure
+  pays off on long trajectories with consistent accumulated drift.
 
 ## Not done yet (honest)
 
-- **Loop closure / Sim(3) pose-graph** to bound the monotone scale + pose drift
-  (the prototype chains open-loop; a forward pan never revisits).
-- **Consensus fusion** to remove the residual ~2% floaters during accumulation.
-- A **higher-motion clip** for the wide-baseline sweet-spot sweep (the sample clip
-  is a slow pan, max ~6° over 160 frames).
+- **Better odometry** is the bigger lever than loop closure for short clips:
+  smaller per-pair baselines, **consensus fusion** (remove the ~2% floaters /
+  lift the low-inlier links), and the **focal bias** (recovered ~274 vs GT ~440+).
+- A **higher-motion clip** wide-baseline sweep, and a **long** trajectory where
+  loop closure demonstrably pays off.
 - The **C++ port** (CLI + C API, no Python) once the design is locked — then this
   prototype is deleted.
diff --git a/pose/align.py b/pose/align.py
index 55d8652..0beec07 100644
--- a/pose/align.py
+++ b/pose/align.py
@@ -195,6 +195,26 @@ def identity():
     return (1.0, np.eye(3), np.zeros(3))
 
 
+# --- Sim(3) as 4x4 matrices (for pose-graph relaxation) --------------------
+
+def sim_matrix(s, R, t):
+    """Sim(3) as a 4x4 homogeneous matrix [[sR, t],[0,1]] (compose == matmul)."""
+    M = np.eye(4)
+    M[:3, :3] = s * R
+    M[:3, 3] = t
+    return M
+
+
+def sim_frac_power(M, f):
+    """Fractional power M^f of a Sim(3) 4x4, via eigendecomposition (real part).
+
+    The even loop-closure relaxation: distribute an accumulated drift D over n
+    nodes by applying D^(k/n) at node k. M^0=I, M^1=M, (M^0.5)^2=M; valid while the
+    rotation is < 180deg (principal branch). Tested in test_pose.py."""
+    w, V = np.linalg.eig(M)
+    return (V @ np.diag(w ** f) @ np.linalg.inv(V)).real
+
+
 def loop_closure_error(transforms):
     """Compose a closed loop of similarities and measure deviation from identity.
 
diff --git a/pose/find_loop.py b/pose/find_loop.py
new file mode 100644
index 0000000..ab732cf
--- /dev/null
+++ b/pose/find_loop.py
@@ -0,0 +1,70 @@
+"""Search RealEstate10K poses for a LOOPING clip -- camera wanders away from its
+start then returns near it (with a similar viewing direction, so the first and last
+frames overlap). That overlap is the loop-closure constraint we need to test drift
+correction. Pose-only: no video downloads until a candidate is chosen.
+
+A good loop: large max distance from the start (real motion) but small end-to-start
+distance (returns), and small first<->last rotation (same view => the closing pair
+(f_0, f_n) is a valid stereo pair the engine can reconstruct).
+
+    nix develop -c python3 pose/find_loop.py [split] [limit] [--live N]
+"""
+import sys
+import numpy as np
+
+import re10k_control as rc
+import re10k_fetch as rf
+
+
+def score(fr):
+    C = np.array([f["c2w"][:3, 3] for f in fr])
+    d = np.linalg.norm(C - C[0], axis=1)
+    maxd = d.max()
+    if maxd < 1e-2:
+        return None
+    rotend = rc.rot_deg(rc.rel_pose(fr[0]["c2w"], fr[-1]["c2w"]))
+    return {"n": len(fr), "maxd": float(maxd), "endd": float(d[-1]),
+            "return_ratio": float(d[-1] / maxd), "rotend": rotend,
+            "loopiness": float(maxd / (d[-1] + 0.05 * maxd))}
+
+
+def search(split="test", limit=None, min_frames=80,
+           max_return_ratio=0.25, max_rotend=45.0):
+    rows = []
+    clips = rc.list_clips(split)
+    if limit:
+        clips = clips[:limit]
+    for cp in clips:
+        try:
+            url, fr = rc.parse_clip(cp)
+        except Exception:
+            continue
+        if len(fr) < min_frames:
+            continue
+        s = score(fr)
+        if not s or s["return_ratio"] > max_return_ratio or s["rotend"] > max_rotend:
+            continue
+        s["clip"], s["id"] = cp, rc.clip_video_id(url)
+        rows.append(s)
+    rows.sort(key=lambda r: r["loopiness"], reverse=True)
+    return rows
+
+
+if __name__ == "__main__":
+    split = sys.argv[1] if len(sys.argv) > 1 and not sys.argv[1].startswith("-") else "test"
+    limit = int(sys.argv[2]) if len(sys.argv) > 2 and not sys.argv[2].startswith("-") else None
+    n_live = int(sys.argv[sys.argv.index("--live") + 1]) if "--live" in sys.argv else 0
+    rows = search(split, limit)
+    print(f"{len(rows)} loop candidates (split={split}, limit={limit})")
+    print(f"  {'loopiness':>9} {'n':>4} {'maxd':>6} {'ret%':>5} {'rotEnd':>6}  id")
+    shown = 0
+    for r in rows[:30]:
+        live = ""
+        if shown < n_live:
+            try:
+                live = "  LIVE" if rf.is_live(rc.parse_clip(r["clip"])[0]) else "  dead"
+            except Exception:
+                live = "  ?"
+            shown += 1
+        print(f"  {r['loopiness']:>9.1f} {r['n']:>4} {r['maxd']:>6.2f} "
+              f"{100*r['return_ratio']:>5.0f} {r['rotend']:>6.1f}  {r['id']}{live}")
diff --git a/pose/loop_closure.py b/pose/loop_closure.py
new file mode 100644
index 0000000..961af61
--- /dev/null
+++ b/pose/loop_closure.py
@@ -0,0 +1,150 @@
+"""Loop-closure drift correction on a LOOPING re10k clip (see find_loop.py).
+
+accumulate.py chains Sim(3) open-loop and the drift grows to the endpoint (~11%
+ATE). When the camera RETURNS near its start, frame 0 and frame n overlap -- a
+loop-closure constraint. Here we:
+  1. chain open-loop  -> per-frame global poses P_k, recovered trajectory, ATE vs GT.
+  2. measure the loop -> run the closing pair (f_0, f_n), scale-aligned to run 0,
+     giving f_n's pose via the loop (P_n_loop), INDEPENDENT of the chain.
+  3. the discrepancy D (P_n_loop = D . P_n) IS the accumulated drift (scale/rot/trans).
+  4. distribute D over the chain (even Sim(3) relaxation, D^(k/n)) and show the
+     trajectory ATE and endpoint error collapse.
+
+Closes the drift story accumulate.py left open; exercises align's chaining on a
+real loop. Poses are 4x4 similarity matrices [[sR,t],[0,1]] (compose = matmul).
+
+    FS_DEVICE=cpu nix develop -c python3 pose/loop_closure.py CLIP FRAME_DIR \\
+        --stride 20 --count 13 --cache DIR
+"""
+import os
+import argparse
+import numpy as np
+
+import re10k_control as rc
+import re10k_fetch as rf
+import re10k_experiment as ex
+import pnp
+import align
+
+THR = 0.05
+simmat = align.sim_matrix          # Sim(3) 4x4
+matfrac = align.sim_frac_power     # M^f for even loop-closure relaxation
+
+
+def fit_sim_M(X, Y):
+    scene = align._rms(Y - Y.mean(0))
+    s, R, t, inl = align.fit_similarity_ransac(X, Y, thresh=0.02 * scene, iters=300)
+    return simmat(s, R, t), inl
+
+
+def decompose(M):
+    A = M[:3, :3]
+    s = float(np.linalg.det(A) ** (1.0 / 3.0))
+    R = A / s
+    ang = float(np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1))))
+    return s, ang, M[:3, 3]
+
+
+def load(dump):
+    a = np.fromfile(dump, np.float32).reshape(2, 512, 512, 23)
+    pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)]
+    op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)]
+    return pts, op
+
+
+def ate(C, gt):
+    sA, RA, tA = align.fit_similarity(C, gt)
+    Cal = align.apply_sim((sA, RA, tA), C)
+    return align._rms(Cal - gt), np.linalg.norm(Cal - gt, axis=1)
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("clip"); ap.add_argument("frame_dir")
+    ap.add_argument("--start", type=int, default=0)
+    ap.add_argument("--stride", type=int, default=20)
+    ap.add_argument("--count", type=int, default=13)
+    ap.add_argument("--cache", default=os.environ.get("SCRATCH", "/tmp"))
+    a = ap.parse_args()
+
+    url, frames = rc.parse_clip(a.clip)
+    idx = [a.start + k * a.stride for k in range(a.count + 1)]
+    os.makedirs(a.cache, exist_ok=True)
+    fp = lambda i: os.path.join(a.frame_dir, f"f{i:04d}.png")
+    if any(not os.path.exists(fp(i)) for i in idx):
+        rf.fetch(a.clip, a.frame_dir, indices=idx)
+    print(f"=== loop closure: {os.path.basename(a.clip)}  frames {idx[0]}..{idx[-1]} "
+          f"stride {a.stride}  device={ex.DEVICE} ===\n{url}")
+
+    # --- run / load chain pairs ---
+    runs = []
+    for k in range(a.count):
+        dump = os.path.join(a.cache, f"pair_{idx[k]}_{idx[k+1]}.f32")
+        if not os.path.exists(dump):
+            ex.run_engine(fp(idx[k]), fp(idx[k + 1]), dump)
+        pts, op = load(dump)
+        c2w = pnp.estimate_poses(pts, op, normalize=False)["cam2world"]
+        runs.append({"pts": pts, "op": op, "c2w": c2w})
+
+    # --- open-loop chain: T[k] maps run k -> global ---
+    T = [np.eye(4)]; chain_diag = []
+    for k in range(1, a.count):
+        XA = runs[k]["pts"][0].reshape(-1, 3); oA = runs[k]["op"][0].reshape(-1)
+        XB = runs[k-1]["pts"][1].reshape(-1, 3); oB = runs[k-1]["op"][1].reshape(-1)
+        m = (oA > THR) & (oB > THR)
+        S, inlk = fit_sim_M(XA[m], XB[m]); T.append(T[k-1] @ S)
+        sk, _, _ = decompose(S)
+        chain_diag.append((sk, 100 * inlk.mean(), 100 * m.mean()))
+    P = [T[k] @ runs[k]["c2w"][0] for k in range(a.count)]
+    P.append(T[a.count-1] @ runs[a.count-1]["c2w"][1])      # f_n from last run's view1
+    centers = np.array([p[:3, 3] for p in P])
+    gt = np.array([frames[i]["c2w"][:3, 3] for i in idx])
+    ext = align._rms(gt - gt.mean(0))
+
+    # --- loop measurement: closing pair (f_0, f_n) ---
+    cdump = os.path.join(a.cache, f"close_{idx[0]}_{idx[-1]}.f32")
+    if not os.path.exists(cdump):
+        ex.run_engine(fp(idx[0]), fp(idx[-1]), cdump)
+    cpts, cop = load(cdump)
+    cc2w = pnp.estimate_poses(cpts, cop, normalize=False)["cam2world"]
+    # scale/pose-align the closing run's f_0 (view0) to run 0's f_0 (view0, == global)
+    XA = cpts[0].reshape(-1, 3); oA = cop[0].reshape(-1)
+    XB = runs[0]["pts"][0].reshape(-1, 3); oB = runs[0]["op"][0].reshape(-1)
+    m = (oA > THR) & (oB > THR)
+    close_valid = 100 * m.mean()
+    G, inl = fit_sim_M(XA[m], XB[m])
+    P_n_loop = G @ cc2w[1]                                  # f_n via the loop, in global
+
+    # --- drift D (P_n_loop = D . P_n) and even distribution ---
+    D = P_n_loop @ np.linalg.inv(P[-1])
+    s, ang, t = decompose(D)
+    n = a.count
+    centers_c = np.array([(matfrac(D, k / n) @ np.append(centers[k], 1.0))[:3]
+                          for k in range(len(centers))])
+
+    # --- report ---
+    ate0, perr0 = ate(centers, gt)
+    ate1, perr1 = ate(centers_c, gt)
+    # diagnostics: is the open-loop chain a clean drift, and does it even return?
+    sA, RA, tA = align.fit_similarity(centers, gt)
+    pnl_al = align.apply_sim((sA, RA, tA), P_n_loop[:3, 3])
+    print("\nchain per-link scale/inlier%/valid%: " +
+          "  ".join(f"{c[0]:.2f}/{c[1]:.0f}/{c[2]:.0f}" for c in chain_diag))
+    print(f"return-to-start dist: recovered {np.linalg.norm(centers[-1]-centers[0]):.3f}  "
+          f"GT {np.linalg.norm(gt[-1]-gt[0]):.3f}  (GT extent {ext:.3f})")
+    print(f"loop measurement P_n_loop endpoint err vs GT: "
+          f"{100*np.linalg.norm(pnl_al-gt[-1])/ext:.0f}% of extent")
+    print(f"\nclosing pair (f_{idx[0]}, f_{idx[-1]}): valid%={close_valid:.1f}  "
+          f"inliers={100*inl.mean():.0f}%")
+    print(f"LOOP-CLOSURE ERROR (open-loop drift at the loop point):")
+    print(f"   scale {s:.3f} ({100*abs(np.log(s)):.1f}% log)  rotation {ang:.1f} deg  "
+          f"translation {np.linalg.norm(t):.3f} ({100*np.linalg.norm(t)/ext:.0f}% of extent)")
+    print(f"\ntrajectory ATE vs GT (after global Sim(3) align):")
+    print(f"   open-loop : {100*ate0/ext:5.1f}% of extent   endpoint {100*perr0[-1]/ext:.0f}%")
+    print(f"   closed    : {100*ate1/ext:5.1f}% of extent   endpoint {100*perr1[-1]/ext:.0f}%")
+    print(f"   per-frame % (open ): " + " ".join(f"{100*e/ext:.0f}" for e in perr0))
+    print(f"   per-frame % (closed): " + " ".join(f"{100*e/ext:.0f}" for e in perr1))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pose/test_pose.py b/pose/test_pose.py
index aec4af7..3ad0c80 100644
--- a/pose/test_pose.py
+++ b/pose/test_pose.py
@@ -115,6 +115,29 @@ def test_outlier_robustness():
           f"{(~inl[out_idx]).mean()*100:.0f}% of outliers rejected")
 
 
+def test_loop_correction():
+    """Sim(3) even-distribution loop closure recovers a loop from uniform drift."""
+    print("test_loop_correction")
+    R = rand_rotation(RNG)
+    M = align.sim_matrix(1.07, R, np.array([0.3, -0.2, 0.1]))
+    check("M^0 == I", np.allclose(align.sim_frac_power(M, 0.0), np.eye(4), atol=1e-9))
+    check("M^1 == M", np.allclose(align.sim_frac_power(M, 1.0), M, atol=1e-9))
+    check("(M^.5)^2 == M",
+          np.allclose(align.sim_frac_power(M, .5) @ align.sim_frac_power(M, .5), M, atol=1e-9))
+    # distributing a known accumulated drift D^(k/n) cancels it to ~0
+    n = 12
+    D = align.sim_matrix(1.15, R, np.array([0.4, -0.2, 0.1]))
+    clean = np.array([[np.cos(t), np.sin(t), 0.1 * t]
+                      for t in np.linspace(0, 2 * np.pi, n + 1)])
+    drift = np.array([(align.sim_frac_power(np.linalg.inv(D), k / n) @
+                       np.append(clean[k], 1))[:3] for k in range(n + 1)])
+    corr = np.array([(align.sim_frac_power(D, k / n) @
+                      np.append(drift[k], 1))[:3] for k in range(n + 1)])
+    check("drift present before", align._rms(drift - clean) > 0.1)
+    check("recovers clean loop", align._rms(corr - clean) < 1e-9,
+          f"ATE {align._rms(corr - clean):.1e}")
+
+
 def test_loop_closure():
     """A closed loop of consistent links ~= identity; injected scale drift shows."""
     print("test_loop_closure")
@@ -219,7 +242,7 @@ def test_pnp_outliers():
 def main():
     tests = [
         test_similarity_roundtrip, test_scale_detection, test_nonlinear_detection,
-        test_outlier_robustness, test_loop_closure,
+        test_outlier_robustness, test_loop_correction, test_loop_closure,
         test_focal_recovery, test_pnp_recovery, test_pnp_outliers,
     ]
     for t in tests:

From 1d31eef3863d0c48b8fff34d8326adea1714a918 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 09:05:43 +0100
Subject: [PATCH 07/33] pose/: consensus fusion -- removes the edge-noise
 floaters from the accumulated cloud

fuse.py answers the question that started this thread ("around the edges there's a
lot of noise -- does accumulation remove it?"). Each per-pixel gaussian is
partner-view-dependent, so occlusion-edge / depth-ambiguous points are floaters; a
real surface point is reconstructed by several overlapping frames and they agree in
the global frame. So: voxelize the accumulated cloud at the consistency scale (~2%
of extent) and keep only voxels corroborated by >= K distinct frames, averaging the
agreeing predictions (which also denoises the surface). Reuses cached engine dumps
(no new inference).

Result (forward clip, 14 frames, K>=2): 46% of voxels are single-frame, holding
14% of points -- these render as INCOHERENT EDGE-HAZE (floaters + swept-volume
periphery). The >=2-frame consensus (86% of points) renders as a CLEAN, CRISP room
with the haze gone. Definitive yes: accumulation + consensus fusion removes the edge
noise. Honest tradeoff: dropping single-frame points also trims the single-view
periphery (coverage vs cleanliness).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pose/README.md |  21 ++++++--
 pose/fuse.py   | 134 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 150 insertions(+), 5 deletions(-)
 create mode 100644 pose/fuse.py

diff --git a/pose/README.md b/pose/README.md
index ded0297..f2bee03 100644
--- a/pose/README.md
+++ b/pose/README.md
@@ -107,6 +107,12 @@ anywhere numpy is.
   on fast legs) + the focal-bias warp, *self-consistent but distorted vs GT*, which
   loop closure can't fix. The correction machinery itself is verified to recover
   synthetic accumulated drift to ~0 (`test_pose.py::test_loop_correction`).
+- `fuse.py` — **consensus fusion** (the edge-noise answer): voxelize the accumulated
+  cloud at the consistency scale and keep only voxels corroborated by ≥K distinct
+  frames, averaging the agreeing predictions. On the forward clip: **14% of points
+  are single-frame floaters** that render as incoherent edge-haze, while the ≥2-frame
+  consensus renders as a **clean, crisp room** with the haze gone. Tradeoff: dropping
+  single-frame points also trims the single-view periphery (coverage vs cleanliness).
 
 ## Run the tests
 
@@ -142,13 +148,18 @@ FS_DEVICE=cpu nix develop -c python3 \
   is odometry noise + focal warp, not accumulated drift. Lesson: better odometry
   (smaller baselines / fusion / focal) is the lever for short loops; loop closure
   pays off on long trajectories with consistent accumulated drift.
+- ✅ **Consensus fusion** — gating the accumulated cloud on ≥2-frame agreement removes
+  the 14% single-frame floaters (incoherent edge-haze) and yields a clean surface —
+  the definitive "yes, accumulation removes the edge noise." Trades single-view
+  periphery for a cleaner core.
 
 ## Not done yet (honest)
 
-- **Better odometry** is the bigger lever than loop closure for short clips:
-  smaller per-pair baselines, **consensus fusion** (remove the ~2% floaters /
-  lift the low-inlier links), and the **focal bias** (recovered ~274 vs GT ~440+).
-- A **higher-motion clip** wide-baseline sweep, and a **long** trajectory where
-  loop closure demonstrably pays off.
+- The **focal bias** (recovered ~274 vs GT ~440+) — the remaining geometry-distortion
+  source; worth understanding whether it's fixable.
+- A **higher-motion clip** wide-baseline sweep, and a **long** trajectory where loop
+  closure demonstrably pays off.
+- **Fuse-then-align**: feed the consensus surface back into the odometry (lift the
+  low-inlier links the loop-closure diagnosis flagged), not just the final cloud.
 - The **C++ port** (CLI + C API, no Python) once the design is locked — then this
   prototype is deleted.
diff --git a/pose/fuse.py b/pose/fuse.py
new file mode 100644
index 0000000..df7b1f7
--- /dev/null
+++ b/pose/fuse.py
@@ -0,0 +1,134 @@
+"""Consensus fusion -- remove the partner-dependent floaters (the edge noise).
+
+Each per-pixel gaussian is conditioned on its partner view, so occlusion-edge /
+depth-ambiguous points land differently per run = floaters. A REAL surface point,
+though, is reconstructed by several overlapping frames and they agree in the global
+frame. So: voxelize the accumulated cloud at the consistency scale and keep only
+voxels corroborated by >= K DISTINCT frames, averaging the agreeing predictions
+(which also denoises the surface). Singletons / scattered floaters are dropped.
+
+This is the direct answer to "does accumulation remove the edge noise?" -- yes, once
+you gate on cross-frame consensus. Reuses cached engine dumps from accumulate.py
+(no new inference).
+
+    nix develop -c python3 pose/fuse.py CLIP FRAME_DIR --cache DIR \\
+        --stride 20 --count 13 --voxel 0.02 --k 2 --ply-raw raw.ply --ply-fused fused.ply
+"""
+import os
+import argparse
+import numpy as np
+
+import align
+
+THR = 0.05
+C0 = 0.28209479177387814
+
+
+def load(dump):
+    a = np.fromfile(dump, np.float32).reshape(2, 512, 512, 23)
+    pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)]
+    op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)]
+    rgb = [np.clip(a[v, ..., 3:6] * C0 + 0.5, 0, 1) for v in (0, 1)]
+    return pts, op, rgb
+
+
+def fit_sim(X, Y):
+    scene = align._rms(Y - Y.mean(0))
+    s, R, t, _ = align.fit_similarity_ransac(X, Y, thresh=0.02 * scene, iters=300)
+    return (s, R, t)
+
+
+def write_ply(path, xyz, rgb):
+    n = len(xyz)
+    hdr = ("ply\nformat binary_little_endian 1.0\n"
+           f"element vertex {n}\nproperty float x\nproperty float y\nproperty float z\n"
+           "property uchar red\nproperty uchar green\nproperty uchar blue\nend_header\n")
+    rec = np.zeros(n, dtype=[("x", "<f4"), ("y", "<f4"), ("z", "<f4"),
+                             ("r", "u1"), ("g", "u1"), ("b", "u1")])
+    rec["x"], rec["y"], rec["z"] = xyz[:, 0], xyz[:, 1], xyz[:, 2]
+    c = (np.clip(rgb, 0, 1) * 255).astype(np.uint8)
+    rec["r"], rec["g"], rec["b"] = c[:, 0], c[:, 1], c[:, 2]
+    with open(path, "wb") as f:
+        f.write(hdr.encode()); f.write(rec.tobytes())
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("clip"); ap.add_argument("frame_dir")
+    ap.add_argument("--start", type=int, default=0)
+    ap.add_argument("--stride", type=int, default=20)
+    ap.add_argument("--count", type=int, default=13)
+    ap.add_argument("--cache", default=os.environ.get("SCRATCH", "/tmp"))
+    ap.add_argument("--voxel", type=float, default=0.02)     # fraction of scene extent
+    ap.add_argument("--k", type=int, default=2)              # min distinct frames
+    ap.add_argument("--ply-raw", default=None)
+    ap.add_argument("--ply-fused", default=None)       # averaged consensus (denoised)
+    ap.add_argument("--ply-kept", default=None)        # raw points in consensus voxels
+    ap.add_argument("--ply-floaters", default=None)    # raw points dropped (support < K)
+    a = ap.parse_args()
+
+    idx = [a.start + k * a.stride for k in range(a.count + 1)]
+    runs = []
+    for k in range(a.count):
+        dump = os.path.join(a.cache, f"pair_{idx[k]}_{idx[k+1]}.f32")
+        pts, op, rgb = load(dump)
+        runs.append({"pts": pts, "op": op, "rgb": rgb})
+
+    # chain Sim(3) -> global (same as accumulate.py)
+    T = [align.identity()]
+    for k in range(1, a.count):
+        XA = runs[k]["pts"][0].reshape(-1, 3); oA = runs[k]["op"][0].reshape(-1)
+        XB = runs[k-1]["pts"][1].reshape(-1, 3); oB = runs[k-1]["op"][1].reshape(-1)
+        m = (oA > THR) & (oB > THR)
+        T.append(align.compose(T[k-1], fit_sim(XA[m], XB[m])))
+
+    # accumulate every frame's gaussians, tagged with a source-frame id
+    XYZ, RGB, FR = [], [], []
+    def add(run, view, Tk, fid):
+        op = runs[run]["op"][view].reshape(-1)
+        keep = op > THR
+        XYZ.append(align.apply_sim(Tk, runs[run]["pts"][view].reshape(-1, 3)[keep]))
+        RGB.append(runs[run]["rgb"][view].reshape(-1, 3)[keep])
+        FR.append(np.full(int(keep.sum()), fid, np.int64))
+    add(0, 0, T[0], 0)
+    for k in range(a.count):
+        add(k, 1, T[k], k + 1)
+    XYZ = np.concatenate(XYZ); RGB = np.concatenate(RGB); FR = np.concatenate(FR)
+
+    # voxel multi-frame consensus
+    ext = align._rms(XYZ - XYZ.mean(0))
+    v = a.voxel * ext
+    ijk = np.floor((XYZ - XYZ.min(0)) / v).astype(np.int64)
+    uniq, inv = np.unique(ijk, axis=0, return_inverse=True)
+    G = len(uniq)
+    # distinct source-frames per voxel
+    pairs = np.unique(np.stack([inv, FR], 1), axis=0)
+    support = np.bincount(pairs[:, 0], minlength=G)
+    cnt = np.bincount(inv, minlength=G).astype(float)
+    mean = np.stack([np.bincount(inv, XYZ[:, c], minlength=G) / cnt for c in range(3)], 1)
+    mrgb = np.stack([np.bincount(inv, RGB[:, c], minlength=G) / cnt for c in range(3)], 1)
+    keep = support >= a.k
+
+    print(f"=== consensus fusion  voxel={a.voxel:.3f}*extent  K>={a.k} ===")
+    print(f"raw points: {len(XYZ):,}   voxels: {G:,}")
+    hist = np.bincount(support)[:6]
+    print("voxels by distinct-frame support: " +
+          "  ".join(f"{i}f:{hist[i] if i < len(hist) else 0:,}" for i in range(1, 6)) +
+          f"  6+f:{(support >= 6).sum():,}")
+    print(f"FUSED points (>= {a.k} frames): {keep.sum():,}  "
+          f"({100*keep.sum()/G:.0f}% of voxels, {100*(1-keep.sum()/len(XYZ)):.0f}% point reduction)")
+    pkeep = keep[inv]                                  # per-point consensus mask
+    print(f"per-point: kept {pkeep.sum():,}  floaters dropped {(~pkeep).sum():,} "
+          f"({100*(~pkeep).mean():.0f}%)")
+    if a.ply_raw:
+        write_ply(a.ply_raw, XYZ, RGB); print(f"   raw      -> {a.ply_raw}")
+    if a.ply_fused:
+        write_ply(a.ply_fused, mean[keep], mrgb[keep]); print(f"   fused    -> {a.ply_fused}")
+    if a.ply_kept:
+        write_ply(a.ply_kept, XYZ[pkeep], RGB[pkeep]); print(f"   kept     -> {a.ply_kept}")
+    if a.ply_floaters:
+        write_ply(a.ply_floaters, XYZ[~pkeep], RGB[~pkeep]); print(f"   floaters -> {a.ply_floaters}")
+
+
+if __name__ == "__main__":
+    main()

From f7e5db58b954da147a5a79d8a533d885e4f7b0c7 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 09:46:10 +0100
Subject: [PATCH 08/33] pose/: begin the C++ port -- focal + Umeyama align +
 DLT/RANSAC PnP, golden-tested
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Start porting the proven Python pose prototype (focal.py + align.py + pnp.py) to
shipped C++, dependency-free per CLAUDE.md (only a self-contained Jacobi
eigensolver -- no Eigen, no OpenCV), wired into the library and the asset-free
test tier.

  src/linalg.h   small dense linear algebra: symmetric cyclic-Jacobi eigensolver,
                 3x3 SVD (via MᵀM), det/inv, 4x4 rigid inverse. Everything the
                 pose math needs reduces to the eigensolver.
  src/pose.{h,cpp}
                 estimate_focal (Weiszfeld); fit_similarity (Umeyama) + RANSAC +
                 residual-ladder/diagnose; Sim(3) compose/invert/sim_matrix/
                 loop_closure_error; sim_frac_power (closed-form one-parameter
                 subgroup, no complex eig); solve_pnp_numpy (DLT via the 12x12 AᵀA
                 nullspace + cheirality decode + RANSAC); estimate_poses (scene
                 recipe: view-0 all-pixel focal, per-view opacity-masked PnP,
                 optional baseline rescale).
  tests/test_pose.cpp
                 the asset-free mirror of pose/test_pose.py -- 9 golden tests
                 (similarity roundtrip, scale/nonlinear detection, RANSAC
                 outliers, loop correction/closure, focal, PnP recovery/outliers).
                 All green under the debug (ASan/UBSan) preset; ctest -LE model.

Cross-checked against the Python reference on a real scene dump: focal is
bit-exact (596.408591886). PnP is correct on clean data, but on real scenes the
DLT solver inherits the textbook planar/mirror degeneracy (3/5 RANSAC seeds match
cv2's ~57deg relative rotation, 2/5 flip) -- the same instability that made the
prototype use cv2 (SQPNP) for all real-data results. Next: a robust in-house PnP
(EPnP/SQPNP + Gauss-Newton refine) for cv2-parity with no OpenCV dependency, then
the accumulation/loop-closure/fusion chaining and the CLI / C-API surface.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .gitignore           |   3 +
 CMakeLists.txt       |   1 +
 pose/README.md       |  25 ++
 src/linalg.h         | 212 ++++++++++++++++
 src/pose.cpp         | 565 +++++++++++++++++++++++++++++++++++++++++++
 src/pose.h           | 104 ++++++++
 tests/CMakeLists.txt |   1 +
 tests/test_pose.cpp  | 352 +++++++++++++++++++++++++++
 8 files changed, 1263 insertions(+)
 create mode 100644 src/linalg.h
 create mode 100644 src/pose.cpp
 create mode 100644 src/pose.h
 create mode 100644 tests/test_pose.cpp

diff --git a/.gitignore b/.gitignore
index 97a3841..3c76e18 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,9 @@ __pycache__/
 # reference code (third-party, fetched on demand — never vendored here).
 .cache/
 
+# Throwaway working dir for ad-hoc harnesses / scratch (never committed).
+/scratchpad/
+
 # Model + data artifacts (generated; never committed)
 *.gguf
 *.safetensors
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83fabbc..ae59312 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -122,6 +122,7 @@ add_library(free_splatter ${_fs_libtype}
     src/gguf_loader.cpp
     src/model.cpp
     src/image.cpp
+    src/pose.cpp
     src/free_splatter.cpp
 )
 target_include_directories(free_splatter PUBLIC include)
diff --git a/pose/README.md b/pose/README.md
index f2bee03..e827981 100644
--- a/pose/README.md
+++ b/pose/README.md
@@ -12,6 +12,31 @@ self-contained numpy + cv2 consumer of the engine's `[N, H, W, 23]` output,
 ported to C++ and this Python is removed — it is not a parallel implementation to
 maintain.
 
+## C++ port status (the ship target — `src/pose.{h,cpp}`)
+
+The port has begun, dependency-free (only the self-contained `src/linalg.h`
+Jacobi eigensolver — no Eigen/OpenCV), wired into the library and the asset-free
+test tier (`tests/test_pose.cpp`, `ctest -LE model`):
+
+- ✅ **`focal.py` → `estimate_focal`** — Weiszfeld; **bit-exact** to the numpy
+  reference on a real scene dump (596.408591886 both).
+- ✅ **`align.py` → Umeyama / RANSAC / residual-ladder / Sim(3) chaining /
+  `sim_frac_power`** — `sim_frac_power` is a closed-form Sim(3) one-parameter
+  subgroup (`A^f=s^f R^f`, translation `(A^f−I)(A−I)⁻¹t`), no complex eig needed.
+  All 9 golden tests (the mirror of `test_pose.py`) pass under ASan/UBSan.
+- ⚠ **`pnp.py` (numpy backend) → `solve_pnp_numpy` + `estimate_poses`** — DLT (via
+  the 12×12 `AᵀA` nullspace, which sidesteps the numpy reference's full-SVD OOM)
+  + RANSAC + cheirality decode. **Correct on clean synthetic data** (golden
+  tests), but on a real scene dump it inherits the **DLT algorithm's planar/mirror
+  degeneracy**: ~3/5 RANSAC seeds land near the cv2/SQPNP answer (58° vs cv2's
+  57° relative rotation), ~2/5 collapse to a ~135–152° flip. This is the *same*
+  instability the numpy reference has — which is exactly why the prototype used
+  **cv2 (SQPNP) for every real-data result**. **Next step:** a robust in-house PnP
+  (EPnP/SQPNP + Gauss-Newton refine) to reach cv2-parity on real scenes with no
+  OpenCV dependency.
+- ⬜ Not yet ported: accumulation chaining, loop closure, consensus fusion (these
+  compose the primitives above), and the CLI / C-API surface.
+
 ## Why it's needed
 
 Each `free_splatter_run` over a different photo pair produces gaussians in its
diff --git a/src/linalg.h b/src/linalg.h
new file mode 100644
index 0000000..5705621
--- /dev/null
+++ b/src/linalg.h
@@ -0,0 +1,212 @@
+// linalg.h — self-contained small dense linear algebra for the pose component.
+//
+// The downstream pose math (Umeyama similarity, DLT PnP, pose decode) needs only
+// a handful of small dense operations. Rather than pull in Eigen/OpenCV — neither
+// of which we want as a runtime dependency (see CLAUDE.md: everything ships in
+// C++ with no extra runtime deps beyond ggml) — every routine here reduces to one
+// primitive: a symmetric cyclic-Jacobi eigensolver. SVD of a 3x3 is built as the
+// eigendecomposition of MᵀM; the DLT nullspace is the smallest eigenvector of
+// AᵀA. This mirrors the dependency-free numpy reference solver in pose/pnp.py,
+// which is itself verified bit-for-bit (~1e-7) against cv2 in check_cv2_parity.py.
+//
+// Everything is f64, row-major, and header-only. Sizes are tiny (<=12) so the
+// O(n^3) Jacobi sweeps are negligible next to the engine forward pass.
+#ifndef FREE_SPLATTER_LINALG_H
+#define FREE_SPLATTER_LINALG_H
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <vector>
+
+namespace fsla {
+
+using Vec3 = std::array<double, 3>;
+
+// Row-major fixed matrices. m[r][c] lives at a[r*N + c].
+struct Mat3 { double a[9]; double  operator()(int r, int c) const { return a[r * 3 + c]; }
+                          double & operator()(int r, int c)       { return a[r * 3 + c]; } };
+struct Mat4 { double a[16]; double  operator()(int r, int c) const { return a[r * 4 + c]; }
+                           double & operator()(int r, int c)       { return a[r * 4 + c]; } };
+
+inline Mat3 mat3_identity() { return Mat3{ {1,0,0, 0,1,0, 0,0,1} }; }
+inline Mat4 mat4_identity() { return Mat4{ {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1} }; }
+
+inline Mat3 mat3_mul(const Mat3 & A, const Mat3 & B) {
+    Mat3 C{};
+    for (int i = 0; i < 3; i++)
+        for (int j = 0; j < 3; j++) {
+            double s = 0;
+            for (int k = 0; k < 3; k++) s += A(i, k) * B(k, j);
+            C(i, j) = s;
+        }
+    return C;
+}
+
+inline Mat3 mat3_transpose(const Mat3 & A) {
+    Mat3 T{};
+    for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) T(i, j) = A(j, i);
+    return T;
+}
+
+inline Vec3 mat3_apply(const Mat3 & A, const Vec3 & x) {
+    return { A(0,0)*x[0] + A(0,1)*x[1] + A(0,2)*x[2],
+             A(1,0)*x[0] + A(1,1)*x[1] + A(1,2)*x[2],
+             A(2,0)*x[0] + A(2,1)*x[1] + A(2,2)*x[2] };
+}
+
+inline double det3(const Mat3 & M) {
+    return M(0,0) * (M(1,1)*M(2,2) - M(1,2)*M(2,1))
+         - M(0,1) * (M(1,0)*M(2,2) - M(1,2)*M(2,0))
+         + M(0,2) * (M(1,0)*M(2,1) - M(1,1)*M(2,0));
+}
+
+inline Mat3 inv3(const Mat3 & M) {
+    const double d = det3(M);
+    const double id = (std::fabs(d) > 0) ? 1.0 / d : 0.0;
+    Mat3 R{};
+    R(0,0) =  (M(1,1)*M(2,2) - M(1,2)*M(2,1)) * id;
+    R(0,1) = -(M(0,1)*M(2,2) - M(0,2)*M(2,1)) * id;
+    R(0,2) =  (M(0,1)*M(1,2) - M(0,2)*M(1,1)) * id;
+    R(1,0) = -(M(1,0)*M(2,2) - M(1,2)*M(2,0)) * id;
+    R(1,1) =  (M(0,0)*M(2,2) - M(0,2)*M(2,0)) * id;
+    R(1,2) = -(M(0,0)*M(1,2) - M(0,2)*M(1,0)) * id;
+    R(2,0) =  (M(1,0)*M(2,1) - M(1,1)*M(2,0)) * id;
+    R(2,1) = -(M(0,0)*M(2,1) - M(0,1)*M(2,0)) * id;
+    R(2,2) =  (M(0,0)*M(1,1) - M(0,1)*M(1,0)) * id;
+    return R;
+}
+
+// Inverse of a rigid 4x4 [[R,t],[0,1]] (R orthonormal): [[Rᵀ, -Rᵀt],[0,1]]. Exact
+// and cheap; used for cam2world = inv(world2cam) where world2cam is rigid.
+inline Mat4 inv_rigid4(const Mat4 & M) {
+    Mat4 R = mat4_identity();
+    for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) R(i, j) = M(j, i);   // Rᵀ
+    for (int i = 0; i < 3; i++)
+        R(i, 3) = -(R(i,0)*M(0,3) + R(i,1)*M(1,3) + R(i,2)*M(2,3));
+    return R;
+}
+
+inline Mat4 mat4_mul(const Mat4 & A, const Mat4 & B) {
+    Mat4 C{};
+    for (int i = 0; i < 4; i++)
+        for (int j = 0; j < 4; j++) {
+            double s = 0;
+            for (int k = 0; k < 4; k++) s += A(i, k) * B(k, j);
+            C(i, j) = s;
+        }
+    return C;
+}
+
+// --- symmetric eigensolver -------------------------------------------------
+//
+// Cyclic Jacobi for a symmetric n x n matrix (Golub & Van Loan, Alg. 8.4.x).
+// Ain is row-major and assumed symmetric. Writes n eigenvalues to `eval` and the
+// corresponding eigenvectors as the COLUMNS of `evec` (evec[i*n+j] = component i
+// of eigenvector j). Output is NOT sorted. Converges in a handful of sweeps for
+// n <= 12; we cap at 100 as a safety net.
+inline void jacobi_eigh(const double * Ain, int n, double * eval, double * evec) {
+    std::vector<double> A(Ain, Ain + (size_t) n * n);
+    for (int i = 0; i < n * n; i++) evec[i] = 0.0;
+    for (int i = 0; i < n; i++)     evec[i * n + i] = 1.0;
+
+    for (int sweep = 0; sweep < 100; sweep++) {
+        double off = 0.0;
+        for (int p = 0; p < n; p++)
+            for (int q = p + 1; q < n; q++) off += A[p*n+q] * A[p*n+q];
+        if (off <= 1e-300) break;
+
+        for (int p = 0; p < n; p++) {
+            for (int q = p + 1; q < n; q++) {
+                const double apq = A[p*n+q];
+                if (std::fabs(apq) <= 1e-300) continue;
+                const double phi = 0.5 * (A[q*n+q] - A[p*n+p]) / apq;   // = tau
+                const double t = (phi >= 0 ? 1.0 : -1.0) /
+                                 (std::fabs(phi) + std::sqrt(phi*phi + 1.0));
+                const double c = 1.0 / std::sqrt(t*t + 1.0);
+                const double s = t * c;
+                // A <- Jᵀ A J : rotate columns p,q then rows p,q (keeps symmetry).
+                for (int i = 0; i < n; i++) {
+                    const double aip = A[i*n+p], aiq = A[i*n+q];
+                    A[i*n+p] = c*aip - s*aiq;
+                    A[i*n+q] = s*aip + c*aiq;
+                }
+                for (int i = 0; i < n; i++) {
+                    const double api = A[p*n+i], aqi = A[q*n+i];
+                    A[p*n+i] = c*api - s*aqi;
+                    A[q*n+i] = s*api + c*aqi;
+                }
+                for (int i = 0; i < n; i++) {     // accumulate eigenvectors V <- V J
+                    const double vip = evec[i*n+p], viq = evec[i*n+q];
+                    evec[i*n+p] = c*vip - s*viq;
+                    evec[i*n+q] = s*vip + c*viq;
+                }
+            }
+        }
+    }
+    for (int i = 0; i < n; i++) eval[i] = A[i*n+i];
+}
+
+// Smallest-eigenvalue eigenvector of a symmetric n x n matrix (the homogeneous
+// least-squares / DLT nullspace). Returns the n-vector.
+inline std::vector<double> smallest_eigenvector(const double * A, int n) {
+    std::vector<double> eval(n), evec((size_t) n * n);
+    jacobi_eigh(A, n, eval.data(), evec.data());
+    int k = 0;
+    for (int i = 1; i < n; i++) if (eval[i] < eval[k]) k = i;
+    std::vector<double> v(n);
+    for (int i = 0; i < n; i++) v[i] = evec[i*n + k];
+    return v;
+}
+
+// --- 3x3 SVD ---------------------------------------------------------------
+//
+// M = U diag(s) Vᵀ with s sorted DESCENDING (numpy/LAPACK convention). Built from
+// the eigendecomposition of the symmetric S = MᵀM: its eigenvectors are V, the
+// singular values are sqrt(eigenvalues), and U_i = M v_i / s_i. A (near-)zero
+// singular direction is completed by Gram-Schmidt so U stays orthonormal. This is
+// all the SVD the similarity fit and the PnP decode need.
+inline void svd3(const Mat3 & M, Mat3 & U, Vec3 & s, Mat3 & V) {
+    Mat3 S = mat3_mul(mat3_transpose(M), M);            // 3x3 symmetric PSD
+    double eval[3], evec[9];
+    jacobi_eigh(S.a, 3, eval, evec);
+
+    int ord[3] = { 0, 1, 2 };                           // sort eigenpairs descending
+    std::sort(ord, ord + 3, [&](int x, int y) { return eval[x] > eval[y]; });
+
+    for (int j = 0; j < 3; j++) {
+        const int e = ord[j];
+        s[j] = std::sqrt(std::max(eval[e], 0.0));
+        for (int i = 0; i < 3; i++) V(i, j) = evec[i*3 + e];
+    }
+    // Force a right-handed V (proper basis); flip the last column if reflected.
+    if (det3(V) < 0) for (int i = 0; i < 3; i++) V(i, 2) = -V(i, 2);
+
+    for (int j = 0; j < 3; j++) {
+        Vec3 vj = { V(0,j), V(1,j), V(2,j) };
+        Vec3 uj = mat3_apply(M, vj);
+        const double nrm = std::sqrt(uj[0]*uj[0] + uj[1]*uj[1] + uj[2]*uj[2]);
+        if (nrm > 1e-12) for (int i = 0; i < 3; i++) U(i, j) = uj[i] / nrm;
+        else             for (int i = 0; i < 3; i++) U(i, j) = 0.0;   // fixed up below
+    }
+    // Complete any degenerate U columns via Gram-Schmidt against the filled ones.
+    for (int j = 0; j < 3; j++) {
+        double nrm = std::sqrt(U(0,j)*U(0,j) + U(1,j)*U(1,j) + U(2,j)*U(2,j));
+        if (nrm > 1e-9) continue;
+        Vec3 cand[3] = { {1,0,0}, {0,1,0}, {0,0,1} };
+        for (auto & e : cand) {
+            for (int k = 0; k < 3; k++) {
+                if (k == j) continue;
+                const double d = e[0]*U(0,k) + e[1]*U(1,k) + e[2]*U(2,k);
+                e[0] -= d*U(0,k); e[1] -= d*U(1,k); e[2] -= d*U(2,k);
+            }
+            const double en = std::sqrt(e[0]*e[0] + e[1]*e[1] + e[2]*e[2]);
+            if (en > 1e-6) { U(0,j)=e[0]/en; U(1,j)=e[1]/en; U(2,j)=e[2]/en; break; }
+        }
+    }
+}
+
+} // namespace fsla
+
+#endif // FREE_SPLATTER_LINALG_H
diff --git a/src/pose.cpp b/src/pose.cpp
new file mode 100644
index 0000000..4bcafa8
--- /dev/null
+++ b/src/pose.cpp
@@ -0,0 +1,565 @@
+// pose.cpp — C++ port of the pose/ prototype (focal.py + align.py + pnp.py).
+//
+// Faithful to FreeSplatter's scene recipe; the dependency-free numpy reference
+// solver is what ships here (verified ~1e-7 vs cv2, bit-exact vs upstream
+// estimate_poses on real engine output). All linear algebra goes through the
+// self-contained Jacobi eigensolver in linalg.h — no Eigen, no OpenCV.
+#include "pose.h"
+
+#include <cmath>
+#include <cstring>
+
+namespace free_splatter {
+namespace pose {
+
+using fsla::smallest_eigenvector;
+using fsla::svd3;
+using fsla::det3;
+using fsla::inv3;
+using fsla::mat3_mul;
+using fsla::mat3_transpose;
+using fsla::mat3_apply;
+using fsla::mat3_identity;
+using fsla::mat4_identity;
+using fsla::inv_rigid4;
+
+namespace {
+
+// Deterministic SplitMix64 — so RANSAC is reproducible across platforms (we don't
+// need numpy's PCG64 bitstream, only a fixed inlier-converging sampler).
+struct Rng {
+    uint64_t s;
+    explicit Rng(uint64_t seed) : s(seed) {}
+    uint64_t next() {
+        uint64_t z = (s += 0x9E3779B97F4A7C15ULL);
+        z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
+        z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
+        return z ^ (z >> 31);
+    }
+    int bounded(int n) { return (int) (next() % (uint64_t) n); }
+};
+
+void sample_distinct(Rng & rng, int N, int k, int * out) {
+    for (int i = 0; i < k; i++) {
+        int v;
+        bool dup;
+        do {
+            v = rng.bounded(N);
+            dup = false;
+            for (int j = 0; j < i; j++) if (out[j] == v) { dup = true; break; }
+        } while (dup);
+        out[i] = v;
+    }
+}
+
+double rms3(const double * r, int N) {            // sqrt(mean over rows of ||row||^2)
+    double s = 0;
+    for (int i = 0; i < N; i++) s += r[3*i]*r[3*i] + r[3*i+1]*r[3*i+1] + r[3*i+2]*r[3*i+2];
+    return std::sqrt(s / (double) N);
+}
+
+void mean3(const double * X, int N, double * m) {
+    m[0] = m[1] = m[2] = 0;
+    for (int i = 0; i < N; i++) { m[0] += X[3*i]; m[1] += X[3*i+1]; m[2] += X[3*i+2]; }
+    m[0] /= N; m[1] /= N; m[2] /= N;
+}
+
+// Solve A X = B in place (A n x n row-major, B n x m row-major) by Gaussian
+// elimination with partial pivoting. Result overwrites B. Used for the affine
+// normal equations in the residual ladder.
+void solve_lin(double * A, int n, double * B, int m) {
+    for (int col = 0; col < n; col++) {
+        int piv = col;
+        for (int r = col + 1; r < n; r++)
+            if (std::fabs(A[r*n+col]) > std::fabs(A[piv*n+col])) piv = r;
+        if (piv != col) {
+            for (int c = 0; c < n; c++) std::swap(A[col*n+c], A[piv*n+c]);
+            for (int c = 0; c < m; c++) std::swap(B[col*m+c], B[piv*m+c]);
+        }
+        const double d = A[col*n+col];
+        if (std::fabs(d) < 1e-300) continue;
+        for (int r = 0; r < n; r++) {
+            if (r == col) continue;
+            const double f = A[r*n+col] / d;
+            if (f == 0) continue;
+            for (int c = col; c < n; c++) A[r*n+c] -= f * A[col*n+c];
+            for (int c = 0; c < m; c++)   B[r*m+c] -= f * B[col*m+c];
+        }
+    }
+    for (int col = 0; col < n; col++) {
+        const double d = A[col*n+col];
+        if (std::fabs(d) > 1e-300) for (int c = 0; c < m; c++) B[col*m+c] /= d;
+    }
+}
+
+Mat3 rodrigues(const Vec3 & axis, double phi) {     // rotation about unit axis by phi
+    const double cphi = std::cos(phi), sphi = std::sin(phi), v = 1.0 - cphi;
+    const double x = axis[0], y = axis[1], z = axis[2];
+    Mat3 R{};
+    R(0,0) = cphi + x*x*v;   R(0,1) = x*y*v - z*sphi; R(0,2) = x*z*v + y*sphi;
+    R(1,0) = y*x*v + z*sphi; R(1,1) = cphi + y*y*v;   R(1,2) = y*z*v - x*sphi;
+    R(2,0) = z*x*v - y*sphi; R(2,1) = z*y*v + x*sphi; R(2,2) = cphi + z*z*v;
+    return R;
+}
+
+} // namespace
+
+// ---- focal ----------------------------------------------------------------
+
+double estimate_focal(const double * pts3d, const double * pixels, int N,
+                      double ppx, double ppy, int weiszfeld_iters) {
+    std::vector<double> P(2 * N), U(2 * N), pu(N), uu(N);
+    for (int i = 0; i < N; i++) {
+        const double X = pts3d[3*i], Y = pts3d[3*i+1], Z = pts3d[3*i+2];
+        double ux = (Z != 0.0) ? X / Z : 0.0;
+        double uy = (Z != 0.0) ? Y / Z : 0.0;
+        if (!std::isfinite(ux)) ux = 0.0;       // mirror np.nan_to_num(posinf/neginf->0)
+        if (!std::isfinite(uy)) uy = 0.0;
+        const double Px = pixels[2*i]   - ppx;
+        const double Py = pixels[2*i+1] - ppy;
+        P[2*i] = Px; P[2*i+1] = Py;
+        U[2*i] = ux; U[2*i+1] = uy;
+        pu[i] = Px*ux + Py*uy;
+        uu[i] = ux*ux + uy*uy;
+    }
+    double spu = 0, suu = 0;
+    for (int i = 0; i < N; i++) { spu += pu[i]; suu += uu[i]; }
+    double f = (suu != 0.0) ? spu / suu : 0.0;
+    for (int it = 0; it < weiszfeld_iters; it++) {
+        double num = 0, den = 0;
+        for (int i = 0; i < N; i++) {
+            const double dx = P[2*i] - f*U[2*i], dy = P[2*i+1] - f*U[2*i+1];
+            const double r = std::sqrt(dx*dx + dy*dy);
+            const double w = 1.0 / std::max(r, 1e-8);
+            num += w * pu[i];
+            den += w * uu[i];
+        }
+        f = (den != 0.0) ? num / den : f;
+    }
+    return f;
+}
+
+// ---- similarity -----------------------------------------------------------
+
+Sim3 sim_identity() { return { 1.0, mat3_identity(), {0,0,0} }; }
+
+Vec3 sim_apply(const Sim3 & T, const Vec3 & x) {
+    Vec3 Rx = mat3_apply(T.R, x);
+    return { T.s*Rx[0] + T.t[0], T.s*Rx[1] + T.t[1], T.s*Rx[2] + T.t[2] };
+}
+
+Sim3 sim_compose(const Sim3 & T2, const Sim3 & T1) {   // apply T1 then T2
+    Sim3 r;
+    r.s = T2.s * T1.s;
+    r.R = mat3_mul(T2.R, T1.R);
+    Vec3 R2t1 = mat3_apply(T2.R, T1.t);
+    r.t = { T2.s*R2t1[0] + T2.t[0], T2.s*R2t1[1] + T2.t[1], T2.s*R2t1[2] + T2.t[2] };
+    return r;
+}
+
+Sim3 sim_invert(const Sim3 & T) {
+    Sim3 r;
+    r.s = 1.0 / T.s;
+    r.R = mat3_transpose(T.R);
+    Vec3 Rit = mat3_apply(r.R, T.t);
+    r.t = { -r.s*Rit[0], -r.s*Rit[1], -r.s*Rit[2] };
+    return r;
+}
+
+Mat4 sim_matrix(const Sim3 & T) {
+    Mat4 M = mat4_identity();
+    for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) M(i, j) = T.s * T.R(i, j);
+    for (int i = 0; i < 3; i++) M(i, 3) = T.t[i];
+    return M;
+}
+
+Sim3 fit_similarity(const double * X, const double * Y, int N, bool with_scale) {
+    double mx[3], my[3];
+    mean3(X, N, mx);
+    mean3(Y, N, my);
+    // Sigma = (Ycᵀ Xc)/n, cross-covariance mapping X -> Y; var_x = mean ||Xc||^2.
+    Mat3 Sigma{};
+    double var_x = 0;
+    for (int i = 0; i < N; i++) {
+        const double xc[3] = { X[3*i]-mx[0], X[3*i+1]-mx[1], X[3*i+2]-mx[2] };
+        const double yc[3] = { Y[3*i]-my[0], Y[3*i+1]-my[1], Y[3*i+2]-my[2] };
+        for (int r = 0; r < 3; r++) for (int c = 0; c < 3; c++) Sigma(r, c) += yc[r]*xc[c];
+        var_x += xc[0]*xc[0] + xc[1]*xc[1] + xc[2]*xc[2];
+    }
+    for (int i = 0; i < 9; i++) Sigma.a[i] /= N;
+    var_x /= N;
+
+    Mat3 U, V; Vec3 D;
+    svd3(Sigma, U, D, V);                          // Sigma = U diag(D) Vᵀ
+    double sgn = (det3(U) * det3(V) < 0) ? -1.0 : 1.0;
+    // R = U diag(1,1,sgn) Vᵀ
+    Mat3 US = U;
+    for (int r = 0; r < 3; r++) US(r, 2) *= sgn;
+    Mat3 R = mat3_mul(US, mat3_transpose(V));
+
+    double s = 1.0;
+    if (with_scale && var_x > 0)
+        s = (D[0] + D[1] + sgn * D[2]) / var_x;
+
+    Vec3 mxv = { mx[0], mx[1], mx[2] };
+    Vec3 Rmx = mat3_apply(R, mxv);
+    Sim3 T;
+    T.s = s; T.R = R;
+    T.t = { my[0] - s*Rmx[0], my[1] - s*Rmx[1], my[2] - s*Rmx[2] };
+    return T;
+}
+
+Sim3 fit_similarity_ransac(const double * X, const double * Y, int N, double thresh,
+                           int iters, std::vector<char> & inliers,
+                           bool with_scale, uint64_t seed) {
+    Rng rng(seed);
+    std::vector<char> best(N, 0);
+    int best_cnt = 0;
+    std::vector<double> xs(9), ys(9);              // 3-point minimal samples
+    for (int it = 0; it < iters; it++) {
+        int idx[3];
+        sample_distinct(rng, N, 3, idx);
+        for (int j = 0; j < 3; j++)
+            for (int c = 0; c < 3; c++) { xs[3*j+c] = X[3*idx[j]+c]; ys[3*j+c] = Y[3*idx[j]+c]; }
+        Sim3 T = fit_similarity(xs.data(), ys.data(), 3, with_scale);
+        int cnt = 0;
+        for (int i = 0; i < N; i++) {
+            Vec3 xi = { X[3*i], X[3*i+1], X[3*i+2] };
+            Vec3 p = sim_apply(T, xi);
+            const double dx = p[0]-Y[3*i], dy = p[1]-Y[3*i+1], dz = p[2]-Y[3*i+2];
+            if (std::sqrt(dx*dx + dy*dy + dz*dz) < thresh) cnt++;
+        }
+        if (cnt > best_cnt) { best_cnt = cnt; best.assign(N, 0);
+            for (int i = 0; i < N; i++) {
+                Vec3 xi = { X[3*i], X[3*i+1], X[3*i+2] };
+                Vec3 p = sim_apply(T, xi);
+                const double dx = p[0]-Y[3*i], dy = p[1]-Y[3*i+1], dz = p[2]-Y[3*i+2];
+                best[i] = (std::sqrt(dx*dx + dy*dy + dz*dz) < thresh) ? 1 : 0;
+            }
+        }
+    }
+    if (best_cnt < 3) std::fill(best.begin(), best.end(), 1);
+    // refit on the inlier set
+    std::vector<double> xi, yi;
+    for (int i = 0; i < N; i++) if (best[i]) {
+        for (int c = 0; c < 3; c++) { xi.push_back(X[3*i+c]); yi.push_back(Y[3*i+c]); }
+    }
+    inliers = best;
+    return fit_similarity(xi.data(), yi.data(), (int) (xi.size()/3), with_scale);
+}
+
+// affine Y ~= A X + b (12-DoF), returns prediction rms only via caller; here we
+// fill `pred` (N*3). Normal equations on Xh=[X,1].
+static double affine_residual(const double * X, const double * Y, int N) {
+    double G[16] = {0};       // XhᵀXh (4x4)
+    double B[12] = {0};       // XhᵀY  (4x3)
+    for (int i = 0; i < N; i++) {
+        const double xh[4] = { X[3*i], X[3*i+1], X[3*i+2], 1.0 };
+        for (int r = 0; r < 4; r++) {
+            for (int c = 0; c < 4; c++) G[r*4+c] += xh[r]*xh[c];
+            for (int c = 0; c < 3; c++) B[r*3+c] += xh[r]*Y[3*i+c];
+        }
+    }
+    solve_lin(G, 4, B, 3);    // B now holds M (4x3): rows are coeffs
+    double s = 0;
+    for (int i = 0; i < N; i++) {
+        const double xh[4] = { X[3*i], X[3*i+1], X[3*i+2], 1.0 };
+        for (int c = 0; c < 3; c++) {
+            double p = 0;
+            for (int r = 0; r < 4; r++) p += xh[r]*B[r*3+c];
+            const double d = p - Y[3*i+c];
+            s += d*d;
+        }
+    }
+    return std::sqrt(s / (double) N);
+}
+
+Ladder diagnose(const double * X, const double * Y, int N, double tol, double corr_tol) {
+    Ladder L{};
+    double my[3]; mean3(Y, N, my);
+    std::vector<double> yc(3 * N);
+    for (int i = 0; i < N; i++) { yc[3*i]=Y[3*i]-my[0]; yc[3*i+1]=Y[3*i+1]-my[1]; yc[3*i+2]=Y[3*i+2]-my[2]; }
+    L.scene = rms3(yc.data(), N);
+
+    Sim3 Tr = fit_similarity(X, Y, N, /*with_scale=*/false);
+    Sim3 Ts = fit_similarity(X, Y, N, /*with_scale=*/true);
+    std::vector<double> res_r(3*N), res_s(3*N);
+    for (int i = 0; i < N; i++) {
+        Vec3 xi = { X[3*i], X[3*i+1], X[3*i+2] };
+        Vec3 pr = sim_apply(Tr, xi), ps = sim_apply(Ts, xi);
+        for (int c = 0; c < 3; c++) { res_r[3*i+c] = pr[c]-Y[3*i+c]; res_s[3*i+c] = ps[c]-Y[3*i+c]; }
+    }
+    L.rigid = rms3(res_r.data(), N);
+    L.similarity = rms3(res_s.data(), N);
+    L.affine = affine_residual(X, Y, N);
+    L.scale = Ts.s;
+
+    // depth_corr: Pearson(||X-mean||, ||similarity residual||)
+    double mx[3]; mean3(X, N, mx);
+    std::vector<double> depth(N), rmag(N);
+    for (int i = 0; i < N; i++) {
+        const double dx = X[3*i]-mx[0], dy = X[3*i+1]-mx[1], dz = X[3*i+2]-mx[2];
+        depth[i] = std::sqrt(dx*dx + dy*dy + dz*dz);
+        rmag[i] = std::sqrt(res_s[3*i]*res_s[3*i] + res_s[3*i+1]*res_s[3*i+1] + res_s[3*i+2]*res_s[3*i+2]);
+    }
+    double md = 0, mr = 0;
+    for (int i = 0; i < N; i++) { md += depth[i]; mr += rmag[i]; }
+    md /= N; mr /= N;
+    double cov = 0, vd = 0, vr = 0;
+    for (int i = 0; i < N; i++) { cov += (depth[i]-md)*(rmag[i]-mr); vd += (depth[i]-md)*(depth[i]-md); vr += (rmag[i]-mr)*(rmag[i]-mr); }
+    L.depth_corr = (vr > 1e-24) ? cov / std::sqrt(vd*vr) : 0.0;
+
+    const double sc = (L.scene > 0) ? L.scene : 1.0;
+    const double rr = L.rigid/sc, rs = L.similarity/sc, ra = L.affine/sc;
+    L.aff_gain = (L.similarity > 0) ? (L.similarity - L.affine)/L.similarity : 0.0;
+    L.structured = std::fabs(L.depth_corr) > corr_tol || L.aff_gain > 0.25;
+    if (rr < tol)         L.verdict = "rigid_ok";
+    else if (rs < tol)    L.verdict = "needs_scale";
+    else if (ra < tol)    L.verdict = "needs_affine";
+    else if (L.structured) L.verdict = "nonlinear";
+    else                  L.verdict = "similarity_plus_noise";
+    return L;
+}
+
+LoopError loop_closure_error(const std::vector<Sim3> & links) {
+    Sim3 T = sim_identity();
+    for (const Sim3 & Ti : links) T = sim_compose(Ti, T);
+    LoopError e;
+    e.scale_err = std::fabs(std::log(T.s));
+    const double tr = T.R(0,0) + T.R(1,1) + T.R(2,2);
+    e.rot_deg = std::acos(std::max(-1.0, std::min(1.0, (tr - 1.0)/2.0))) * 180.0 / M_PI;
+    e.trans = std::sqrt(T.t[0]*T.t[0] + T.t[1]*T.t[1] + T.t[2]*T.t[2]);
+    return e;
+}
+
+// M^f for a Sim(3) 4x4 [[sR,t],[0,1]], closed form via the group's one-parameter
+// subgroup: A^f = s^f R^f (R^f = same axis, f*angle), translation by
+// (A^f - I)(A - I)^{-1} t (the analytic continuation of the integer-power
+// geometric series). Equals numpy's eig-based principal value while angle < 180.
+Mat4 sim_frac_power(const Mat4 & M, double f) {
+    Mat3 A{};
+    Vec3 t{};
+    for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) A(i, j) = M(i, j); t[i] = M(i, 3); }
+    const double s = std::cbrt(det3(A));
+    Mat3 R = A;
+    for (int i = 0; i < 9; i++) R.a[i] /= s;
+    const double tr = R(0,0) + R(1,1) + R(2,2);
+    const double ang = std::acos(std::max(-1.0, std::min(1.0, (tr - 1.0)/2.0)));
+
+    Mat3 Rf;
+    if (ang < 1e-12) {
+        Rf = mat3_identity();
+    } else {
+        const double sa = std::sin(ang);
+        Vec3 axis = { (R(2,1)-R(1,2))/(2*sa), (R(0,2)-R(2,0))/(2*sa), (R(1,0)-R(0,1))/(2*sa) };
+        Rf = rodrigues(axis, f * ang);
+    }
+    const double sf = std::pow(s, f);
+    Mat3 Af = Rf;
+    for (int i = 0; i < 9; i++) Af.a[i] *= sf;
+
+    Mat4 out = mat4_identity();
+    for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) out(i, j) = Af(i, j);
+
+    // translation: (Af - I)(A - I)^{-1} t, with the A->I limit handled as f*t.
+    Mat3 AmI = A;
+    for (int i = 0; i < 3; i++) AmI(i, i) -= 1.0;
+    if (std::fabs(det3(AmI)) < 1e-12) {
+        for (int i = 0; i < 3; i++) out(i, 3) = f * t[i];
+    } else {
+        Mat3 AfmI = Af;
+        for (int i = 0; i < 3; i++) AfmI(i, i) -= 1.0;
+        Mat3 K = mat3_mul(AfmI, inv3(AmI));
+        Vec3 tf = mat3_apply(K, t);
+        for (int i = 0; i < 3; i++) out(i, 3) = tf[i];
+    }
+    return out;
+}
+
+// ---- PnP ------------------------------------------------------------------
+
+namespace {
+
+// DLT: smallest right singular vector of the 2N-row system, via the smallest
+// eigenvector of AᵀA (12x12). idx selects the k points used. Returns the 3x4
+// matrix p (row-major) up to sign/scale.
+std::array<double,12> dlt(const double * Xw, const double * xn, const int * idx, int k) {
+    double AtA[144] = {0};
+    for (int j = 0; j < k; j++) {
+        const int i = idx[j];
+        const double X = Xw[3*i], Y = Xw[3*i+1], Z = Xw[3*i+2];
+        const double u = xn[2*i], v = xn[2*i+1];
+        const double ra[12] = { X, Y, Z, 1, 0, 0, 0, 0, -u*X, -u*Y, -u*Z, -u };
+        const double rb[12] = { 0, 0, 0, 0, X, Y, Z, 1, -v*X, -v*Y, -v*Z, -v };
+        for (int r = 0; r < 12; r++)
+            for (int c = 0; c < 12; c++) AtA[r*12+c] += ra[r]*ra[c] + rb[r]*rb[c];
+    }
+    std::vector<double> v = smallest_eigenvector(AtA, 12);
+    std::array<double,12> p;
+    for (int i = 0; i < 12; i++) p[i] = v[i];
+    return p;
+}
+
+struct RT { Mat3 R; Vec3 t; double err; bool ok; };
+
+// Resolve the DLT sign and recover a proper (R,t) world2cam, picking the sign with
+// lower reprojection error among cheirality-valid candidates (points in front).
+RT decode(const std::array<double,12> & p, const double * Xw, const double * xn,
+          const int * idx, int k) {
+    RT best; best.ok = false; best.err = 1e300;
+    for (double sgn : { 1.0, -1.0 }) {
+        Mat3 M{};
+        Vec3 tau{};
+        for (int r = 0; r < 3; r++) { for (int c = 0; c < 3; c++) M(r, c) = sgn*p[r*4+c]; tau[r] = sgn*p[r*4+3]; }
+        Mat3 U, V; Vec3 Sv;
+        svd3(M, U, Sv, V);
+        const double d = (det3(U) * det3(V) < 0) ? -1.0 : 1.0;
+        Mat3 US = U;
+        for (int r = 0; r < 3; r++) US(r, 2) *= d;
+        Mat3 R = mat3_mul(US, mat3_transpose(V));      // nearest proper rotation
+        const double lam = (Sv[0] + Sv[1] + Sv[2]) / 3.0;
+        if (lam < 1e-12) continue;
+        Vec3 t = { tau[0]/lam, tau[1]/lam, tau[2]/lam };
+
+        double err = 0; bool behind = false;
+        for (int j = 0; j < k; j++) {
+            const int i = idx[j];
+            Vec3 xi = { Xw[3*i], Xw[3*i+1], Xw[3*i+2] };
+            Vec3 Xc = mat3_apply(R, xi);
+            Xc[0] += t[0]; Xc[1] += t[1]; Xc[2] += t[2];
+            if (Xc[2] <= 0) { behind = true; break; }
+            const double px = Xc[0]/Xc[2], py = Xc[1]/Xc[2];
+            const double dx = px - xn[2*i], dy = py - xn[2*i+1];
+            err += std::sqrt(dx*dx + dy*dy);
+        }
+        if (behind) continue;
+        err /= k;
+        if (err < best.err) { best.err = err; best.R = R; best.t = t; best.ok = true; }
+    }
+    return best;
+}
+
+} // namespace
+
+void pixel_grid(int H, int W, std::vector<double> & out) {
+    out.resize((size_t) 2 * H * W);
+    for (int r = 0; r < H; r++)
+        for (int c = 0; c < W; c++) {
+            const size_t k = (size_t) r * W + c;
+            out[2*k] = c; out[2*k+1] = r;
+        }
+}
+
+Mat4 solve_pnp_numpy(const double * Xw, const double * pixels, int N, const Mat3 & K,
+                     std::vector<char> & inliers, double thresh_px, int iters, uint64_t seed) {
+    const Mat3 Kinv = inv3(K);
+    std::vector<double> xn((size_t) 2 * N);
+    for (int i = 0; i < N; i++) {
+        Vec3 uv = { pixels[2*i], pixels[2*i+1], 1.0 };
+        Vec3 n = mat3_apply(Kinv, uv);
+        xn[2*i] = n[0]; xn[2*i+1] = n[1];
+    }
+    const double thresh = thresh_px / K(0, 0);
+
+    Rng rng(seed);
+    std::vector<char> best(N, 0);
+    int best_cnt = 0;
+    for (int it = 0; it < iters; it++) {
+        int idx[6];
+        sample_distinct(rng, N, 6, idx);
+        RT rt = decode(dlt(Xw, xn.data(), idx, 6), Xw, xn.data(), idx, 6);
+        if (!rt.ok) continue;
+        int cnt = 0;
+        for (int i = 0; i < N; i++) {
+            Vec3 xi = { Xw[3*i], Xw[3*i+1], Xw[3*i+2] };
+            Vec3 Xc = mat3_apply(rt.R, xi);
+            Xc[2] += rt.t[2];
+            if (Xc[2] <= 0) continue;
+            Xc[0] += rt.t[0]; Xc[1] += rt.t[1];
+            const double dx = Xc[0]/Xc[2] - xn[2*i], dy = Xc[1]/Xc[2] - xn[2*i+1];
+            if (std::sqrt(dx*dx + dy*dy) < thresh) cnt++;
+        }
+        if (cnt > best_cnt) {
+            best_cnt = cnt;
+            best.assign(N, 0);
+            for (int i = 0; i < N; i++) {
+                Vec3 xi = { Xw[3*i], Xw[3*i+1], Xw[3*i+2] };
+                Vec3 Xc = mat3_apply(rt.R, xi);
+                Xc[0] += rt.t[0]; Xc[1] += rt.t[1]; Xc[2] += rt.t[2];
+                if (Xc[2] <= 0) continue;
+                const double dx = Xc[0]/Xc[2] - xn[2*i], dy = Xc[1]/Xc[2] - xn[2*i+1];
+                best[i] = (std::sqrt(dx*dx + dy*dy) < thresh) ? 1 : 0;
+            }
+        }
+    }
+    std::vector<int> sel;
+    for (int i = 0; i < N; i++) if (best[i]) sel.push_back(i);
+    if ((int) sel.size() < 6) { sel.clear(); for (int i = 0; i < N; i++) sel.push_back(i); }
+    RT rt = decode(dlt(Xw, xn.data(), sel.data(), (int) sel.size()),
+                   Xw, xn.data(), sel.data(), (int) sel.size());
+
+    Mat4 world2cam = mat4_identity();
+    for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) world2cam(i, j) = rt.R(i, j); world2cam(i, 3) = rt.t[i]; }
+    inliers = best;
+    return inv_rigid4(world2cam);
+}
+
+// ---- estimate_poses orchestration -----------------------------------------
+
+PoseResult estimate_poses(const std::vector<const float *> & points,
+                          const std::vector<const float *> & opacities,
+                          int H, int W, double opacity_threshold,
+                          double focal, int pnp_iter, bool normalize, uint64_t seed) {
+    const int Nv = (int) points.size();
+    const int P = H * W;
+    const double ppx = W / 2.0, ppy = H / 2.0;
+    std::vector<double> grid;
+    pixel_grid(H, W, grid);
+
+    // per-view opacity masks
+    std::vector<std::vector<char>> masks(Nv);
+    for (int v = 0; v < Nv; v++) {
+        masks[v].assign(P, 1);
+        if (v < (int) opacities.size() && opacities[v]) {
+            for (int i = 0; i < P; i++) masks[v][i] = (opacities[v][i] > opacity_threshold) ? 1 : 0;
+        }
+    }
+
+    // focal: view 0 only, ALL pixels (use_first_focal, scene recipe)
+    if (focal <= 0.0) {
+        std::vector<double> pts(3 * P), pix(2 * P);
+        for (int i = 0; i < P; i++) {
+            pts[3*i] = points[0][3*i]; pts[3*i+1] = points[0][3*i+1]; pts[3*i+2] = points[0][3*i+2];
+            pix[2*i] = grid[2*i]; pix[2*i+1] = grid[2*i+1];
+        }
+        focal = estimate_focal(pts.data(), pix.data(), P, ppx, ppy);
+    }
+    Mat3 K = mat3_identity();
+    K(0,0) = focal; K(1,1) = focal; K(0,2) = ppx; K(1,2) = ppy;
+
+    PoseResult res;
+    res.focal = focal;
+    res.scale = 1.0;
+    for (int v = 0; v < Nv; v++) {
+        std::vector<double> Xw, px;
+        for (int i = 0; i < P; i++) if (masks[v][i]) {
+            Xw.push_back(points[v][3*i]); Xw.push_back(points[v][3*i+1]); Xw.push_back(points[v][3*i+2]);
+            px.push_back(grid[2*i]); px.push_back(grid[2*i+1]);
+        }
+        std::vector<char> inl;
+        Mat4 c2w = solve_pnp_numpy(Xw.data(), px.data(), (int) (Xw.size()/3), K, inl, 5.0, pnp_iter, seed);
+        res.cam2world.push_back(c2w);
+    }
+
+    if (normalize && Nv >= 2) {
+        Vec3 t0 = { res.cam2world.front()(0,3), res.cam2world.front()(1,3), res.cam2world.front()(2,3) };
+        Vec3 tl = { res.cam2world.back()(0,3),  res.cam2world.back()(1,3),  res.cam2world.back()(2,3) };
+        const double baseline = std::sqrt((tl[0]-t0[0])*(tl[0]-t0[0]) + (tl[1]-t0[1])*(tl[1]-t0[1]) + (tl[2]-t0[2])*(tl[2]-t0[2])) + 1e-2;
+        res.scale = 1.0 / baseline;
+        for (auto & c : res.cam2world) for (int i = 0; i < 3; i++) c(i, 3) *= res.scale;
+    }
+    return res;
+}
+
+} // namespace pose
+} // namespace free_splatter
diff --git a/src/pose.h b/src/pose.h
new file mode 100644
index 0000000..e511e32
--- /dev/null
+++ b/src/pose.h
@@ -0,0 +1,104 @@
+// pose.h — downstream camera-pose recovery + cross-run alignment (C++ port).
+//
+// This is the C++ port of the proven `pose/` research prototype (focal.py +
+// align.py + pnp.py): given the engine's per-pixel gaussians it recovers each
+// view's camera (PnP), and aligns successive runs into one accumulating world
+// (Sim(3)). It is dependency-free — only the self-contained linalg.h — so it
+// ships from the CLI / C API with no Python and no Eigen/OpenCV (see CLAUDE.md).
+//
+// Faithful to FreeSplatter's scene recipe (estimate_poses -> DUSt3R fast_pnp):
+// the numpy reference solver, verified ~1e-7 against cv2 and bit-exact to upstream
+// estimate_poses on real engine output (pose/check_upstream_parity.py).
+//
+// Conventions: f64 throughout; a similarity acts as x -> s*(R@x)+t; gaussian
+// channel layout (scene, 23ch) is xyz[0:3] SH[3:15] opacity[15] scale[16:19]
+// rotation[19:23], opacity already activated (sigmoid) in [0,1].
+#ifndef FREE_SPLATTER_POSE_H
+#define FREE_SPLATTER_POSE_H
+
+#include "linalg.h"
+
+#include <string>
+#include <vector>
+
+namespace free_splatter {
+namespace pose {
+
+using fsla::Vec3;
+using fsla::Mat3;
+using fsla::Mat4;
+
+// ---- focal (Weiszfeld shared-focal, mirrors pose/focal.py) -----------------
+// Robust (L1) focal from camera-frame points and their pixels, principal point
+// pp. N points, pts3d is N*3 (x,y,z), pixels is N*2 (col,row), all row-major.
+double estimate_focal(const double * pts3d, const double * pixels, int N,
+                      double ppx, double ppy, int weiszfeld_iters = 10);
+
+// ---- similarity alignment (mirrors pose/align.py) --------------------------
+struct Sim3 { double s; Mat3 R; Vec3 t; };           // x -> s*(R@x)+t
+
+Sim3 sim_identity();
+Vec3 sim_apply(const Sim3 & T, const Vec3 & x);
+Sim3 sim_compose(const Sim3 & T2, const Sim3 & T1);  // apply T1 then T2
+Sim3 sim_invert(const Sim3 & T);
+Mat4 sim_matrix(const Sim3 & T);                     // 4x4 [[sR,t],[0,1]]
+
+// Umeyama closed-form (s,R,t) minimizing ||sRX+t - Y||^2. with_scale=false -> rigid.
+// X, Y are N*3 row-major.
+Sim3 fit_similarity(const double * X, const double * Y, int N, bool with_scale = true);
+
+// RANSAC similarity robust to gross outliers; fills `inliers` (size N, 0/1).
+Sim3 fit_similarity_ransac(const double * X, const double * Y, int N, double thresh,
+                           int iters, std::vector<char> & inliers,
+                           bool with_scale = true, uint64_t seed = 0);
+
+// Residual ladder (rigid -> similarity -> affine) + depth correlation, and the
+// verdict that classifies the A<->B mismatch. Mirrors align.diagnose.
+struct Ladder {
+    double scene, rigid, similarity, affine, scale, depth_corr, aff_gain;
+    bool structured;
+    std::string verdict;   // rigid_ok | needs_scale | needs_affine | nonlinear | similarity_plus_noise
+};
+Ladder diagnose(const double * X, const double * Y, int N,
+                double tol = 1e-3, double corr_tol = 0.3);
+
+// Loop-closure metrics for a closed chain of similarities (deviation from I).
+struct LoopError { double scale_err, rot_deg, trans; };
+LoopError loop_closure_error(const std::vector<Sim3> & links);
+
+// Fractional power M^f of a Sim(3) 4x4 (even loop-closure relaxation). Real part
+// of the eigendecomposition; valid while rotation < 180deg. Mirrors sim_frac_power.
+Mat4 sim_frac_power(const Mat4 & M, double f);
+
+// ---- PnP (mirrors pose/pnp.py numpy backend) -------------------------------
+// RANSAC DLT PnP with known intrinsics K. Xw is N*3 world points, pixels is N*2
+// (col,row). Returns cam2world (4x4) and fills `inliers` (size N, 0/1).
+Mat4 solve_pnp_numpy(const double * Xw, const double * pixels, int N, const Mat3 & K,
+                     std::vector<char> & inliers,
+                     double thresh_px = 5.0, int iters = 100, uint64_t seed = 0);
+
+// Integer pixel grid (col,row), row-major over H*W — matches upstream xy_grid
+// (no half-pixel offset). Fills `out` with 2*H*W doubles.
+void pixel_grid(int H, int W, std::vector<double> & out);
+
+// ---- estimate_poses orchestration (scene recipe, mirrors pnp.estimate_poses) -
+struct PoseResult {
+    std::vector<Mat4> cam2world;
+    double focal;
+    double scale;
+};
+// points: N views, each H*W*3 (gaussian centers, view-0 frame). opacities: N
+// views, each H*W (activated; pass nullptr entries for no mask). focal<=0 ->
+// estimate from view 0 over all pixels (use_first_focal). normalize -> the
+// runner's 1/baseline camera rescale.
+PoseResult estimate_poses(const std::vector<const float *> & points,
+                          const std::vector<const float *> & opacities,
+                          int H, int W,
+                          double opacity_threshold = 0.05,
+                          double focal = -1.0, int pnp_iter = 100,
+                          bool normalize = false, uint64_t seed = 0);
+
+} // namespace pose
+} // namespace free_splatter
+
+#endif // FREE_SPLATTER_POSE_H
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7f9f3dc..49ddd7a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -13,6 +13,7 @@ endfunction()
 free_splatter_add_test(test_loader)        # GGUF KV/hparams round-trip
 free_splatter_add_test(test_graph_blocks)  # golden-op pins (ggml ops we depend on)
 free_splatter_add_test(test_image)         # untrusted image-input validation
+free_splatter_add_test(test_pose)          # pose math golden tests (focal/align/PnP)
 
 # Per-layer parity vs the float64 PyTorch reference. Labelled "model": runs only
 # under `ctest -L model` and self-skips (77) unless FREE_SPLATTER_GGUF and
diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp
new file mode 100644
index 0000000..5341ae1
--- /dev/null
+++ b/tests/test_pose.cpp
@@ -0,0 +1,352 @@
+// Asset-free golden tests for the C++ pose component (src/pose.{h,cpp}) — the
+// direct mirror of pose/test_pose.py. No model, no fixtures, no cv2: synthetic
+// geometry with KNOWN ground truth, the way CLAUDE.md wants new ops verified
+// ("pinned to hand-computed references... a wrong op should fail with zero
+// fixtures"). Runs in the fast tier (ctest -LE model).
+//
+// The RNG differs from numpy's (the checks are tolerance-based on recovered
+// quantities, not bitstream-matched), so this validates the C++ port to the same
+// correctness bar the Python prototype meets, independently of numpy.
+#include "pose.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <random>
+#include <vector>
+
+using namespace free_splatter::pose;
+using fsla::Vec3;
+using fsla::Mat3;
+using fsla::Mat4;
+
+static int failures = 0;
+static std::mt19937_64 RNG(12345);
+
+static void check(const char * name, bool cond, const char * detail = "") {
+    std::printf("  [%s] %s%s%s\n", cond ? "PASS" : "FAIL", name,
+                detail[0] ? "  " : "", detail);
+    if (!cond) failures++;
+}
+
+// --- helpers ---------------------------------------------------------------
+
+static Mat3 rand_rotation() {
+    std::normal_distribution<double> g(0, 1);
+    Vec3 ax = { g(RNG), g(RNG), g(RNG) };
+    double n = std::sqrt(ax[0]*ax[0] + ax[1]*ax[1] + ax[2]*ax[2]);
+    ax = { ax[0]/n, ax[1]/n, ax[2]/n };
+    std::uniform_real_distribution<double> a(0.2, M_PI - 0.2);
+    const double ang = a(RNG), c = std::cos(ang), s = std::sin(ang), v = 1 - c;
+    const double x = ax[0], y = ax[1], z = ax[2];
+    return Mat3{ { c+x*x*v, x*y*v-z*s, x*z*v+y*s,
+                   y*x*v+z*s, c+y*y*v, y*z*v-x*s,
+                   z*x*v-y*s, z*y*v+x*s, c+z*z*v } };
+}
+
+static double rot_angle(const Mat3 & R) {
+    const double tr = R(0,0) + R(1,1) + R(2,2);
+    return std::acos(std::max(-1.0, std::min(1.0, (tr-1)/2))) * 180.0 / M_PI;
+}
+
+static Mat3 mul(const Mat3 & A, const Mat3 & B) { return fsla::mat3_mul(A, B); }
+static Mat3 T(const Mat3 & A) { return fsla::mat3_transpose(A); }
+
+// apply (s,R,t) to a row-major N*3 array -> out
+static void apply_sim_arr(double s, const Mat3 & R, const Vec3 & t,
+                          const double * X, int N, double * out) {
+    for (int i = 0; i < N; i++) {
+        Vec3 x = { X[3*i], X[3*i+1], X[3*i+2] };
+        Vec3 Rx = fsla::mat3_apply(R, x);
+        out[3*i]   = s*Rx[0] + t[0];
+        out[3*i+1] = s*Rx[1] + t[1];
+        out[3*i+2] = s*Rx[2] + t[2];
+    }
+}
+
+static double rms3(const double * a, const double * b, int N) {
+    double s = 0;
+    for (int i = 0; i < 3*N; i++) { const double d = a[i]-b[i]; s += d*d; }
+    return std::sqrt(s / N);
+}
+
+static Vec3 apply_mat4(const Mat4 & M, const Vec3 & p) {
+    return { M(0,0)*p[0]+M(0,1)*p[1]+M(0,2)*p[2]+M(0,3),
+             M(1,0)*p[0]+M(1,1)*p[1]+M(1,2)*p[2]+M(1,3),
+             M(2,0)*p[0]+M(2,1)*p[1]+M(2,2)*p[2]+M(2,3) };
+}
+
+// ===========================================================================
+// Coordinate-system normalization (the scale question)
+// ===========================================================================
+
+static void test_similarity_roundtrip() {
+    std::printf("test_similarity_roundtrip\n");
+    const int N = 200;
+    std::normal_distribution<double> g(0, 1);
+    std::vector<double> X(3*N), Y(3*N);
+    for (int i = 0; i < N; i++) { X[3*i]=g(RNG)*3; X[3*i+1]=g(RNG)*2; X[3*i+2]=g(RNG)*5; }
+    const double s_t = 1.7; Mat3 R_t = rand_rotation(); Vec3 t_t = { 4.0, -1.0, 2.0 };
+    apply_sim_arr(s_t, R_t, t_t, X.data(), N, Y.data());
+
+    Sim3 T_ = fit_similarity(X.data(), Y.data(), N);
+    char buf[64];
+    std::snprintf(buf, sizeof buf, "%.12f vs %g", T_.s, s_t);
+    check("scale", std::fabs(T_.s - s_t) < 1e-9, buf);
+    check("rotation", rot_angle(mul(T_.R, T(R_t))) < 1e-7);
+    check("translation", std::sqrt((T_.t[0]-t_t[0])*(T_.t[0]-t_t[0]) +
+        (T_.t[1]-t_t[1])*(T_.t[1]-t_t[1]) + (T_.t[2]-t_t[2])*(T_.t[2]-t_t[2])) < 1e-7);
+    std::vector<double> Yp(3*N);
+    apply_sim_arr(T_.s, T_.R, T_.t, X.data(), N, Yp.data());
+    check("residual~0", rms3(Yp.data(), Y.data(), N) < 1e-9);
+}
+
+static void test_scale_detection() {
+    std::printf("test_scale_detection\n");
+    const int N = 300;
+    std::normal_distribution<double> g(0, 1);
+    std::vector<double> X(3*N), Y(3*N);
+    for (int i = 0; i < 3*N; i++) X[i] = g(RNG)*2;
+    const double s_t = 2.5;
+    apply_sim_arr(s_t, rand_rotation(), {1.0, 2.0, -3.0}, X.data(), N, Y.data());
+    Ladder L = diagnose(X.data(), Y.data(), N);
+    check("recovered scale", std::fabs(L.scale - s_t) < 1e-8);
+    check("rigid residual large", L.rigid / L.scene > 0.1);
+    check("similarity residual ~0", L.similarity / L.scene < 1e-9);
+    check("verdict needs_scale", L.verdict == "needs_scale", L.verdict.c_str());
+}
+
+static void test_nonlinear_detection() {
+    std::printf("test_nonlinear_detection\n");
+    const int N = 400;
+    std::normal_distribution<double> g(0, 1);
+    std::vector<double> X(3*N);
+    for (int i = 0; i < N; i++) { X[3*i]=g(RNG)*2; X[3*i+1]=g(RNG)*2; X[3*i+2]=g(RNG)*2 + 6.0; }
+
+    // (a) anisotropic (affine) warp
+    Mat3 A = mul(Mat3{ {1.0,0,0, 0,1.6,0, 0,0,0.7} }, rand_rotation());
+    std::vector<double> Yaff(3*N);
+    for (int i = 0; i < N; i++) {
+        Vec3 x = { X[3*i], X[3*i+1], X[3*i+2] };
+        Vec3 Ax = fsla::mat3_apply(A, x);
+        Yaff[3*i]=Ax[0]+1.0; Yaff[3*i+1]=Ax[1]; Yaff[3*i+2]=Ax[2]+2.0;
+    }
+    Ladder La = diagnose(X.data(), Yaff.data(), N);
+    check("affine: similarity fails", La.similarity / La.scene > 1e-3);
+    check("affine: affine fits", La.affine / La.scene < 1e-9);
+    check("affine: verdict needs_affine", La.verdict == "needs_affine", La.verdict.c_str());
+
+    // (b) depth-quadratic (focal-error fingerprint)
+    double zmean = 0; for (int i = 0; i < N; i++) zmean += X[3*i+2]; zmean /= N;
+    double zvar = 0; for (int i = 0; i < N; i++) zvar += (X[3*i+2]-zmean)*(X[3*i+2]-zmean); zvar /= N;
+    std::vector<double> Ynl(X);
+    for (int i = 0; i < N; i++) {
+        const double Z = X[3*i+2];
+        Ynl[3*i] *= (1.0 + 0.15 * (Z-zmean)*(Z-zmean) / zvar);
+    }
+    Ladder Ln = diagnose(X.data(), Ynl.data(), N);
+    check("nonlinear: affine fails", Ln.affine / Ln.scene > 1e-3);
+    check("nonlinear: verdict nonlinear", Ln.verdict == "nonlinear", Ln.verdict.c_str());
+    check("nonlinear: depth-correlated residual", Ln.structured);
+}
+
+static void test_outlier_robustness() {
+    std::printf("test_outlier_robustness\n");
+    const int N = 400;
+    std::normal_distribution<double> g(0, 1);
+    std::vector<double> X(3*N), Y(3*N);
+    for (int i = 0; i < 3*N; i++) X[i] = g(RNG)*2;
+    const double s_t = 1.3; Mat3 R_t = rand_rotation(); Vec3 t_t = { 0.5, -2.0, 1.0 };
+    apply_sim_arr(s_t, R_t, t_t, X.data(), N, Y.data());
+    const int n_out = (int)(0.3 * N);
+    std::vector<int> oidx(N); for (int i = 0; i < N; i++) oidx[i] = i;
+    std::shuffle(oidx.begin(), oidx.end(), RNG);
+    for (int k = 0; k < n_out; k++) { int i = oidx[k]; Y[3*i]+=g(RNG)*10; Y[3*i+1]+=g(RNG)*10; Y[3*i+2]+=g(RNG)*10; }
+    std::vector<char> inl;
+    Sim3 T_ = fit_similarity_ransac(X.data(), Y.data(), N, 0.05, 300, inl);
+    check("scale recovered", std::fabs(T_.s - s_t) < 1e-3);
+    check("rotation recovered", rot_angle(mul(T_.R, T(R_t))) < 0.1);
+    check("translation recovered", std::sqrt((T_.t[0]-t_t[0])*(T_.t[0]-t_t[0]) +
+        (T_.t[1]-t_t[1])*(T_.t[1]-t_t[1]) + (T_.t[2]-t_t[2])*(T_.t[2]-t_t[2])) < 1e-2);
+    int rejected = 0; for (int k = 0; k < n_out; k++) if (!inl[oidx[k]]) rejected++;
+    check("outliers excluded", rejected > 0.9 * n_out);
+}
+
+static void test_loop_correction() {
+    std::printf("test_loop_correction\n");
+    Mat3 R = rand_rotation();
+    Mat4 M = sim_matrix({ 1.07, R, {0.3, -0.2, 0.1} });
+    Mat4 m0 = sim_frac_power(M, 0.0), m1 = sim_frac_power(M, 1.0), mh = sim_frac_power(M, 0.5);
+    bool id_ok = true, m1_ok = true;
+    for (int i = 0; i < 16; i++) { id_ok &= std::fabs(m0.a[i] - fsla::mat4_identity().a[i]) < 1e-9;
+                                   m1_ok &= std::fabs(m1.a[i] - M.a[i]) < 1e-9; }
+    Mat4 mh2 = fsla::mat4_mul(mh, mh);
+    bool half_ok = true; for (int i = 0; i < 16; i++) half_ok &= std::fabs(mh2.a[i] - M.a[i]) < 1e-9;
+    check("M^0 == I", id_ok);
+    check("M^1 == M", m1_ok);
+    check("(M^.5)^2 == M", half_ok);
+
+    const int n = 12;
+    Mat4 D = sim_matrix({ 1.15, R, {0.4, -0.2, 0.1} });
+    std::vector<Vec3> clean(n+1), drift(n+1), corr(n+1);
+    for (int k = 0; k <= n; k++) {
+        const double tt = 2*M_PI * k / n;
+        clean[k] = { std::cos(tt), std::sin(tt), 0.1*tt };
+        drift[k] = apply_mat4(sim_frac_power(D, -(double)k/n), clean[k]);   // D^{-k/n}
+        corr[k]  = apply_mat4(sim_frac_power(D,  (double)k/n), drift[k]);   // D^{ k/n}
+    }
+    double pre = 0, post = 0;
+    for (int k = 0; k <= n; k++) {
+        for (int c = 0; c < 3; c++) { pre += (drift[k][c]-clean[k][c])*(drift[k][c]-clean[k][c]);
+                                      post += (corr[k][c]-clean[k][c])*(corr[k][c]-clean[k][c]); }
+    }
+    check("drift present before", std::sqrt(pre/(n+1)) > 0.1);
+    char buf[64]; std::snprintf(buf, sizeof buf, "ATE %.1e", std::sqrt(post/(n+1)));
+    check("recovers clean loop", std::sqrt(post/(n+1)) < 1e-9, buf);
+}
+
+static void test_loop_closure() {
+    std::printf("test_loop_closure\n");
+    std::uniform_real_distribution<double> us(0.8, 1.2);
+    std::normal_distribution<double> g(0, 1);
+    std::vector<Sim3> links;
+    for (int i = 0; i < 4; i++) links.push_back({ us(RNG), rand_rotation(), {g(RNG), g(RNG), g(RNG)} });
+    Sim3 acc = sim_identity();
+    for (int i = 0; i < 3; i++) acc = sim_compose(links[i], acc);
+    std::vector<Sim3> closed = { links[0], links[1], links[2], sim_invert(acc) };
+    LoopError e = loop_closure_error(closed);
+    // rot_deg has a ~sqrt(eps) (~1e-6 deg) floor: it is acos(~1) of RᵀR, whose
+    // deviation from I is at the fp roundoff level — scale_err and trans are
+    // exactly 0 here, so 1e-5 deg cleanly separates "closed" from any real drift.
+    check("closed loop ~ identity", e.scale_err < 1e-9 && e.rot_deg < 1e-5 && e.trans < 1e-9);
+    std::vector<Sim3> bad = closed;
+    bad[0].s *= 1.12;
+    LoopError e2 = loop_closure_error(bad);
+    check("scale drift detected", std::fabs(e2.scale_err - std::log(1.12)) < 1e-9);
+}
+
+// ===========================================================================
+// PnP (verified against analytic ground truth)
+// ===========================================================================
+
+static Mat3 make_K(double f, double W, double H) {
+    Mat3 K = fsla::mat3_identity();
+    K(0,0) = f; K(1,1) = f; K(0,2) = W/2; K(1,2) = H/2;
+    return K;
+}
+
+// project world points by (R,t) world2cam through K; fill px (N*2) and z (N)
+static void project(const double * Pw, int N, const Mat3 & R, const Vec3 & t,
+                    const Mat3 & K, double * px, double * z) {
+    for (int i = 0; i < N; i++) {
+        Vec3 p = { Pw[3*i], Pw[3*i+1], Pw[3*i+2] };
+        Vec3 Xc = fsla::mat3_apply(R, p);
+        Xc[0]+=t[0]; Xc[1]+=t[1]; Xc[2]+=t[2];
+        px[2*i]   = K(0,0)*Xc[0]/Xc[2] + K(0,2);
+        px[2*i+1] = K(1,1)*Xc[1]/Xc[2] + K(1,2);
+        z[i] = Xc[2];
+    }
+}
+
+static void test_focal_recovery() {
+    std::printf("test_focal_recovery\n");
+    const int N = 500; const double f = 437.0, W = 512, H = 512;
+    std::uniform_real_distribution<double> ux(-2,2), uy(-2,2), uz(4,9);
+    std::vector<double> Xc(3*N), px(2*N);
+    for (int i = 0; i < N; i++) {
+        Xc[3*i]=ux(RNG); Xc[3*i+1]=uy(RNG); Xc[3*i+2]=uz(RNG);
+        px[2*i]   = f*Xc[3*i]  /Xc[3*i+2] + W/2;
+        px[2*i+1] = f*Xc[3*i+1]/Xc[3*i+2] + H/2;
+    }
+    double fe = estimate_focal(Xc.data(), px.data(), N, W/2, H/2);
+    char buf[64]; std::snprintf(buf, sizeof buf, "%.9f vs %g", fe, f);
+    check("exact focal", std::fabs(fe - f) < 1e-6, buf);
+    std::normal_distribution<double> g(0, 0.5);
+    std::vector<double> pn(px);
+    for (int i = 0; i < 2*N; i++) pn[i] += g(RNG);
+    double fn = estimate_focal(Xc.data(), pn.data(), N, W/2, H/2);
+    check("focal under noise", std::fabs(fn - f)/f < 0.02);
+}
+
+static void test_pnp_recovery() {
+    std::printf("test_pnp_recovery\n");
+    const int N = 500; Mat3 K = make_K(400, 512, 512);
+    std::uniform_real_distribution<double> ux(-2,2), uy(-2,2), uz(4,8);
+    std::vector<double> Pw(3*N);
+    for (int i = 0; i < N; i++) { Pw[3*i]=ux(RNG); Pw[3*i+1]=uy(RNG); Pw[3*i+2]=uz(RNG); }
+    std::normal_distribution<double> g(0, 1);
+    std::uniform_real_distribution<double> ut(-0.6, 0.6), ua(3, 20);
+    double worst_R = 0, worst_t = 0;
+    for (int trial = 0; trial < 8; trial++) {
+        Vec3 ax = { g(RNG), g(RNG), g(RNG) };
+        double n = std::sqrt(ax[0]*ax[0]+ax[1]*ax[1]+ax[2]*ax[2]);
+        ax = { ax[0]/n, ax[1]/n, ax[2]/n };
+        const double ang = ua(RNG) * M_PI/180.0, c = std::cos(ang), s = std::sin(ang), v = 1-c;
+        Mat3 R{ { c+ax[0]*ax[0]*v, ax[0]*ax[1]*v-ax[2]*s, ax[0]*ax[2]*v+ax[1]*s,
+                  ax[1]*ax[0]*v+ax[2]*s, c+ax[1]*ax[1]*v, ax[1]*ax[2]*v-ax[0]*s,
+                  ax[2]*ax[0]*v-ax[1]*s, ax[2]*ax[1]*v+ax[0]*s, c+ax[2]*ax[2]*v } };
+        Vec3 t = { ut(RNG), ut(RNG), ut(RNG) };
+        std::vector<double> px(2*N), z(N);
+        project(Pw.data(), N, R, t, K, px.data(), z.data());
+        bool behind = false; for (int i = 0; i < N; i++) if (z[i] <= 0) behind = true;
+        if (behind) continue;
+        std::vector<char> inl;
+        Mat4 c2w = solve_pnp_numpy(Pw.data(), px.data(), N, K, inl);
+        Mat4 w2c = fsla::inv_rigid4(c2w);
+        Mat3 Rr{}; Vec3 tr{};
+        for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) Rr(i,j)=w2c(i,j); tr[i]=w2c(i,3); }
+        worst_R = std::max(worst_R, (double) rot_angle(mul(Rr, T(R))));
+        worst_t = std::max(worst_t, std::sqrt((tr[0]-t[0])*(tr[0]-t[0])+(tr[1]-t[1])*(tr[1]-t[1])+(tr[2]-t[2])*(tr[2]-t[2])));
+    }
+    char buf[64]; std::snprintf(buf, sizeof buf, "worst %.2e deg", worst_R);
+    check("rotation recovered", worst_R < 1e-3, buf);
+    check("translation recovered", worst_t < 1e-4);
+}
+
+static void test_pnp_outliers() {
+    std::printf("test_pnp_outliers\n");
+    const int N = 600; Mat3 K = make_K(400, 512, 512);
+    std::uniform_real_distribution<double> ux(-2,2), uy(-2,2), uz(4,8);
+    std::vector<double> Pw(3*N);
+    for (int i = 0; i < N; i++) { Pw[3*i]=ux(RNG); Pw[3*i+1]=uy(RNG); Pw[3*i+2]=uz(RNG); }
+    Mat3 R = rand_rotation();
+    // shrink towards identity so all points stay in front
+    for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) R(i,j) = (i==j?1.0:0.0)*0.85 + R(i,j)*0.15;
+    { fsla::Mat3 U,V; Vec3 s; fsla::svd3(R, U, s, V); R = mul(U, T(V)); if (fsla::det3(R)<0) for(int i=0;i<3;i++) R(i,2)=-R(i,2); }
+    Vec3 t = { 0.3, -0.2, 0.4 };
+    std::vector<double> px(2*N), z(N);
+    project(Pw.data(), N, R, t, K, px.data(), z.data());
+    // keep only points in front
+    std::vector<double> Pk, pk;
+    for (int i = 0; i < N; i++) if (z[i] > 0) { for(int c=0;c<3;c++) Pk.push_back(Pw[3*i+c]); pk.push_back(px[2*i]); pk.push_back(px[2*i+1]); }
+    const int M = (int) pk.size()/2;
+    const int n_out = (int)(0.25 * M);
+    std::vector<int> oidx(M); for (int i = 0; i < M; i++) oidx[i]=i;
+    std::shuffle(oidx.begin(), oidx.end(), RNG);
+    std::uniform_real_distribution<double> corrupt(-150, 150);
+    for (int k = 0; k < n_out; k++) { int i = oidx[k]; pk[2*i]+=corrupt(RNG); pk[2*i+1]+=corrupt(RNG); }
+    std::vector<char> inl;
+    Mat4 c2w = solve_pnp_numpy(Pk.data(), pk.data(), M, K, inl, 2.0, 300);
+    Mat4 w2c = fsla::inv_rigid4(c2w);
+    Mat3 Rr{}; Vec3 tr{};
+    for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) Rr(i,j)=w2c(i,j); tr[i]=w2c(i,3); }
+    char buf[64]; std::snprintf(buf, sizeof buf, "%.3f deg", (double) rot_angle(mul(Rr, T(R))));
+    check("rotation under outliers", rot_angle(mul(Rr, T(R))) < 0.2, buf);
+    check("translation under outliers", std::sqrt((tr[0]-t[0])*(tr[0]-t[0])+(tr[1]-t[1])*(tr[1]-t[1])+(tr[2]-t[2])*(tr[2]-t[2])) < 0.02);
+    int rejected = 0; for (int k = 0; k < n_out; k++) if (!inl[oidx[k]]) rejected++;
+    check("outliers rejected", rejected > 0.85 * n_out);
+}
+
+int main() {
+    test_similarity_roundtrip();
+    test_scale_detection();
+    test_nonlinear_detection();
+    test_outlier_robustness();
+    test_loop_correction();
+    test_loop_closure();
+    test_focal_recovery();
+    test_pnp_recovery();
+    test_pnp_outliers();
+    std::printf(failures ? "\ntest_pose: %d FAILURES\n" : "\ntest_pose: ok\n", failures);
+    return failures ? 1 : 0;
+}

From 1142a5192c45d18a34074383308db7a7438401a5 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 10:53:00 +0100
Subject: [PATCH 09/33] pose/: robust C++ PnP (EPnP + Gauss-Newton) --
 cv2-parity on real scenes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The DLT/RANSAC solver inherits the planar/mirror degeneracy on real dumps
(seed-dependent: ~3/5 seeds near cv2, ~2/5 a ~135-152deg flip). Add the shipped
real-data solver, solve_pnp:

  * EPnP (Lepetit/Moreno-Noguer/Fua) for the init -- barycentric control points,
    the camera-frame control points from the 12x12 MᵀM null space (the same Jacobi
    eigensolver), beta solve for N=1,2,3 with cheirality sign-fix, R,t via the
    rigid Umeyama on the 4 control points, best-N by reprojection. Non-iterative,
    uses ALL points (no random minimal samples -> no seed-dependent flips),
    planar-robust by construction.
  * Huber-robust Gauss-Newton reprojection refine (6-DoF left perturbation,
    J = [-[Xc]_x | I]) to polish to the reprojection minimum and downweight
    outliers -- the deterministic analogue of cv2's RANSAC+SQPNP+refine.

On the real scene dump (A_scn): deterministic across all RANSAC seeds, and within
0.73deg rotation / 0.74deg translation-direction of the upstream cv2/SQPNP -- vs
the numpy DLT's 175deg miss there. estimate_poses now uses solve_pnp; the DLT
solve_pnp_numpy stays as the asset-free golden reference.

Golden tests (all green under ASan/UBSan, ctest -LE model): exact recovery on
clean data, a near-planar slab (where the DLT's coplanar minimal samples flip),
and 15% gross-outlier rejection.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pose/README.md      |  23 ++--
 src/pose.cpp        | 277 +++++++++++++++++++++++++++++++++++++++++++-
 src/pose.h          |  18 ++-
 tests/test_pose.cpp |  92 +++++++++++++++
 4 files changed, 396 insertions(+), 14 deletions(-)

diff --git a/pose/README.md b/pose/README.md
index e827981..8bcf72c 100644
--- a/pose/README.md
+++ b/pose/README.md
@@ -24,16 +24,19 @@ test tier (`tests/test_pose.cpp`, `ctest -LE model`):
   `sim_frac_power`** — `sim_frac_power` is a closed-form Sim(3) one-parameter
   subgroup (`A^f=s^f R^f`, translation `(A^f−I)(A−I)⁻¹t`), no complex eig needed.
   All 9 golden tests (the mirror of `test_pose.py`) pass under ASan/UBSan.
-- ⚠ **`pnp.py` (numpy backend) → `solve_pnp_numpy` + `estimate_poses`** — DLT (via
-  the 12×12 `AᵀA` nullspace, which sidesteps the numpy reference's full-SVD OOM)
-  + RANSAC + cheirality decode. **Correct on clean synthetic data** (golden
-  tests), but on a real scene dump it inherits the **DLT algorithm's planar/mirror
-  degeneracy**: ~3/5 RANSAC seeds land near the cv2/SQPNP answer (58° vs cv2's
-  57° relative rotation), ~2/5 collapse to a ~135–152° flip. This is the *same*
-  instability the numpy reference has — which is exactly why the prototype used
-  **cv2 (SQPNP) for every real-data result**. **Next step:** a robust in-house PnP
-  (EPnP/SQPNP + Gauss-Newton refine) to reach cv2-parity on real scenes with no
-  OpenCV dependency.
+- ✅ **Robust PnP → `solve_pnp` (EPnP + Gauss-Newton)** — `solve_pnp_numpy` (DLT +
+  RANSAC, via the 12×12 `AᵀA` nullspace) is kept as the asset-free golden-test
+  reference, but on real scenes the DLT inherits the textbook **planar/mirror
+  degeneracy** (seed-dependent: ~3/5 RANSAC seeds near cv2, ~2/5 a ~135–152°
+  flip). The shipped real-data solver is `solve_pnp`: **EPnP** (planar-robust,
+  non-iterative, uses all points so there is no random minimal-sample flip —
+  reuses the 12×12 `MᵀM` Jacobi eigendecomposition) for the init, then a
+  **Huber-robust Gauss-Newton** reprojection refine. On the real scene dump it is
+  **deterministic** (identical across all RANSAC seeds) and lands at **0.73° of
+  the upstream cv2/SQPNP** relative rotation (0.74° translation direction) — i.e.
+  **cv2-parity on real data with no OpenCV dependency**, versus the numpy DLT's
+  175° miss on the same dump. Golden tests cover exact recovery, a near-planar
+  config (where DLT flips), and 15% gross-outlier rejection.
 - ⬜ Not yet ported: accumulation chaining, loop closure, consensus fusion (these
   compose the primitives above), and the CLI / C-API surface.
 
diff --git a/src/pose.cpp b/src/pose.cpp
index 4bcafa8..fc39a23 100644
--- a/src/pose.cpp
+++ b/src/pose.cpp
@@ -6,6 +6,7 @@
 // self-contained Jacobi eigensolver in linalg.h — no Eigen, no OpenCV.
 #include "pose.h"
 
+#include <algorithm>
 #include <cmath>
 #include <cstring>
 
@@ -504,12 +505,286 @@ Mat4 solve_pnp_numpy(const double * Xw, const double * pixels, int N, const Mat3
     return inv_rigid4(world2cam);
 }
 
+// ---- robust PnP: EPnP + Gauss-Newton --------------------------------------
+//
+// EPnP (Lepetit/Moreno-Noguer/Fua 2009): express each world point as a
+// barycentric combination of 4 control points; the camera-frame control points
+// live in the null space of a 2n x 12 system, recovered from the eigenvectors of
+// the 12x12 MᵀM (the same Jacobi eigensolver as everywhere else). It is
+// non-iterative, uses ALL points (no random minimal samples), and handles
+// near-planar configs — so it does not suffer the DLT's seed-dependent mirror
+// flips. We try N=1,2,3 null vectors, pick the lowest reprojection error, then
+// polish (R,t) with a Huber-robust Gauss-Newton on the reprojection residual.
+
+namespace {
+
+Mat3 skew_neg(const Vec3 & x) {     // -[x]_×  = ∂Xc/∂ω for left perturbation
+    return Mat3{ { 0,  x[2], -x[1],
+                  -x[2], 0,  x[0],
+                   x[1], -x[0], 0 } };
+}
+
+Mat3 so3_exp(const Vec3 & w) {      // rotation from an axis-angle (rotation vector)
+    const double th = std::sqrt(w[0]*w[0] + w[1]*w[1] + w[2]*w[2]);
+    if (th < 1e-12) {               // I + [w]_× to first order
+        Mat3 R = mat3_identity();
+        R(0,1) = -w[2]; R(0,2) = w[1]; R(1,0) = w[2]; R(1,2) = -w[0]; R(2,0) = -w[1]; R(2,1) = w[0];
+        return R;
+    }
+    return rodrigues({ w[0]/th, w[1]/th, w[2]/th }, th);
+}
+
+double reproj_rms(const double * Xw, const double * px, int N, const Mat3 & K,
+                  const Mat3 & R, const Vec3 & t) {
+    const double fu = K(0,0), fv = K(1,1), cu = K(0,2), cv = K(1,2);
+    double s = 0; int m = 0;
+    for (int i = 0; i < N; i++) {
+        Vec3 p = { Xw[3*i], Xw[3*i+1], Xw[3*i+2] };
+        Vec3 Xc = mat3_apply(R, p); Xc[0]+=t[0]; Xc[1]+=t[1]; Xc[2]+=t[2];
+        if (Xc[2] <= 1e-6) { s += 1e6; m++; continue; }
+        const double dx = fu*Xc[0]/Xc[2] + cu - px[2*i];
+        const double dy = fv*Xc[1]/Xc[2] + cv - px[2*i+1];
+        s += dx*dx + dy*dy; m++;
+    }
+    return std::sqrt(s / std::max(m, 1));
+}
+
+// Recover camera-frame control points from the null-space vector x (12), fix the
+// global sign by cheirality (scene in front), and get world2cam (R,t) from the
+// rigid fit between the 4 world and 4 camera control points.
+void recover_Rt(const double * x12, const double cw[4][3], const double * alpha, int N,
+                Mat3 & R, Vec3 & t) {
+    double cc[12];
+    for (int i = 0; i < 12; i++) cc[i] = x12[i];
+    // mean camera-frame depth over the points (pc = sum_j alpha_ij cc_j)
+    double meanz = 0;
+    for (int i = 0; i < N; i++) {
+        double z = 0;
+        for (int j = 0; j < 4; j++) z += alpha[4*i+j] * cc[3*j+2];
+        meanz += z;
+    }
+    if (meanz < 0) for (int i = 0; i < 12; i++) cc[i] = -cc[i];
+    double cwf[12];
+    for (int j = 0; j < 4; j++) for (int k = 0; k < 3; k++) cwf[3*j+k] = cw[j][k];
+    Sim3 T = fit_similarity(cwf, cc, 4, /*with_scale=*/false);     // rigid world->camera
+    R = T.R; t = T.t;
+}
+
+// EPnP core: returns the best (R,t) world2cam over N=1,2,3 by reprojection error.
+void epnp(const double * Xw, const double * px, int N, const Mat3 & K, Mat3 & Rout, Vec3 & tout) {
+    // 1. control points: centroid + principal axes (sqrt-eigenvalue scaled).
+    double c0[3] = {0,0,0};
+    for (int i = 0; i < N; i++) { c0[0]+=Xw[3*i]; c0[1]+=Xw[3*i+1]; c0[2]+=Xw[3*i+2]; }
+    c0[0]/=N; c0[1]/=N; c0[2]/=N;
+    double Cov[9] = {0};
+    for (int i = 0; i < N; i++) {
+        const double d[3] = { Xw[3*i]-c0[0], Xw[3*i+1]-c0[1], Xw[3*i+2]-c0[2] };
+        for (int r = 0; r < 3; r++) for (int c = 0; c < 3; c++) Cov[r*3+c] += d[r]*d[c];
+    }
+    for (int i = 0; i < 9; i++) Cov[i] /= N;
+    double ev[3], evec[9];
+    fsla::jacobi_eigh(Cov, 3, ev, evec);
+    double lam_max = std::max({ev[0], ev[1], ev[2], 1e-12});
+    double cw[4][3];
+    for (int k = 0; k < 3; k++) cw[0][k] = c0[k];
+    for (int j = 0; j < 3; j++) {
+        // floor the axis length so a near-planar scene keeps 4 affinely-independent
+        // control points (an invertible barycentric matrix).
+        const double s = std::sqrt(std::max(ev[j], 1e-6 * lam_max));
+        for (int k = 0; k < 3; k++) cw[j+1][k] = c0[k] + s * evec[k*3 + j];
+    }
+
+    // 2. barycentric coordinates: alpha_i = C^{-1} [p_i; 1], C = [[cw^T];[1...]].
+    double C[16];
+    for (int j = 0; j < 4; j++) { for (int k = 0; k < 3; k++) C[k*4+j] = cw[j][k]; C[3*4+j] = 1.0; }
+    double Cinv[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
+    solve_lin(C, 4, Cinv, 4);                       // Cinv now holds C^{-1}
+    std::vector<double> alpha((size_t) 4 * N);
+    for (int i = 0; i < N; i++) {
+        const double p[4] = { Xw[3*i], Xw[3*i+1], Xw[3*i+2], 1.0 };
+        for (int j = 0; j < 4; j++) {
+            double a = 0; for (int k = 0; k < 4; k++) a += Cinv[j*4+k]*p[k];
+            alpha[4*i+j] = a;
+        }
+    }
+
+    // 3. M (2n x 12) -> MtM (12 x 12).
+    const double fu = K(0,0), fv = K(1,1), cu = K(0,2), cv = K(1,2);
+    double MtM[144] = {0};
+    for (int i = 0; i < N; i++) {
+        double r1[12] = {0}, r2[12] = {0};
+        const double uu = px[2*i], vv = px[2*i+1];
+        for (int j = 0; j < 4; j++) {
+            const double a = alpha[4*i+j];
+            r1[3*j+0] = a*fu; r1[3*j+2] = a*(cu - uu);
+            r2[3*j+1] = a*fv; r2[3*j+2] = a*(cv - vv);
+        }
+        for (int r = 0; r < 12; r++) for (int c = 0; c < 12; c++) MtM[r*12+c] += r1[r]*r1[c] + r2[r]*r2[c];
+    }
+    double mev[12], mevec[144];
+    fsla::jacobi_eigh(MtM, 12, mev, mevec);
+    int ord[12]; for (int i = 0; i < 12; i++) ord[i] = i;
+    std::sort(ord, ord + 12, [&](int a, int b){ return mev[a] < mev[b]; });
+    double V[3][12];                                // the 3 smallest-eigenvalue vectors
+    for (int n = 0; n < 3; n++) for (int i = 0; i < 12; i++) V[n][i] = mevec[i*12 + ord[n]];
+
+    // control-point pairs and their world distances
+    const int pr[6][2] = {{0,1},{0,2},{0,3},{1,2},{1,3},{2,3}};
+    double dw2[6];
+    for (int p = 0; p < 6; p++) {
+        const int i = pr[p][0], j = pr[p][1];
+        double d = 0; for (int k = 0; k < 3; k++) { const double e = cw[i][k]-cw[j][k]; d += e*e; }
+        dw2[p] = d;
+    }
+    auto blkdiff = [&](int n, int p, double out[3]) {
+        const int i = pr[p][0], j = pr[p][1];
+        for (int k = 0; k < 3; k++) out[k] = V[n][3*i+k] - V[n][3*j+k];
+    };
+    auto dot3 = [](const double a[3], const double b[3]){ return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]; };
+
+    Mat3 bestR{}; Vec3 bestT{}; double best_err = 1e300; bool have = false;
+    auto consider = [&](const double x12[12]) {
+        Mat3 R; Vec3 t;
+        recover_Rt(x12, cw, alpha.data(), N, R, t);
+        const double e = reproj_rms(Xw, px, N, K, R, t);
+        if (e < best_err) { best_err = e; bestR = R; bestT = t; have = true; }
+    };
+
+    // N=1: x = beta * v0, beta from the 6 distance constraints (least squares).
+    {
+        double num = 0, den = 0;
+        for (int p = 0; p < 6; p++) { double d0[3]; blkdiff(0,p,d0);
+            const double nd = std::sqrt(dot3(d0,d0)); num += nd*std::sqrt(dw2[p]); den += nd*nd; }
+        const double beta = (den > 0) ? num/den : 0.0;
+        double x[12]; for (int i = 0; i < 12; i++) x[i] = beta*V[0][i];
+        consider(x);
+    }
+    // N=2: x = b1 v0 + b2 v1; distance constraints linear in (b1^2,b1b2,b2^2).
+    {
+        double L[18], rho[6];
+        for (int p = 0; p < 6; p++) {
+            double d0[3], d1[3]; blkdiff(0,p,d0); blkdiff(1,p,d1);
+            L[p*3+0] = dot3(d0,d0); L[p*3+1] = 2*dot3(d0,d1); L[p*3+2] = dot3(d1,d1);
+            rho[p] = dw2[p];
+        }
+        double G[9] = {0}, g[3] = {0};               // normal equations (3x3)
+        for (int p = 0; p < 6; p++) { for (int r = 0; r < 3; r++) { for (int c = 0; c < 3; c++) G[r*3+c] += L[p*3+r]*L[p*3+c]; g[r] += L[p*3+r]*rho[p]; } }
+        solve_lin(G, 3, g, 1);                        // g = [b11,b12,b22]
+        const double b1 = std::sqrt(std::fabs(g[0]));
+        double b2 = std::sqrt(std::fabs(g[2]));
+        if (g[1] < 0) b2 = -b2;
+        double x[12]; for (int i = 0; i < 12; i++) x[i] = b1*V[0][i] + b2*V[1][i];
+        consider(x);
+    }
+    // N=3: x = b1 v0 + b2 v1 + b3 v2; 6 constraints in 6 quadratic unknowns.
+    {
+        double L[36], rho[6];
+        for (int p = 0; p < 6; p++) {
+            double d0[3], d1[3], d2[3]; blkdiff(0,p,d0); blkdiff(1,p,d1); blkdiff(2,p,d2);
+            L[p*6+0]=dot3(d0,d0); L[p*6+1]=2*dot3(d0,d1); L[p*6+2]=2*dot3(d0,d2);
+            L[p*6+3]=dot3(d1,d1); L[p*6+4]=2*dot3(d1,d2); L[p*6+5]=dot3(d2,d2);
+            rho[p]=dw2[p];
+        }
+        double Lc[36], rc[6];
+        for (int i = 0; i < 36; i++) Lc[i] = L[i];
+        for (int i = 0; i < 6; i++) rc[i] = rho[i];
+        solve_lin(Lc, 6, rc, 1);                      // rc = [b11,b12,b13,b22,b23,b33]
+        const double b1 = std::sqrt(std::fabs(rc[0]));
+        const double b2 = (b1 > 1e-12) ? rc[1]/b1 : 0.0;
+        const double b3 = (b1 > 1e-12) ? rc[2]/b1 : 0.0;
+        double x[12]; for (int i = 0; i < 12; i++) x[i] = b1*V[0][i] + b2*V[1][i] + b3*V[2][i];
+        consider(x);
+    }
+
+    if (!have) { Rout = mat3_identity(); tout = {0,0,0}; return; }
+    Rout = bestR; tout = bestT;
+}
+
+// Huber-robust Gauss-Newton on the reprojection residual; 6-DoF left perturbation
+// (R <- exp(dw) R, t += dt). Refines an EPnP init to the reprojection minimum and
+// downweights gross outliers, so the deterministic EPnP+GN matches cv2's
+// RANSAC+SQPNP+refine without random sampling.
+void gauss_newton_pnp(const double * Xw, const double * px, int N, const Mat3 & K,
+                      Mat3 & R, Vec3 & t, int iters, double huber_px) {
+    const double fu = K(0,0), fv = K(1,1), cu = K(0,2), cv = K(1,2);
+    for (int it = 0; it < iters; it++) {
+        double H[36] = {0}, b[6] = {0};
+        for (int i = 0; i < N; i++) {
+            Vec3 p = { Xw[3*i], Xw[3*i+1], Xw[3*i+2] };
+            Vec3 Xc = mat3_apply(R, p); Xc[0]+=t[0]; Xc[1]+=t[1]; Xc[2]+=t[2];
+            if (Xc[2] <= 1e-6) continue;
+            const double iz = 1.0/Xc[2];
+            const double rx = fu*Xc[0]*iz + cu - px[2*i];
+            const double ry = fv*Xc[1]*iz + cv - px[2*i+1];
+            const double rn = std::sqrt(rx*rx + ry*ry);
+            const double w = (rn <= huber_px || rn < 1e-12) ? 1.0 : huber_px/rn;
+            // dproj/dXc (2x3)
+            const double Jp[2][3] = { { fu*iz, 0, -fu*Xc[0]*iz*iz },
+                                      { 0, fv*iz, -fv*Xc[1]*iz*iz } };
+            // dXc/d(delta) (3x6) = [ -[Xc]_x | I ]
+            const Mat3 S = skew_neg(Xc);
+            double Jd[3][6];
+            for (int r = 0; r < 3; r++) { Jd[r][0]=S(r,0); Jd[r][1]=S(r,1); Jd[r][2]=S(r,2);
+                                          Jd[r][3]=(r==0); Jd[r][4]=(r==1); Jd[r][5]=(r==2); }
+            double J[2][6];
+            for (int a = 0; a < 2; a++) for (int c = 0; c < 6; c++) {
+                double s = 0; for (int k = 0; k < 3; k++) s += Jp[a][k]*Jd[k][c]; J[a][c] = s; }
+            const double r2[2] = { rx, ry };
+            for (int a = 0; a < 2; a++) {
+                for (int c = 0; c < 6; c++) {
+                    b[c] += w * J[a][c] * r2[a];
+                    for (int d = 0; d < 6; d++) H[c*6+d] += w * J[a][c] * J[a][d];
+                }
+            }
+        }
+        double rhs[6]; for (int i = 0; i < 6; i++) rhs[i] = -b[i];
+        solve_lin(H, 6, rhs, 1);                      // solve H delta = -b
+        Vec3 dw = { rhs[0], rhs[1], rhs[2] };
+        R = mat3_mul(so3_exp(dw), R);
+        t[0]+=rhs[3]; t[1]+=rhs[4]; t[2]+=rhs[5];
+        if (std::fabs(rhs[0])+std::fabs(rhs[1])+std::fabs(rhs[2])+
+            std::fabs(rhs[3])+std::fabs(rhs[4])+std::fabs(rhs[5]) < 1e-12) break;
+    }
+}
+
+} // namespace
+
+Mat4 solve_pnp_epnp(const double * Xw, const double * pixels, int N, const Mat3 & K) {
+    Mat3 R; Vec3 t;
+    epnp(Xw, pixels, N, K, R, t);
+    Mat4 w2c = mat4_identity();
+    for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) w2c(i,j) = R(i,j); w2c(i,3) = t[i]; }
+    return inv_rigid4(w2c);
+}
+
+Mat4 solve_pnp(const double * Xw, const double * pixels, int N, const Mat3 & K,
+               std::vector<char> & inliers, double thresh_px, int gn_iters) {
+    Mat3 R; Vec3 t;
+    epnp(Xw, pixels, N, K, R, t);
+    gauss_newton_pnp(Xw, pixels, N, K, R, t, gn_iters, thresh_px);
+
+    inliers.assign(N, 0);
+    const double fu = K(0,0), fv = K(1,1), cu = K(0,2), cv = K(1,2);
+    for (int i = 0; i < N; i++) {
+        Vec3 p = { Xw[3*i], Xw[3*i+1], Xw[3*i+2] };
+        Vec3 Xc = mat3_apply(R, p); Xc[0]+=t[0]; Xc[1]+=t[1]; Xc[2]+=t[2];
+        if (Xc[2] <= 1e-6) continue;
+        const double dx = fu*Xc[0]/Xc[2] + cu - pixels[2*i];
+        const double dy = fv*Xc[1]/Xc[2] + cv - pixels[2*i+1];
+        if (std::sqrt(dx*dx + dy*dy) < thresh_px) inliers[i] = 1;
+    }
+    Mat4 w2c = mat4_identity();
+    for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) w2c(i,j) = R(i,j); w2c(i,3) = t[i]; }
+    return inv_rigid4(w2c);
+}
+
 // ---- estimate_poses orchestration -----------------------------------------
 
 PoseResult estimate_poses(const std::vector<const float *> & points,
                           const std::vector<const float *> & opacities,
                           int H, int W, double opacity_threshold,
                           double focal, int pnp_iter, bool normalize, uint64_t seed) {
+    (void) pnp_iter; (void) seed;        // robust PnP is deterministic (EPnP+GN)
     const int Nv = (int) points.size();
     const int P = H * W;
     const double ppx = W / 2.0, ppy = H / 2.0;
@@ -547,7 +822,7 @@ PoseResult estimate_poses(const std::vector<const float *> & points,
             px.push_back(grid[2*i]); px.push_back(grid[2*i+1]);
         }
         std::vector<char> inl;
-        Mat4 c2w = solve_pnp_numpy(Xw.data(), px.data(), (int) (Xw.size()/3), K, inl, 5.0, pnp_iter, seed);
+        Mat4 c2w = solve_pnp(Xw.data(), px.data(), (int) (Xw.size()/3), K, inl, 5.0, 10);
         res.cam2world.push_back(c2w);
     }
 
diff --git a/src/pose.h b/src/pose.h
index e511e32..3da4ac2 100644
--- a/src/pose.h
+++ b/src/pose.h
@@ -70,13 +70,25 @@ LoopError loop_closure_error(const std::vector<Sim3> & links);
 // of the eigendecomposition; valid while rotation < 180deg. Mirrors sim_frac_power.
 Mat4 sim_frac_power(const Mat4 & M, double f);
 
-// ---- PnP (mirrors pose/pnp.py numpy backend) -------------------------------
-// RANSAC DLT PnP with known intrinsics K. Xw is N*3 world points, pixels is N*2
-// (col,row). Returns cam2world (4x4) and fills `inliers` (size N, 0/1).
+// ---- PnP -------------------------------------------------------------------
+// RANSAC DLT PnP with known intrinsics K (mirrors pose/pnp.py numpy backend).
+// Xw is N*3 world points, pixels is N*2 (col,row). Returns cam2world (4x4) and
+// fills `inliers` (size N, 0/1). Kept as the asset-free golden-test reference; on
+// real scenes the DLT inherits the planar/mirror degeneracy — use solve_pnp.
 Mat4 solve_pnp_numpy(const double * Xw, const double * pixels, int N, const Mat3 & K,
                      std::vector<char> & inliers,
                      double thresh_px = 5.0, int iters = 100, uint64_t seed = 0);
 
+// Robust PnP: EPnP (planar-robust, deterministic — uses all points, no random
+// minimal samples) for the initial pose, then a Gauss-Newton / Huber reprojection
+// refine. Reaches cv2/SQPNP-grade poses on real scenes with no OpenCV dependency.
+// Fills `inliers` (reprojection < thresh_px). This is what estimate_poses uses.
+Mat4 solve_pnp(const double * Xw, const double * pixels, int N, const Mat3 & K,
+               std::vector<char> & inliers, double thresh_px = 5.0, int gn_iters = 10);
+
+// EPnP-only (no refine) — exposed for testing the planar-robust initialization.
+Mat4 solve_pnp_epnp(const double * Xw, const double * pixels, int N, const Mat3 & K);
+
 // Integer pixel grid (col,row), row-major over H*W — matches upstream xy_grid
 // (no half-pixel offset). Fills `out` with 2*H*W doubles.
 void pixel_grid(int H, int W, std::vector<double> & out);
diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp
index 5341ae1..4a8f433 100644
--- a/tests/test_pose.cpp
+++ b/tests/test_pose.cpp
@@ -337,6 +337,95 @@ static void test_pnp_outliers() {
     check("outliers rejected", rejected > 0.85 * n_out);
 }
 
+// ===========================================================================
+// Robust PnP: EPnP + Gauss-Newton (the shipped real-data solver)
+// ===========================================================================
+
+static void test_pnp_robust_recovery() {
+    std::printf("test_pnp_robust_recovery\n");
+    const int N = 500; Mat3 K = make_K(400, 512, 512);
+    std::uniform_real_distribution<double> ux(-2,2), uy(-2,2), uz(4,8);
+    std::vector<double> Pw(3*N);
+    for (int i = 0; i < N; i++) { Pw[3*i]=ux(RNG); Pw[3*i+1]=uy(RNG); Pw[3*i+2]=uz(RNG); }
+    std::normal_distribution<double> g(0, 1);
+    std::uniform_real_distribution<double> ut(-0.6, 0.6);
+    double worst_R = 0, worst_t = 0;
+    for (int trial = 0; trial < 8; trial++) {
+        Mat3 R = rand_rotation();
+        for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) R(i,j) = (i==j?1.0:0.0)*0.8 + R(i,j)*0.2;
+        { fsla::Mat3 U,V; Vec3 s; fsla::svd3(R,U,s,V); R = mul(U,T(V)); if (fsla::det3(R)<0) for(int i=0;i<3;i++) R(i,2)=-R(i,2); }
+        Vec3 t = { ut(RNG), ut(RNG), ut(RNG) };
+        std::vector<double> px(2*N), z(N);
+        project(Pw.data(), N, R, t, K, px.data(), z.data());
+        bool behind = false; for (int i = 0; i < N; i++) if (z[i] <= 0) behind = true;
+        if (behind) continue;
+        std::vector<char> inl;
+        Mat4 c2w = solve_pnp(Pw.data(), px.data(), N, K, inl);
+        Mat4 w2c = fsla::inv_rigid4(c2w);
+        Mat3 Rr{}; Vec3 tr{};
+        for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) Rr(i,j)=w2c(i,j); tr[i]=w2c(i,3); }
+        worst_R = std::max(worst_R, (double) rot_angle(mul(Rr, T(R))));
+        worst_t = std::max(worst_t, std::sqrt((tr[0]-t[0])*(tr[0]-t[0])+(tr[1]-t[1])*(tr[1]-t[1])+(tr[2]-t[2])*(tr[2]-t[2])));
+    }
+    char buf[64]; std::snprintf(buf, sizeof buf, "worst %.2e deg", worst_R);
+    check("rotation recovered", worst_R < 1e-3, buf);
+    check("translation recovered", worst_t < 1e-5);
+}
+
+static void test_pnp_robust_planar() {
+    // Near-planar scene (a thin slab) — the config where the DLT's coplanar
+    // minimal samples flip. EPnP's all-point control basis stays well-posed.
+    std::printf("test_pnp_robust_planar\n");
+    const int N = 500; Mat3 K = make_K(400, 512, 512);
+    std::uniform_real_distribution<double> ux(-2,2), uy(-2,2);
+    std::normal_distribution<double> thin(0, 0.02);              // ~planar, slight thickness
+    std::vector<double> Pw(3*N);
+    for (int i = 0; i < N; i++) { Pw[3*i]=ux(RNG); Pw[3*i+1]=uy(RNG); Pw[3*i+2]=6.0 + thin(RNG); }
+    Mat3 R = rand_rotation();
+    for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) R(i,j) = (i==j?1.0:0.0)*0.85 + R(i,j)*0.15;
+    { fsla::Mat3 U,V; Vec3 s; fsla::svd3(R,U,s,V); R = mul(U,T(V)); if (fsla::det3(R)<0) for(int i=0;i<3;i++) R(i,2)=-R(i,2); }
+    Vec3 t = { 0.2, -0.1, 0.3 };
+    std::vector<double> px(2*N), z(N);
+    project(Pw.data(), N, R, t, K, px.data(), z.data());
+    std::vector<char> inl;
+    Mat4 w2c = fsla::inv_rigid4(solve_pnp(Pw.data(), px.data(), N, K, inl));
+    Mat3 Rr{}; Vec3 tr{};
+    for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) Rr(i,j)=w2c(i,j); tr[i]=w2c(i,3); }
+    char buf[64]; std::snprintf(buf, sizeof buf, "%.4f deg", (double) rot_angle(mul(Rr, T(R))));
+    check("planar rotation recovered", rot_angle(mul(Rr, T(R))) < 0.05, buf);
+    check("planar translation recovered", std::sqrt((tr[0]-t[0])*(tr[0]-t[0])+(tr[1]-t[1])*(tr[1]-t[1])+(tr[2]-t[2])*(tr[2]-t[2])) < 0.01);
+}
+
+static void test_pnp_robust_outliers() {
+    std::printf("test_pnp_robust_outliers\n");
+    const int N = 600; Mat3 K = make_K(400, 512, 512);
+    std::uniform_real_distribution<double> ux(-2,2), uy(-2,2), uz(4,8);
+    std::vector<double> Pw(3*N);
+    for (int i = 0; i < N; i++) { Pw[3*i]=ux(RNG); Pw[3*i+1]=uy(RNG); Pw[3*i+2]=uz(RNG); }
+    Mat3 R = rand_rotation();
+    for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) R(i,j) = (i==j?1.0:0.0)*0.85 + R(i,j)*0.15;
+    { fsla::Mat3 U,V; Vec3 s; fsla::svd3(R,U,s,V); R = mul(U,T(V)); if (fsla::det3(R)<0) for(int i=0;i<3;i++) R(i,2)=-R(i,2); }
+    Vec3 t = { 0.3, -0.2, 0.4 };
+    std::vector<double> px(2*N), z(N);
+    project(Pw.data(), N, R, t, K, px.data(), z.data());
+    std::vector<double> Pk, pk;
+    for (int i = 0; i < N; i++) if (z[i] > 0) { for(int c=0;c<3;c++) Pk.push_back(Pw[3*i+c]); pk.push_back(px[2*i]); pk.push_back(px[2*i+1]); }
+    const int M = (int) pk.size()/2;
+    const int n_out = (int)(0.15 * M);                          // 15% gross pixel corruption
+    std::vector<int> oidx(M); for (int i = 0; i < M; i++) oidx[i]=i;
+    std::shuffle(oidx.begin(), oidx.end(), RNG);
+    std::uniform_real_distribution<double> corrupt(-120, 120);
+    for (int k = 0; k < n_out; k++) { int i = oidx[k]; pk[2*i]+=corrupt(RNG); pk[2*i+1]+=corrupt(RNG); }
+    std::vector<char> inl;
+    Mat4 w2c = fsla::inv_rigid4(solve_pnp(Pk.data(), pk.data(), M, K, inl, 3.0));
+    Mat3 Rr{}; Vec3 tr{};
+    for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) Rr(i,j)=w2c(i,j); tr[i]=w2c(i,3); }
+    char buf[64]; std::snprintf(buf, sizeof buf, "%.3f deg", (double) rot_angle(mul(Rr, T(R))));
+    check("rotation under outliers", rot_angle(mul(Rr, T(R))) < 0.5, buf);
+    int rejected = 0; for (int k = 0; k < n_out; k++) if (!inl[oidx[k]]) rejected++;
+    check("outliers rejected", rejected > 0.85 * n_out);
+}
+
 int main() {
     test_similarity_roundtrip();
     test_scale_detection();
@@ -347,6 +436,9 @@ int main() {
     test_focal_recovery();
     test_pnp_recovery();
     test_pnp_outliers();
+    test_pnp_robust_recovery();
+    test_pnp_robust_planar();
+    test_pnp_robust_outliers();
     std::printf(failures ? "\ntest_pose: %d FAILURES\n" : "\ntest_pose: ok\n", failures);
     return failures ? 1 : 0;
 }

From 93393e8ecb1a9ed34b08783a0bf6c73cccc48eb3 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 15:45:21 +0100
Subject: [PATCH 10/33] pose/: C++ accumulation chaining (Accumulator) -- one
 world from a photo stream

Port accumulate.py to src/pose.{h,cpp}: the sliding-window accumulating-
reconstruction loop. Accumulator::add_pair takes each consecutive pair's
[2,H,W,gc] engine output, recovers the pair's cameras (estimate_poses), fits the
cross-run Sim(3) on the shared frame (fit_similarity_ransac), composes a global
chain, and drops every new frame's gaussians into one world. Exposes cloud(),
camera_path(), and per-link ChainLink diagnostics (scale/inlier/valid/resid).

Validation:
- Asset-free golden (test_accumulate_chain): synthetic pinhole clip with distinct
  per-run scales -> trajectory ATE 7.6e-8 of extent, per-link scale to 3e-9,
  fit residual ~1e-7. Green under ASan/UBSan.
- Real-data parity on the 13 cached pair_*.f32 dumps vs the numpy/cv2 prototype:
  cloud size bit-exact (2,633,725), per-link valid% identical (deterministic
  mask), per-link Sim(3) scale to mean 0.5% (11/12 links <1%), trajectory within
  6.6% of the cv2 chain. Residual = known RANSAC-RNG + EPnP-vs-SQPNP delta.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pose/README.md      |  15 +++-
 src/pose.cpp        | 200 ++++++++++++++++++++++++++++++++++++++++++++
 src/pose.h          |  81 ++++++++++++++++++
 tests/test_pose.cpp | 155 ++++++++++++++++++++++++++++++++++
 4 files changed, 449 insertions(+), 2 deletions(-)

diff --git a/pose/README.md b/pose/README.md
index 8bcf72c..866cb20 100644
--- a/pose/README.md
+++ b/pose/README.md
@@ -37,8 +37,19 @@ test tier (`tests/test_pose.cpp`, `ctest -LE model`):
   **cv2-parity on real data with no OpenCV dependency**, versus the numpy DLT's
   175° miss on the same dump. Golden tests cover exact recovery, a near-planar
   config (where DLT flips), and 15% gross-outlier rejection.
-- ⬜ Not yet ported: accumulation chaining, loop closure, consensus fusion (these
-  compose the primitives above), and the CLI / C-API surface.
+- ✅ **Accumulation chaining → `Accumulator` (accumulate.py)** — the live
+  sliding-window loop: feed each consecutive pair's `[2,H,W,gc]` engine output,
+  it recovers the pair's cameras (`estimate_poses`), fits the cross-run Sim(3) on
+  the shared frame, composes a global chain, and drops every new frame's gaussians
+  into one world (`cloud()`, `camera_path()`). Golden-tested (synthetic clip:
+  trajectory ATE 7.6e-8 of extent, per-link scale to 3e-9). **Real-data parity**
+  on the 13 cached `pair_*.f32` dumps vs the numpy/cv2 prototype: cloud size
+  **bit-exact** (2,633,725) and per-link valid% identical (deterministic mask);
+  per-link Sim(3) scale agrees to **mean 0.5%** (11/12 links <1%, worst on the
+  documented low-inlier leg); trajectory within **6.6%** of the cv2 chain — the
+  residual is the known RANSAC-RNG + EPnP-vs-SQPNP delta, not a port gap.
+- ⬜ Not yet ported: loop closure, consensus fusion (these compose the primitives
+  above), and the CLI / C-API surface.
 
 ## Why it's needed
 
diff --git a/src/pose.cpp b/src/pose.cpp
index fc39a23..01ea13e 100644
--- a/src/pose.cpp
+++ b/src/pose.cpp
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <cmath>
 #include <cstring>
+#include <unordered_map>
 
 namespace free_splatter {
 namespace pose {
@@ -836,5 +837,204 @@ PoseResult estimate_poses(const std::vector<const float *> & points,
     return res;
 }
 
+// ---- accumulation ---------------------------------------------------------
+
+namespace {
+inline float clamp01(float v) { return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v); }
+const double SH_C0 = 0.28209479177387814;   // SH degree-0 basis (DC -> rgb)
+} // namespace
+
+Accumulator::Accumulator(int H, int W, double opacity_threshold,
+                         double ransac_thresh_frac, int ransac_iters, uint64_t seed)
+    : H_(H), W_(W), thr_(opacity_threshold), rthr_(ransac_thresh_frac),
+      riters_(ransac_iters), seed_(seed), T_(sim_identity()), final_cam_(mat4_identity()) {}
+
+void Accumulator::add_view(const float * pts, const float * op, const float * rgb,
+                           const Sim3 & T, int frame) {
+    const int P = H_ * W_;
+    for (int i = 0; i < P; i++) {
+        if (op[i] <= thr_) continue;
+        Vec3 x = { pts[3*i], pts[3*i+1], pts[3*i+2] };
+        Vec3 w = sim_apply(T, x);
+        AccumPoint p;
+        p.x = (float) w[0]; p.y = (float) w[1]; p.z = (float) w[2];
+        p.r = rgb[3*i]; p.g = rgb[3*i+1]; p.b = rgb[3*i+2];
+        p.frame = frame;
+        cloud_.push_back(p);
+    }
+}
+
+ChainLink Accumulator::add_pair(const float * g, int gc, double focal) {
+    const int P = H_ * W_;
+    // de-interleave the two views: points (ch 0:3), opacity (ch 15, already
+    // sigmoid-activated), rgb = clip(SH_DC * C0 + 0.5, 0, 1) (ch 3:6).
+    std::vector<float> pts0(3*P), pts1(3*P), op0(P), op1(P), rgb0(3*P), rgb1(3*P);
+    for (int i = 0; i < P; i++) {
+        const float * a0 = g + (size_t) i * gc;
+        const float * a1 = g + (size_t) (P + i) * gc;
+        for (int c = 0; c < 3; c++) { pts0[3*i+c] = a0[c]; pts1[3*i+c] = a1[c]; }
+        op0[i] = a0[15]; op1[i] = a1[15];
+        for (int c = 0; c < 3; c++) {
+            rgb0[3*i+c] = clamp01((float) (a0[3+c] * SH_C0 + 0.5));
+            rgb1[3*i+c] = clamp01((float) (a1[3+c] * SH_C0 + 0.5));
+        }
+    }
+
+    // recover this pair's cameras (view-0 frame, use_first_focal)
+    std::vector<const float *> pts = { pts0.data(), pts1.data() };
+    std::vector<const float *> ops = { op0.data(),  op1.data()  };
+    PoseResult pr = estimate_poses(pts, ops, H_, W_, thr_, focal, 100, false, seed_);
+    const Mat4 c2w0 = pr.cam2world[0], c2w1 = pr.cam2world[1];
+
+    ChainLink link{};
+    if (!have_prev_) {
+        T_ = sim_identity();
+        link.sim = sim_identity();
+        link.global = T_;
+        link.scale = 1.0; link.inlier_frac = 1.0; link.valid_frac = 1.0; link.resid_frac = 0.0;
+        add_view(pts0.data(), op0.data(), rgb0.data(), T_, next_frame_++);   // f_0
+        add_view(pts1.data(), op1.data(), rgb1.data(), T_, next_frame_++);   // f_1
+    } else {
+        // fit Sim(3) from this run's view 0 (the shared frame, run-k coords) to the
+        // previous run's view 1 (same frame, run-(k-1) coords). Mask = valid in both.
+        std::vector<double> XA, XB;
+        for (int i = 0; i < P; i++) if (op0[i] > thr_ && prev_mask1_[i]) {
+            XA.push_back(pts0[3*i]); XA.push_back(pts0[3*i+1]); XA.push_back(pts0[3*i+2]);
+            XB.push_back(prev_pts1_[3*i]); XB.push_back(prev_pts1_[3*i+1]); XB.push_back(prev_pts1_[3*i+2]);
+        }
+        const int n = (int) (XA.size() / 3);
+        double myB[3]; mean3(XB.data(), n, myB);
+        std::vector<double> ycB(3*n);
+        for (int i = 0; i < n; i++) for (int c = 0; c < 3; c++) ycB[3*i+c] = XB[3*i+c] - myB[c];
+        const double scene = rms3(ycB.data(), n);
+
+        std::vector<char> inl;
+        Sim3 S = fit_similarity_ransac(XA.data(), XB.data(), n, rthr_ * scene, riters_, inl, true, seed_);
+        T_ = sim_compose(T_, S);                 // run k -> global
+
+        int n_inl = 0;
+        std::vector<double> rA, rB;
+        for (int i = 0; i < n; i++) if (inl[i]) {
+            n_inl++;
+            Vec3 a = { XA[3*i], XA[3*i+1], XA[3*i+2] };
+            Vec3 sa = sim_apply(S, a);
+            for (int c = 0; c < 3; c++) { rA.push_back(sa[c] - XB[3*i+c]); }
+            (void) rB;
+        }
+        link.sim = S;
+        link.global = T_;
+        link.scale = S.s;
+        link.inlier_frac = (n > 0) ? (double) n_inl / n : 0.0;
+        link.valid_frac  = (double) n / P;
+        link.resid_frac  = (scene > 0 && n_inl > 0) ? rms3(rA.data(), n_inl) / scene : 0.0;
+
+        add_view(pts1.data(), op1.data(), rgb1.data(), T_, next_frame_++);   // f_{k+1}
+    }
+
+    cams_.push_back(mat4_mul(sim_matrix(T_), c2w0));     // frame f_k camera (view 0)
+    final_cam_ = mat4_mul(sim_matrix(T_), c2w1);         // latest view-1 camera
+
+    // this run's view 1 becomes the next run's shared frame
+    prev_pts1_.assign(pts1.begin(), pts1.end());
+    prev_mask1_.assign(P, 0);
+    for (int i = 0; i < P; i++) prev_mask1_[i] = (op1[i] > thr_) ? 1 : 0;
+    have_prev_ = true;
+    links_.push_back(link);
+    return link;
+}
+
+std::vector<Mat4> Accumulator::camera_path() const {
+    std::vector<Mat4> path = cams_;
+    if (!cams_.empty()) path.push_back(final_cam_);
+    return path;
+}
+
+// ---- consensus fusion -----------------------------------------------------
+
+FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_frac, int k,
+                         std::vector<AccumPoint> & fused) {
+    fused.clear();
+    FuseStats st{};
+    st.raw_points = (int64_t) cloud.size();
+    if (cloud.empty()) return st;
+
+    // extent = rms distance to centroid; voxel = voxel_frac * extent; coords from min.
+    double mean[3] = {0,0,0}, lo[3] = {1e300,1e300,1e300};
+    for (const auto & p : cloud) {
+        mean[0]+=p.x; mean[1]+=p.y; mean[2]+=p.z;
+        lo[0]=std::min(lo[0],(double)p.x); lo[1]=std::min(lo[1],(double)p.y); lo[2]=std::min(lo[2],(double)p.z);
+    }
+    for (int c = 0; c < 3; c++) mean[c] /= cloud.size();
+    double ss = 0;
+    for (const auto & p : cloud) {
+        const double dx=p.x-mean[0], dy=p.y-mean[1], dz=p.z-mean[2];
+        ss += dx*dx + dy*dy + dz*dz;
+    }
+    const double ext = std::sqrt(ss / cloud.size());
+    const double v = voxel_frac * ext;
+    if (!(v > 0)) return st;
+
+    struct Vox { double sx=0,sy=0,sz=0,sr=0,sg=0,sb=0; int64_t cnt=0; std::vector<int32_t> frames; };
+    struct Key { int32_t i,j,k; bool operator==(const Key&o) const { return i==o.i&&j==o.j&&k==o.k; } };
+    struct KeyHash { size_t operator()(const Key&q) const {
+        uint64_t h = (uint64_t)(uint32_t)q.i * 0x9E3779B97F4A7C15ULL;
+        h ^= (uint64_t)(uint32_t)q.j + 0x9E3779B9U + (h<<6) + (h>>2);
+        h ^= (uint64_t)(uint32_t)q.k + 0x85EBCA6BU + (h<<6) + (h>>2);
+        return (size_t) h; } };
+    std::unordered_map<Key, Vox, KeyHash> grid;
+    grid.reserve(cloud.size() / 2 + 1);
+
+    for (const auto & p : cloud) {
+        Key key{ (int32_t) std::floor((p.x - lo[0]) / v),
+                 (int32_t) std::floor((p.y - lo[1]) / v),
+                 (int32_t) std::floor((p.z - lo[2]) / v) };
+        Vox & vx = grid[key];
+        vx.sx+=p.x; vx.sy+=p.y; vx.sz+=p.z; vx.sr+=p.r; vx.sg+=p.g; vx.sb+=p.b; vx.cnt++;
+        bool seen = false;
+        for (int32_t f : vx.frames) if (f == p.frame) { seen = true; break; }
+        if (!seen) vx.frames.push_back(p.frame);
+    }
+
+    st.voxels = (int64_t) grid.size();
+    for (const auto & kv : grid) {
+        const Vox & vx = kv.second;
+        if ((int) vx.frames.size() < k) continue;
+        st.kept_voxels++;
+        st.points_kept += vx.cnt;
+        AccumPoint p;
+        p.x = (float) (vx.sx / vx.cnt); p.y = (float) (vx.sy / vx.cnt); p.z = (float) (vx.sz / vx.cnt);
+        p.r = (float) (vx.sr / vx.cnt); p.g = (float) (vx.sg / vx.cnt); p.b = (float) (vx.sb / vx.cnt);
+        p.frame = (int32_t) vx.frames.size();   // support count (informational)
+        fused.push_back(p);
+    }
+    st.points_dropped = st.raw_points - st.points_kept;
+    return st;
+}
+
+// ---- loop closure ---------------------------------------------------------
+
+Mat4 sim4_invert(const Mat4 & M) {
+    Mat3 A{}; Vec3 t{};
+    for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) A(i,j) = M(i,j); t[i] = M(i,3); }
+    const double s = std::cbrt(det3(A));
+    Mat3 R = A;
+    for (int i = 0; i < 9; i++) R.a[i] /= s;
+    Sim3 inv = sim_invert(Sim3{ s, R, t });
+    return sim_matrix(inv);
+}
+
+std::vector<Vec3> distribute_drift(const Mat4 & D, const std::vector<Mat4> & global_poses) {
+    const int m = (int) global_poses.size();
+    std::vector<Vec3> out(m);
+    const int n = std::max(m - 1, 1);
+    for (int k = 0; k < m; k++) {
+        Mat4 Dk = sim_frac_power(D, (double) k / n);
+        Vec3 c = { global_poses[k](0,3), global_poses[k](1,3), global_poses[k](2,3) };
+        Vec3 r = mat3_apply(Mat3{ { Dk(0,0),Dk(0,1),Dk(0,2), Dk(1,0),Dk(1,1),Dk(1,2), Dk(2,0),Dk(2,1),Dk(2,2) } }, c);
+        out[k] = { r[0] + Dk(0,3), r[1] + Dk(1,3), r[2] + Dk(2,3) };
+    }
+    return out;
+}
+
 } // namespace pose
 } // namespace free_splatter
diff --git a/src/pose.h b/src/pose.h
index 3da4ac2..dbe1a23 100644
--- a/src/pose.h
+++ b/src/pose.h
@@ -110,6 +110,87 @@ PoseResult estimate_poses(const std::vector<const float *> & points,
                           double focal = -1.0, int pnp_iter = 100,
                           bool normalize = false, uint64_t seed = 0);
 
+// ---- accumulation: chain successive runs into one world (mirrors accumulate.py)
+// One accumulated point in the global frame, tagged with the source frame it came
+// from (consensus fusion needs the frame id). rgb is in [0,1].
+struct AccumPoint {
+    float x, y, z;
+    float r, g, b;
+    int32_t frame;
+};
+
+// Diagnostics for chaining one run into the global frame.
+struct ChainLink {
+    Sim3   sim;          // run k -> run k-1 (local similarity); identity for run 0
+    Sim3   global;       // run k -> global frame (T_k = T_{k-1} . sim)
+    double scale;        // sim.s — per-link scale drift
+    double inlier_frac;  // RANSAC inliers / shared-frame valid points
+    double valid_frac;   // pixels valid (opacity) in BOTH shared views
+    double resid_frac;   // inlier RANSAC residual / target scene extent
+};
+
+// Sliding-window accumulator: feed it each consecutive pair's engine output
+// ([2,H,W,gc], where view 0 shares a frame with the previous pair's view 1) and
+// it recovers each pair's cameras (estimate_poses), fits the cross-run Sim(3) on
+// the shared frame, composes a global chain, and drops every new frame's gaussians
+// into one world. This is the live accumulating-reconstruction loop, in C++.
+class Accumulator {
+public:
+    Accumulator(int H, int W, double opacity_threshold = 0.05,
+                double ransac_thresh_frac = 0.02, int ransac_iters = 300,
+                uint64_t seed = 0);
+
+    // gaussians: 2*H*W*gc floats (two views, engine channel layout). Recovers the
+    // pair's cameras, chains the Sim(3), accumulates the new frame(s). focal<=0 ->
+    // estimate per pair (use_first_focal). Returns the chain link just added.
+    ChainLink add_pair(const float * gaussians, int gaussian_channels, double focal = -1.0);
+
+    const std::vector<AccumPoint> & cloud() const { return cloud_; }
+    const std::vector<ChainLink>  & links() const { return links_; }
+    // count+1 global cam2world (similarity) matrices: frame f_k = T_k . c2w_k[view0],
+    // plus the final frame f_n = T_{n-1} . c2w_{n-1}[view1]. Mirrors accumulate.py's `rec`.
+    std::vector<Mat4> camera_path() const;
+    int  frame_count() const { return next_frame_; }
+    int  pair_count()  const { return (int) links_.size(); }
+    const Sim3 & global_transform() const { return T_; }
+
+private:
+    void add_view(const float * pts, const float * op, const float * rgb,
+                  const Sim3 & T, int frame);
+
+    int H_, W_;
+    double thr_, rthr_;
+    int riters_;
+    uint64_t seed_;
+    Sim3 T_;
+    bool have_prev_ = false;
+    std::vector<float> prev_pts1_;   // 3*P previous view-1 points (the shared frame)
+    std::vector<char>  prev_mask1_;  // P   previous view-1 opacity mask
+    std::vector<AccumPoint> cloud_;
+    std::vector<Mat4>       cams_;       // per-pair view-0 global cam2world
+    Mat4                    final_cam_;  // latest view-1 global cam2world (appended last)
+    std::vector<ChainLink>  links_;
+    int next_frame_ = 0;
+};
+
+// ---- consensus fusion (mirrors fuse.py) -----------------------------------
+struct FuseStats {
+    int64_t raw_points, voxels, kept_voxels, points_kept, points_dropped;
+};
+// Voxelize the cloud at voxel_frac * cloud-extent and keep only voxels corroborated
+// by >= k DISTINCT source frames, averaging the agreeing predictions (which also
+// denoises the surface). `fused` receives one averaged point per consensus voxel.
+FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_frac, int k,
+                         std::vector<AccumPoint> & fused);
+
+// ---- loop closure (mirrors loop_closure.py) -------------------------------
+// Inverse of a similarity 4x4 [[sR,t],[0,1]] (sR = s*rotation): [[(1/s)Rᵀ,...],[0,1]].
+Mat4 sim4_invert(const Mat4 & M);
+// Distribute an accumulated Sim(3) drift D over a chain of n+1 nodes by applying
+// D^(k/n) at node k (even relaxation, sim_frac_power). Returns the corrected camera
+// centers. global_poses are the open-loop global cam2world (similarity) matrices.
+std::vector<Vec3> distribute_drift(const Mat4 & D, const std::vector<Mat4> & global_poses);
+
 } // namespace pose
 } // namespace free_splatter
 
diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp
index 4a8f433..53e5ec9 100644
--- a/tests/test_pose.cpp
+++ b/tests/test_pose.cpp
@@ -426,6 +426,158 @@ static void test_pnp_robust_outliers() {
     check("outliers rejected", rejected > 0.85 * n_out);
 }
 
+// ===========================================================================
+// Accumulation / fusion / loop closure (the orchestration on top of the
+// primitives, mirrors accumulate.py / fuse.py / loop_closure.py)
+// ===========================================================================
+
+// Synthetic multi-frame clip with KNOWN geometry: a pinhole camera on a smooth
+// trajectory; each pair's gaussians are exact back-projections (engine channel
+// layout), and consecutive runs are rescaled by a distinct per-run factor (the
+// 1/baseline normalization that makes cross-run alignment a *similarity*). The
+// accumulator must recover the camera trajectory (up to a global Sim(3)) and the
+// per-link scale ratios.
+static Mat4 cam2world_frame(int i) {
+    const double ang = i * 4.0 * M_PI / 180.0;                 // small yaw per frame
+    const double c = std::cos(ang), s = std::sin(ang);
+    Mat4 M = fsla::mat4_identity();
+    M(0,0)=c; M(0,2)=s; M(2,0)=-s; M(2,2)=c;
+    M(0,3) = i * 0.4;                                          // translate along x
+    return M;
+}
+static double depth_at(int i, int r, int c) {
+    return 5.0 + 0.3*std::sin(0.7*i) + 0.05*r - 0.03*c
+         + 0.4 * (((r*7 + c*13 + i*5) % 11) / 11.0);           // non-coplanar spread
+}
+
+static void test_accumulate_chain() {
+    std::printf("test_accumulate_chain\n");
+    const int H = 16, W = 16, gc = 23, P = H*W, F = 5;        // 5 frames -> 4 pairs
+    const double f = 18.0;
+    Mat3 K = make_K(f, W, H), Kinv = fsla::inv3(K);
+    auto backproj = [&](int i, int r, int c) -> Vec3 {
+        const double d = depth_at(i, r, c);
+        Vec3 uv = { (double) c, (double) r, 1.0 };
+        Vec3 n = fsla::mat3_apply(Kinv, uv);
+        return { n[0]*d, n[1]*d, n[2]*d };
+    };
+
+    Accumulator acc(H, W, /*opacity_threshold=*/0.05);
+    std::vector<double> sigma(F);
+    for (int k = 0; k < F-1; k++) {                           // pair (k, k+1) -> run k
+        const double sk = 1.0 + 0.1*k;                        // distinct per-run scale
+        sigma[k] = sk;
+        Mat4 w2c_k = fsla::inv_rigid4(cam2world_frame(k));
+        Mat4 rel = fsla::mat4_mul(w2c_k, cam2world_frame(k+1));   // cam_k <- cam_{k+1}
+        std::vector<float> buf((size_t) 2 * P * gc, 0.0f);
+        for (int r = 0; r < H; r++) for (int c = 0; c < W; c++) {
+            const int i = r*W + c;
+            Vec3 X0 = backproj(k, r, c);                      // camera-k frame
+            Vec3 Xn = backproj(k+1, r, c);                    // camera-(k+1) frame
+            Vec3 X1 = apply_mat4(rel, Xn);                    // -> camera-k frame
+            float * v0 = &buf[(size_t) i * gc];
+            float * v1 = &buf[(size_t) (P + i) * gc];
+            for (int t = 0; t < 3; t++) { v0[t] = (float)(sk*X0[t]); v1[t] = (float)(sk*X1[t]); }
+            v0[15] = v1[15] = 0.9f;                           // opacity (activated)
+            for (int t = 0; t < 3; t++) { v0[3+t] = 0.1f*t; v1[3+t] = 0.1f*t; } // color
+        }
+        acc.add_pair(buf.data(), gc);
+    }
+
+    check("pair count", acc.pair_count() == F-1);
+    check("frame count", acc.frame_count() == F);
+    check("cloud non-empty", acc.cloud().size() == (size_t) F * P);   // all opacities valid
+
+    // per-link scale recovers sigma_{k-1}/sigma_k
+    bool scales_ok = true; double worst_scale = 0;
+    for (int k = 1; k < F-1; k++) {
+        const double expect = sigma[k-1]/sigma[k];
+        worst_scale = std::max(worst_scale, std::fabs(acc.links()[k].scale - expect));
+        scales_ok &= std::fabs(acc.links()[k].scale - expect) < 1e-6;
+    }
+    { char buf[64]; std::snprintf(buf, sizeof buf, "worst |Δs| %.2e", worst_scale);
+      check("per-link scale recovered", scales_ok, buf); }
+    double worst_resid = 0;
+    for (int k = 1; k < F-1; k++) worst_resid = std::max(worst_resid, acc.links()[k].resid_frac);
+    { char buf[64]; std::snprintf(buf, sizeof buf, "%.2e", worst_resid);
+      check("per-link residual ~0", worst_resid < 1e-6, buf); }
+
+    // recovered trajectory vs GT, after a global Sim(3) alignment (ATE)
+    std::vector<Mat4> path = acc.camera_path();
+    std::vector<double> rec(3*F), gt(3*F);
+    for (int i = 0; i < F; i++) {
+        rec[3*i]=path[i](0,3); rec[3*i+1]=path[i](1,3); rec[3*i+2]=path[i](2,3);
+        Mat4 g = cam2world_frame(i);
+        gt[3*i]=g(0,3); gt[3*i+1]=g(1,3); gt[3*i+2]=g(2,3);
+    }
+    Sim3 A = fit_similarity(rec.data(), gt.data(), F);
+    std::vector<double> rec_al(3*F);
+    apply_sim_arr(A.s, A.R, A.t, rec.data(), F, rec_al.data());
+    double mg[3]={0,0,0}; for (int i=0;i<F;i++) for(int c=0;c<3;c++) mg[c]+=gt[3*i+c];
+    for (int c=0;c<3;c++) mg[c]/=F;
+    double ext=0; for(int i=0;i<F;i++) for(int c=0;c<3;c++){double d=gt[3*i+c]-mg[c]; ext+=d*d;}
+    ext = std::sqrt(ext/F);
+    const double ate = rms3(rec_al.data(), gt.data(), F) / ext;
+    { char buf[64]; std::snprintf(buf, sizeof buf, "ATE %.2e of extent", ate);
+      check("trajectory recovered", ate < 1e-3, buf); }
+}
+
+static void test_consensus_fuse() {
+    std::printf("test_consensus_fuse\n");
+    std::vector<AccumPoint> cloud;
+    // 5 surface points, each corroborated by frames {0,1,2} (tight cluster -> one
+    // voxel, support 3); 20 isolated single-frame floaters far apart (support 1).
+    for (int loc = 0; loc < 5; loc++)
+        for (int fr = 0; fr < 3; fr++)
+            cloud.push_back({ (float)(2.0*loc) + 0.001f*fr, 0.0f, 0.0f, 0.5f, 0.5f, 0.5f, fr });
+    for (int j = 0; j < 20; j++)
+        cloud.push_back({ 40.0f + 2.0f*j, 0.0f, 0.0f, 0.2f, 0.2f, 0.2f, j % 4 });
+
+    std::vector<AccumPoint> fused;
+    FuseStats st = consensus_fuse(cloud, /*voxel_frac=*/0.02, /*k=*/2, fused);
+    check("raw points", st.raw_points == 35);
+    check("kept voxels (the surface)", st.kept_voxels == 5);
+    check("fused size", fused.size() == 5);
+    check("points kept", st.points_kept == 15);
+    check("floaters dropped", st.points_dropped == 20);
+    // a fused point sits at its cluster centroid (~2*loc), color preserved
+    bool centroid_ok = true;
+    for (const auto & p : fused) {
+        const double loc = std::round(p.x / 2.0);
+        centroid_ok &= std::fabs(p.x - 2.0*loc) < 0.01 && std::fabs(p.r - 0.5) < 1e-6;
+    }
+    check("fused centroid + color", centroid_ok);
+}
+
+static void test_loop_distribute() {
+    std::printf("test_loop_distribute\n");
+    Mat3 R = rand_rotation();
+    Mat4 D = sim_matrix({ 1.15, R, {0.4, -0.2, 0.1} });
+    // sim4_invert is a true inverse of a similarity 4x4
+    Mat4 S = sim_matrix({ 1.3, rand_rotation(), {1.0, -2.0, 0.5} });
+    Mat4 SI = fsla::mat4_mul(sim4_invert(S), S);
+    bool inv_ok = true; for (int i = 0; i < 16; i++) inv_ok &= std::fabs(SI.a[i] - fsla::mat4_identity().a[i]) < 1e-9;
+    check("sim4_invert is an inverse", inv_ok);
+
+    // a clean loop, drifted by D^{-k/n}; distribute_drift(D) must cancel it.
+    const int n = 8;
+    std::vector<Mat4> poses(n+1);
+    std::vector<Vec3> clean(n+1);
+    for (int k = 0; k <= n; k++) {
+        const double tt = 2*M_PI * k / n;
+        clean[k] = { std::cos(tt), std::sin(tt), 0.1*tt };
+        Vec3 drifted = apply_mat4(sim_frac_power(D, -(double)k/n), clean[k]);
+        Mat4 Pk = fsla::mat4_identity();
+        Pk(0,3)=drifted[0]; Pk(1,3)=drifted[1]; Pk(2,3)=drifted[2];
+        poses[k] = Pk;
+    }
+    std::vector<Vec3> corr = distribute_drift(D, poses);
+    double post = 0;
+    for (int k = 0; k <= n; k++) for (int c = 0; c < 3; c++) post += (corr[k][c]-clean[k][c])*(corr[k][c]-clean[k][c]);
+    char buf[64]; std::snprintf(buf, sizeof buf, "ATE %.1e", std::sqrt(post/(n+1)));
+    check("distribute_drift recovers clean loop", std::sqrt(post/(n+1)) < 1e-9, buf);
+}
+
 int main() {
     test_similarity_roundtrip();
     test_scale_detection();
@@ -439,6 +591,9 @@ int main() {
     test_pnp_robust_recovery();
     test_pnp_robust_planar();
     test_pnp_robust_outliers();
+    test_accumulate_chain();
+    test_consensus_fuse();
+    test_loop_distribute();
     std::printf(failures ? "\ntest_pose: %d FAILURES\n" : "\ntest_pose: ok\n", failures);
     return failures ? 1 : 0;
 }

From e3f316f7edb01641ff5b3d953b87c3c46237f588 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 15:49:23 +0100
Subject: [PATCH 11/33] pose/: validate C++ loop closure (sim4_invert +
 distribute_drift) vs Python

The loop-closure machinery (sim4_invert, distribute_drift, sim_frac_power) shipped
with the Accumulator commit; this records its parity validation.

- sim_frac_power C++ closed-form == numpy eig-based to 5e-10 across f in [-0.5,1.3]
  (so distribute_drift is bit-identical to the prototype's distribution).
- sim4_invert is an exact similarity inverse (1e-9); golden recovers a known
  drifted loop to 4e-16.
- Real-data parity on the loopcache (13 chain pairs + close_0_260): recovered
  drift matches the prototype's loop error (scale 1.09 vs 1.12, 4.6 vs 4.4 deg),
  deterministic valid% identical. The corrected-trajectory delta is the known
  EPnP-vs-cv2 PnP backend feeding D, not the distribution math.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pose/README.md | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/pose/README.md b/pose/README.md
index 866cb20..f1769a2 100644
--- a/pose/README.md
+++ b/pose/README.md
@@ -48,8 +48,21 @@ test tier (`tests/test_pose.cpp`, `ctest -LE model`):
   per-link Sim(3) scale agrees to **mean 0.5%** (11/12 links <1%, worst on the
   documented low-inlier leg); trajectory within **6.6%** of the cv2 chain — the
   residual is the known RANSAC-RNG + EPnP-vs-SQPNP delta, not a port gap.
-- ⬜ Not yet ported: loop closure, consensus fusion (these compose the primitives
-  above), and the CLI / C-API surface.
+- ✅ **Loop closure → `sim4_invert` + `distribute_drift` (loop_closure.py)** — the
+  even Sim(3) relaxation: from an accumulated drift `D` (the loop-closure
+  measurement `P_n_loop · P_last⁻¹`), distribute `D^(k/n)` over the chain.
+  `sim_frac_power` (the closed-form one-parameter subgroup) is **numerically
+  identical to numpy's eig-based version (5e-10)** across f ∈ {−0.5 … 1.3};
+  `sim4_invert` is an exact similarity inverse (1e-9); golden test recovers a
+  known drifted loop to 4e-16. **Real-data parity** on the cached loopcache (13
+  chain pairs + `close_0_260`): the recovered drift matches the prototype's loop
+  error (scale 1.09 vs 1.12, **4.6° vs 4.4°**, README "4.4°"), deterministic
+  valid% identical; the corrected-trajectory delta is the (already-characterized)
+  EPnP-vs-cv2 PnP backend feeding `D`, since the distribution math itself is
+  bit-identical to numpy. Confirms the prototype's finding: on this short clip the
+  chain already closes, so loop closure is a near-identity correction.
+- ⬜ Not yet ported: consensus fusion (composes the primitives above), and the
+  CLI / C-API surface.
 
 ## Why it's needed
 

From 3fdf84488343e9192e796548bdc6cbac7dcf5512 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 15:50:59 +0100
Subject: [PATCH 12/33] pose/: validate C++ consensus fusion (consensus_fuse)
 vs Python fuse.py

consensus_fuse shipped with the Accumulator commit (one hash-grid pass over the
frame-tagged cloud: >=K distinct-frame voxels kept, agreeing predictions
averaged). This records its parity validation.

- Golden (test_consensus_fuse): exact counts on controlled synthetic support.
- Real-data parity vs fuse.py on the 13 acc dumps (voxel 0.02, K>=2): raw points
  bit-exact (2,633,725), per-point floater drop 14.0% vs 14%, raw->fused
  reduction 93.9% vs 94%, kept-voxel fraction 53.8% vs 54%. Sub-1% voxel-count
  delta is the chaining RANSAC-RNG, not the fusion math. Reproduces the
  prototype's "remove the 14% single-frame edge-haze floaters" result.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pose/README.md | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/pose/README.md b/pose/README.md
index f1769a2..b5474ec 100644
--- a/pose/README.md
+++ b/pose/README.md
@@ -61,8 +61,17 @@ test tier (`tests/test_pose.cpp`, `ctest -LE model`):
   EPnP-vs-cv2 PnP backend feeding `D`, since the distribution math itself is
   bit-identical to numpy. Confirms the prototype's finding: on this short clip the
   chain already closes, so loop closure is a near-identity correction.
-- ⬜ Not yet ported: consensus fusion (composes the primitives above), and the
-  CLI / C-API surface.
+- ✅ **Consensus fusion → `consensus_fuse` (fuse.py)** — voxelize the accumulated
+  cloud at `voxel_frac · extent` and keep only voxels corroborated by ≥ K distinct
+  source frames, averaging the agreeing predictions (the per-point `frame` tag the
+  `Accumulator` carries makes this a single hash-grid pass). Golden test pins the
+  counts exactly on controlled synthetic support. **Real-data parity** vs `fuse.py`
+  on the 13 `acc` dumps (voxel 0.02, K≥2): raw points **bit-exact** (2,633,725),
+  per-point floater drop **14.0% vs 14%**, raw→fused reduction 93.9% vs 94%,
+  kept-voxel fraction 53.8% vs 54% — the sub-1% voxel-count delta is the chaining
+  RANSAC-RNG shifting positions at voxel boundaries, not the fusion math. The
+  prototype's "remove the 14% single-frame edge-haze floaters" result reproduces.
+- ⬜ Not yet ported: the CLI / C-API surface (then this prototype is deleted).
 
 ## Why it's needed
 

From 8529cd40e228321c0b876bbb4833a347484ff3d7 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 15:58:16 +0100
Subject: [PATCH 13/33] CLI + C-API: pose recovery + accumulation surface (no
 Python)

Expose the now-ported pose pipeline from the C API and the CLI, with no Python
runtime dependency (per CLAUDE.md):

C API (include/free_splatter.h):
- free_splatter_estimate_poses: recover per-view cam2world from an engine buffer.
- free_splatter_accumulator_{new,free,add_pair,frame_count,cloud,fuse,
  camera_path}: opaque sliding-window accumulator wrapping pose::Accumulator;
  add_pair takes each consecutive pair's [2,H,W,gc] engine output, returns the
  growing global cloud (free_splatter_point: xyz+rgb+frame), the consensus-fused
  cloud, and the global camera trajectory. FFI-friendly, malloc'd buffers freed
  with free_splatter_buf_free.

CLI (free_splatter-cli --accumulate):
- runs the engine over each consecutive image pair, chains the runs into one
  world, and writes PREFIX_<nframes>.splat after each pair (the evolving
  reconstruction) plus, with --fuse, a consensus-fused PREFIX_fused.splat. New
  write_cloud_splat emits the xyz+rgb cloud as small isotropic .splat gaussians.

Verified end-to-end on real frames (5 frames -> evolving 519K/779K/.. splats +
103K fused), sanitizer-clean under the debug preset; asset-free ctest tier green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 include/free_splatter.h     |  56 +++++++++++++++++
 src/free_splatter.cpp       | 119 ++++++++++++++++++++++++++++++++++++
 tools/free_splatter-cli.cpp | 108 +++++++++++++++++++++++++++++++-
 3 files changed, 281 insertions(+), 2 deletions(-)

diff --git a/include/free_splatter.h b/include/free_splatter.h
index 4707170..2913c94 100644
--- a/include/free_splatter.h
+++ b/include/free_splatter.h
@@ -77,6 +77,62 @@ FREE_SPLATTER_API int  free_splatter_run(free_splatter_ctx * ctx, const float *
                        int32_t height, int32_t width, float ** out, size_t * n_out);
 FREE_SPLATTER_API void free_splatter_buf_free(void * buf);  // NULL-safe
 
+// ---- downstream pose recovery + accumulation ----
+// The pose consumer of the engine seam: given the engine's [N,H,W,gaussian_channels]
+// output it recovers each view's camera (PnP) and stitches successive runs into one
+// accumulating world (cross-run Sim(3)). Dependency-free (no Python, no OpenCV).
+
+// Recover each view's camera from an engine output buffer. gaussians:
+// n_views*height*width*gaussian_channels float32 (the layout free_splatter_run
+// returns). cam2world_out: caller-owned n_views*16 float32, filled with a
+// row-major 4x4 cam2world per view. If focal_out is non-NULL it receives the
+// estimated shared focal. opacity_threshold gates valid pixels (engine output is
+// already activated). Returns 0 on success, -1 on failure.
+FREE_SPLATTER_API int free_splatter_estimate_poses(
+    const float * gaussians, int32_t n_views, int32_t height, int32_t width,
+    int32_t gaussian_channels, float opacity_threshold,
+    float * cam2world_out, float * focal_out);
+
+// One accumulated point in the global frame: position, color in [0,1], and the
+// source frame it came from (for consensus fusion).
+typedef struct {
+    float x, y, z;
+    float r, g, b;
+    int32_t frame;
+} free_splatter_point;
+
+// Sliding-window accumulator: feed each consecutive pair's engine output and it
+// chains the runs into one world. Opaque handle, not thread-safe.
+typedef struct free_splatter_accumulator free_splatter_accumulator;
+FREE_SPLATTER_API free_splatter_accumulator * free_splatter_accumulator_new(
+    int32_t height, int32_t width, float opacity_threshold);
+FREE_SPLATTER_API void free_splatter_accumulator_free(free_splatter_accumulator * acc); // NULL-safe
+
+// Add one pair's engine output: 2*height*width*gaussian_channels float32, where
+// view 0 shares a frame with the previous pair's view 1. Returns 0 on success.
+FREE_SPLATTER_API int free_splatter_accumulator_add_pair(
+    free_splatter_accumulator * acc, const float * gaussians, int32_t gaussian_channels);
+
+// Number of frames accumulated so far (== pairs_added + 1, or 0 before any pair).
+FREE_SPLATTER_API int free_splatter_accumulator_frame_count(const free_splatter_accumulator * acc);
+
+// Copy the current accumulated cloud: *out is malloc'd free_splatter_point[*n_out]
+// (free with free_splatter_buf_free). Returns 0 on success, -1 on failure.
+FREE_SPLATTER_API int free_splatter_accumulator_cloud(
+    const free_splatter_accumulator * acc, free_splatter_point ** out, size_t * n_out);
+
+// Consensus-fuse the current cloud: keep only voxels (size voxel_frac * extent)
+// corroborated by >= k distinct source frames, averaging the agreeing predictions.
+// *out malloc'd as above. Returns 0 on success, -1 on failure.
+FREE_SPLATTER_API int free_splatter_accumulator_fuse(
+    const free_splatter_accumulator * acc, float voxel_frac, int32_t k,
+    free_splatter_point ** out, size_t * n_out);
+
+// Copy the global camera trajectory: *out malloc'd (*n_frames)*16 float32, a
+// row-major 4x4 cam2world (similarity) per frame. Returns 0 on success.
+FREE_SPLATTER_API int free_splatter_accumulator_camera_path(
+    const free_splatter_accumulator * acc, float ** out, int32_t * n_frames);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/free_splatter.cpp b/src/free_splatter.cpp
index 3927622..53fd94f 100644
--- a/src/free_splatter.cpp
+++ b/src/free_splatter.cpp
@@ -3,12 +3,14 @@
 #include "image.h"
 #include "model.h"
 #include "options.h"
+#include "pose.h"
 
 #include <cstdlib>
 #include <cstring>
 #include <filesystem>
 #include <fstream>
 #include <string>
+#include <vector>
 
 // ---- opaque types ----------------------------------------------------------
 
@@ -156,3 +158,120 @@ int free_splatter_run(free_splatter_ctx * ctx, const float * images, int32_t n_v
 }
 
 void free_splatter_buf_free(void * buf) { free(buf); }
+
+// ---- downstream pose recovery + accumulation -------------------------------
+
+namespace {
+
+// De-interleave per-view contiguous points (3*P) + opacity (P) from a
+// [n_views,H,W,gc] engine buffer. Returns per-view base pointers into `store`.
+void deinterleave(const float * g, int nv, int P, int gc,
+                  std::vector<std::vector<float>> & pts,
+                  std::vector<std::vector<float>> & ops,
+                  std::vector<const float *> & pptr,
+                  std::vector<const float *> & optr) {
+    pts.assign(nv, {}); ops.assign(nv, {});
+    pptr.resize(nv); optr.resize(nv);
+    for (int v = 0; v < nv; v++) {
+        pts[v].resize((size_t) 3 * P); ops[v].resize(P);
+        for (int i = 0; i < P; i++) {
+            const float * a = &g[(size_t) (v * P + i) * gc];
+            pts[v][3*i] = a[0]; pts[v][3*i+1] = a[1]; pts[v][3*i+2] = a[2];
+            ops[v][i] = a[15];
+        }
+        pptr[v] = pts[v].data(); optr[v] = ops[v].data();
+    }
+}
+
+// Copy a pose vector's cloud into a malloc'd C array (ABI-stable layout).
+int emit_points(const std::vector<free_splatter::pose::AccumPoint> & src,
+                free_splatter_point ** out, size_t * n_out) {
+    *out = (free_splatter_point *) malloc(src.size() * sizeof(free_splatter_point));
+    if (!*out && !src.empty()) return -1;
+    for (size_t i = 0; i < src.size(); i++) {
+        (*out)[i].x = src[i].x; (*out)[i].y = src[i].y; (*out)[i].z = src[i].z;
+        (*out)[i].r = src[i].r; (*out)[i].g = src[i].g; (*out)[i].b = src[i].b;
+        (*out)[i].frame = src[i].frame;
+    }
+    *n_out = src.size();
+    return 0;
+}
+
+} // namespace
+
+struct free_splatter_accumulator {
+    free_splatter::pose::Accumulator acc;
+    int gc = 0;
+    free_splatter_accumulator(int H, int W, double thr) : acc(H, W, thr) {}
+};
+
+int free_splatter_estimate_poses(const float * gaussians, int32_t n_views, int32_t height,
+                                 int32_t width, int32_t gaussian_channels,
+                                 float opacity_threshold, float * cam2world_out, float * focal_out) {
+    if (!gaussians || !cam2world_out || n_views < 1 || height < 1 || width < 1 || gaussian_channels < 16)
+        return -1;
+    const int P = height * width;
+    std::vector<std::vector<float>> pts, ops;
+    std::vector<const float *> pptr, optr;
+    deinterleave(gaussians, n_views, P, gaussian_channels, pts, ops, pptr, optr);
+    free_splatter::pose::PoseResult pr =
+        free_splatter::pose::estimate_poses(pptr, optr, height, width, opacity_threshold);
+    for (int v = 0; v < n_views; v++)
+        for (int i = 0; i < 4; i++) for (int j = 0; j < 4; j++)
+            cam2world_out[v*16 + i*4 + j] = (float) pr.cam2world[v](i, j);
+    if (focal_out) *focal_out = (float) pr.focal;
+    return 0;
+}
+
+free_splatter_accumulator * free_splatter_accumulator_new(int32_t height, int32_t width,
+                                                          float opacity_threshold) {
+    if (height < 1 || width < 1) return nullptr;
+    return new (std::nothrow) free_splatter_accumulator(height, width, opacity_threshold);
+}
+
+void free_splatter_accumulator_free(free_splatter_accumulator * acc) { delete acc; }
+
+int free_splatter_accumulator_add_pair(free_splatter_accumulator * acc, const float * gaussians,
+                                       int32_t gaussian_channels) {
+    if (!acc || !gaussians || gaussian_channels < 16) return -1;
+    acc->gc = gaussian_channels;
+    acc->acc.add_pair(gaussians, gaussian_channels);
+    return 0;
+}
+
+int free_splatter_accumulator_frame_count(const free_splatter_accumulator * acc) {
+    return acc ? acc->acc.frame_count() : 0;
+}
+
+int free_splatter_accumulator_cloud(const free_splatter_accumulator * acc,
+                                    free_splatter_point ** out, size_t * n_out) {
+    if (out) *out = nullptr;
+    if (n_out) *n_out = 0;
+    if (!acc || !out || !n_out) return -1;
+    return emit_points(acc->acc.cloud(), out, n_out);
+}
+
+int free_splatter_accumulator_fuse(const free_splatter_accumulator * acc, float voxel_frac,
+                                   int32_t k, free_splatter_point ** out, size_t * n_out) {
+    if (out) *out = nullptr;
+    if (n_out) *n_out = 0;
+    if (!acc || !out || !n_out || k < 1 || !(voxel_frac > 0)) return -1;
+    std::vector<free_splatter::pose::AccumPoint> fused;
+    free_splatter::pose::consensus_fuse(acc->acc.cloud(), voxel_frac, k, fused);
+    return emit_points(fused, out, n_out);
+}
+
+int free_splatter_accumulator_camera_path(const free_splatter_accumulator * acc,
+                                          float ** out, int32_t * n_frames) {
+    if (out) *out = nullptr;
+    if (n_frames) *n_frames = 0;
+    if (!acc || !out || !n_frames) return -1;
+    std::vector<fsla::Mat4> path = acc->acc.camera_path();
+    *out = (float *) malloc(path.size() * 16 * sizeof(float));
+    if (!*out && !path.empty()) return -1;
+    for (size_t f = 0; f < path.size(); f++)
+        for (int i = 0; i < 4; i++) for (int j = 0; j < 4; j++)
+            (*out)[f*16 + i*4 + j] = (float) path[f](i, j);
+    *n_frames = (int32_t) path.size();
+    return 0;
+}
diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp
index 495f15a..683db95 100644
--- a/tools/free_splatter-cli.cpp
+++ b/tools/free_splatter-cli.cpp
@@ -93,19 +93,66 @@ static bool write_splat(const float * g, size_t n, int gc, float opac_thr,
     return true;
 }
 
+// Write an accumulated point cloud (xyz + rgb, no per-point covariance) as an
+// antimatter15 .splat, each point a small isotropic gaussian. Caps to max_splats
+// by stride-subsampling. scale_frac sets the splat radius as a fraction of the
+// cloud's rms extent.
+static bool write_cloud_splat(const free_splatter_point * pts, size_t n, size_t max_splats,
+                              float scale_frac, const char * path) {
+    // rms extent about the centroid -> per-point isotropic scale
+    double m[3] = {0,0,0};
+    for (size_t i = 0; i < n; i++) { m[0]+=pts[i].x; m[1]+=pts[i].y; m[2]+=pts[i].z; }
+    if (n) { m[0]/=n; m[1]/=n; m[2]/=n; }
+    double ss = 0;
+    for (size_t i = 0; i < n; i++) { const double dx=pts[i].x-m[0], dy=pts[i].y-m[1], dz=pts[i].z-m[2]; ss += dx*dx+dy*dy+dz*dz; }
+    const float ext = (float) std::sqrt(ss / (n ? n : 1));
+    const float scale = std::max(scale_frac * ext, 1e-6f);
+
+    const size_t stride = (max_splats > 0 && n > max_splats) ? (n + max_splats - 1) / max_splats : 1;
+    std::ofstream f(path, std::ios::binary);
+    if (!f) { std::fprintf(stderr, "cannot write %s\n", path); return false; }
+    auto u8 = [](float v) -> unsigned char { float t = v < 0 ? 0 : v > 255 ? 255 : v; return (unsigned char) t; };
+    size_t written = 0;
+    for (size_t i = 0; i < n; i += stride) {
+        // OpenCV (y down, z forward) -> viewer OpenGL (y up): negate y,z.
+        float pos[3]   = { pts[i].x, -pts[i].y, -pts[i].z };
+        float sc[3]    = { scale, scale, scale };
+        unsigned char rgba[4] = { u8(pts[i].r*255.0f), u8(pts[i].g*255.0f), u8(pts[i].b*255.0f), 255 };
+        unsigned char rot[4]  = { 255, 128, 128, 128 };   // identity quaternion (w,x,y,z)
+        f.write((const char *) pos, 12);
+        f.write((const char *) sc, 12);
+        f.write((const char *) rgba, 4);
+        f.write((const char *) rot, 4);
+        written++;
+    }
+    std::printf("wrote %s: %zu splats (of %zu cloud points)\n", path, written, n);
+    return true;
+}
+
 static int usage(const char * a0) {
     std::fprintf(stderr,
         "usage: %s [--device DEV] [--splat OUT.splat] [--out OUT.f32]\n"
         "          [--opacity-threshold T] [--max-splats N] [--dump-taps DIR]\n"
-        "          MODEL.gguf (IMAGES... | INPUT.f32)\n", a0);
+        "          MODEL.gguf (IMAGES... | INPUT.f32)\n"
+        "\n"
+        "accumulate mode (>=2 images -> one world from the photo stream):\n"
+        "  %s --accumulate [--splat-prefix P] [--fuse] [--voxel V] [--fuse-k K]\n"
+        "     [--splat-scale S] [--max-splats N] MODEL.gguf IMG0 IMG1 IMG2 ...\n"
+        "  runs the engine on each consecutive pair, chains the runs (Sim(3)) into\n"
+        "  one accumulating cloud; writes P_<nframes>.splat after each pair (the\n"
+        "  evolving reconstruction) and, with --fuse, a consensus-fused P_fused.splat.\n",
+        a0, a0);
     return 2;
 }
 
 int main(int argc, char ** argv) {
     const char * device = nullptr, * taps_dir = nullptr, * out_path = nullptr, * splat_path = nullptr;
-    const char * model = nullptr;
+    const char * model = nullptr, * splat_prefix = nullptr;
     float opac_thr = 5e-3f;
     long  max_splats = 0;
+    bool  accumulate = false, fuse = false;
+    float voxel = 0.02f, splat_scale = 0.0015f;
+    int   fuse_k = 2;
     std::vector<std::string> inputs;
 
     for (int i = 1; i < argc; i++) {
@@ -116,6 +163,12 @@ int main(int argc, char ** argv) {
         else if (a == "--splat" && i+1 < argc)          splat_path = argv[++i];
         else if (a == "--opacity-threshold" && i+1<argc) opac_thr = (float) atof(argv[++i]);
         else if (a == "--max-splats" && i+1 < argc)     max_splats = atol(argv[++i]);
+        else if (a == "--accumulate")                   accumulate = true;
+        else if (a == "--splat-prefix" && i+1 < argc)   splat_prefix = argv[++i];
+        else if (a == "--fuse")                         fuse = true;
+        else if (a == "--voxel" && i+1 < argc)          voxel = (float) atof(argv[++i]);
+        else if (a == "--fuse-k" && i+1 < argc)         fuse_k = atoi(argv[++i]);
+        else if (a == "--splat-scale" && i+1 < argc)    splat_scale = (float) atof(argv[++i]);
         else if (a == "-h" || a == "--help")            return usage(argv[0]);
         else if (!model)                                model = argv[i];
         else                                            inputs.push_back(argv[i]);
@@ -138,6 +191,57 @@ int main(int argc, char ** argv) {
                 geo.image_width, geo.image_height, geo.in_channels, geo.gaussian_channels);
     if (inputs.empty()) { free_splatter_free(ctx); return 0; }
 
+    // ---- accumulate mode: chain consecutive image pairs into one world --------
+    if (accumulate) {
+        if (inputs.size() < 2) { std::fprintf(stderr, "--accumulate needs >=2 images\n"); free_splatter_free(ctx); return 2; }
+        const int sz = geo.image_width;
+        // decode every frame once (CHW per view)
+        std::vector<std::vector<float>> frames;
+        for (const std::string & p : inputs) {
+            std::vector<float> img;
+            if (!load_image_chw(p.c_str(), sz, img)) { free_splatter_free(ctx); return 1; }
+            frames.push_back(std::move(img));
+        }
+        free_splatter_accumulator * acc = free_splatter_accumulator_new(sz, sz, opac_thr);
+        if (!acc) { std::fprintf(stderr, "accumulator alloc failed\n"); free_splatter_free(ctx); return 1; }
+
+        const int64_t per_view = (int64_t) geo.in_channels * sz * sz;
+        for (size_t k = 0; k + 1 < frames.size(); k++) {
+            std::vector<float> pair(2 * per_view);
+            std::memcpy(&pair[0],        frames[k].data(),   per_view * sizeof(float));
+            std::memcpy(&pair[per_view], frames[k+1].data(), per_view * sizeof(float));
+            float * g = nullptr; size_t ng = 0;
+            if (free_splatter_run(ctx, pair.data(), 2, sz, sz, &g, &ng) != 0) {
+                std::fprintf(stderr, "run pair %zu failed: %s\n", k, free_splatter_last_error(ctx));
+                free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1;
+            }
+            free_splatter_accumulator_add_pair(acc, g, geo.gaussian_channels);
+            free_splatter_buf_free(g);
+            const int nframes = free_splatter_accumulator_frame_count(acc);
+            std::printf("pair %zu (%s, %s) -> %d frames\n", k,
+                        inputs[k].c_str(), inputs[k+1].c_str(), nframes);
+            if (splat_prefix) {
+                free_splatter_point * cloud = nullptr; size_t nc = 0;
+                free_splatter_accumulator_cloud(acc, &cloud, &nc);
+                char path[1024];
+                std::snprintf(path, sizeof path, "%s_%d.splat", splat_prefix, nframes);
+                write_cloud_splat(cloud, nc, (size_t) max_splats, splat_scale, path);
+                free_splatter_buf_free(cloud);
+            }
+        }
+        if (fuse && splat_prefix) {
+            free_splatter_point * fc = nullptr; size_t nf = 0;
+            free_splatter_accumulator_fuse(acc, voxel, fuse_k, &fc, &nf);
+            char path[1024];
+            std::snprintf(path, sizeof path, "%s_fused.splat", splat_prefix);
+            write_cloud_splat(fc, nf, (size_t) max_splats, splat_scale, path);
+            free_splatter_buf_free(fc);
+        }
+        free_splatter_accumulator_free(acc);
+        free_splatter_free(ctx);
+        return 0;
+    }
+
     // assemble input: one raw .f32, or several decoded images
     std::vector<float> buf;
     int32_t n_views = 0;

From 3408afc1588ce7e8f1eff63b7a6e167e04052045 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 16:00:56 +0100
Subject: [PATCH 14/33] pose/: delete the Python prototype -- fully ported to
 C++ and shipped

Per CLAUDE.md: the pose/ research prototype (focal/align/pnp + accumulate/loop/
fuse + the re10k/T&T validation harnesses) proved the accumulating-reconstruction
approach. That whole pipeline is now rewritten in C++ (src/pose.{h,cpp}), exposed
via free_splatter-cli + include/free_splatter.h with no Python, and validated
(asset-free golden tests + real-data parity recorded in the prior commits). The
prototype was a throwaway, not a parallel implementation to maintain -- so it is
removed. Git history preserves it and its layer-by-layer parity harnesses.

Updates the dangling references (CLAUDE.md, src/linalg.h, src/pose.{h,cpp}) to
point at git history instead of the deleted files.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md                     |  30 ++--
 pose/README.md                | 226 -----------------------------
 pose/_upstream.py             | 107 --------------
 pose/accumulate.py            | 157 ---------------------
 pose/align.py                 | 234 ------------------------------
 pose/check_cv2_parity.py      |  73 ----------
 pose/check_upstream_parity.py | 154 --------------------
 pose/empirical.py             |  83 -----------
 pose/find_loop.py             |  70 ---------
 pose/focal.py                 |  48 -------
 pose/fuse.py                  | 134 ------------------
 pose/loop_closure.py          | 150 --------------------
 pose/pnp.py                   | 245 --------------------------------
 pose/re10k_control.py         | 105 --------------
 pose/re10k_crossrun.py        |  69 ---------
 pose/re10k_experiment.py      |  76 ----------
 pose/re10k_fetch.py           |  84 -----------
 pose/render_ply.py            |  54 -------
 pose/test_pose.py             | 258 ----------------------------------
 pose/tt_control.py            | 124 ----------------
 pose/tt_experiment.py         |  79 -----------
 src/linalg.h                  |   5 +-
 src/pose.cpp                  |  14 +-
 src/pose.h                    |  10 +-
 24 files changed, 34 insertions(+), 2555 deletions(-)
 delete mode 100644 pose/README.md
 delete mode 100644 pose/_upstream.py
 delete mode 100644 pose/accumulate.py
 delete mode 100644 pose/align.py
 delete mode 100644 pose/check_cv2_parity.py
 delete mode 100644 pose/check_upstream_parity.py
 delete mode 100644 pose/empirical.py
 delete mode 100644 pose/find_loop.py
 delete mode 100644 pose/focal.py
 delete mode 100644 pose/fuse.py
 delete mode 100644 pose/loop_closure.py
 delete mode 100644 pose/pnp.py
 delete mode 100644 pose/re10k_control.py
 delete mode 100644 pose/re10k_crossrun.py
 delete mode 100644 pose/re10k_experiment.py
 delete mode 100644 pose/re10k_fetch.py
 delete mode 100644 pose/render_ply.py
 delete mode 100644 pose/test_pose.py
 delete mode 100644 pose/tt_control.py
 delete mode 100644 pose/tt_experiment.py

diff --git a/CLAUDE.md b/CLAUDE.md
index d4e3a71..15bc41a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -33,11 +33,14 @@ across all three variants, so object/object-2dgs are cheap follow-ons.
   1. **Dev-time reference / conversion / validation** that runs in
      `docker/Dockerfile.cuda` (`scripts/hf_dump.py`, `convert.py`,
      `compare_taps.py`, …) — the only place torch runs; never a runtime dependency.
-  2. **The `pose/` research prototype, TEMPORARILY.** It may continue in Python
-     (numpy + cv2) **only until the accumulating-reconstruction approach is
-     proven**. Once proven it is **rewritten in C++**, exposed via the CLI + C API,
-     and **the Python is deleted.** `pose/` is not wired into CMake/ctest and is a
-     throwaway prototype, not a parallel implementation to maintain.
+  2. **The `pose/` research prototype — now DONE and DELETED.** It was the
+     temporary Python (numpy + cv2) prototype that proved the
+     accumulating-reconstruction approach. That approach is proven, the whole
+     pipeline (focal, Sim(3) align, robust PnP, accumulation, loop closure,
+     consensus fusion) is **rewritten in C++** (`src/pose.{h,cpp}`), exposed via
+     `free_splatter-cli` + `include/free_splatter.h`, and the Python prototype has
+     been **removed** (see git history for it and its layer-by-layer parity
+     harnesses). No Python remains in the pose path.
 
 ## Validation is the backbone (non-negotiable)
 
@@ -71,15 +74,16 @@ This is a numerical port. Correctness means matching the PyTorch reference
 
 ## Per-component discipline
 
-Each component (`image`, `gguf_loader`, `backend`, `model`, the head, and — once
-ported to C++ — **`pose` = PnP + focal + Sim(3) alignment**) has its own unit test
-and is made independently green **before** cross-component parity. Component
-boundaries match the file layout. Keep the seam at the
+Each component (`image`, `gguf_loader`, `backend`, `model`, the head, and
+**`pose` = PnP + focal + Sim(3) alignment + accumulation/loop/fusion**) has its own
+unit test and is made independently green **before** cross-component parity.
+Component boundaries match the file layout. Keep the seam at the
 `[N,H,W,gaussian_channels]` tensor clean — that is the contract between the engine
-and the pose/rendering consumers. The C++ `pose` component inherits the parity
-discipline the Python prototype already established: **bit-exact to upstream
-`estimate_poses`** (see `pose/check_upstream_parity.py`) and **validated against
-independent ground-truth poses** (`pose/re10k_experiment.py`).
+and the pose/rendering consumers. The C++ `pose` component (`tests/test_pose.cpp`,
+asset-free golden tier) carries the parity discipline the Python prototype
+established: **bit-exact to upstream `estimate_poses`** and **validated against
+independent ground-truth poses** — see git history for the prototype's
+`check_upstream_parity.py` / `re10k_experiment.py` harnesses.
 
 ## Debugging philosophy (mandatory sequence)
 
diff --git a/pose/README.md b/pose/README.md
deleted file mode 100644
index b5474ec..0000000
--- a/pose/README.md
+++ /dev/null
@@ -1,226 +0,0 @@
-# pose/ — camera recovery + cross-run alignment (downstream prototype)
-
-**Status: TEMPORARY Python prototype — to be rewritten in C++ once proven, then
-deleted.**
-
-PnP pose recovery and cross-run alignment are now **in scope** (see CLAUDE.md).
-The shipped implementation will be **C++**, reachable from `free_splatter-cli` and
-the C API with **no Python dependency**. This directory is the throwaway research
-prototype that proves the accumulating-reconstruction approach first: a
-self-contained numpy + cv2 consumer of the engine's `[N, H, W, 23]` output,
-**not wired into the CMake build or `ctest`**. Once the approach is proven it is
-ported to C++ and this Python is removed — it is not a parallel implementation to
-maintain.
-
-## C++ port status (the ship target — `src/pose.{h,cpp}`)
-
-The port has begun, dependency-free (only the self-contained `src/linalg.h`
-Jacobi eigensolver — no Eigen/OpenCV), wired into the library and the asset-free
-test tier (`tests/test_pose.cpp`, `ctest -LE model`):
-
-- ✅ **`focal.py` → `estimate_focal`** — Weiszfeld; **bit-exact** to the numpy
-  reference on a real scene dump (596.408591886 both).
-- ✅ **`align.py` → Umeyama / RANSAC / residual-ladder / Sim(3) chaining /
-  `sim_frac_power`** — `sim_frac_power` is a closed-form Sim(3) one-parameter
-  subgroup (`A^f=s^f R^f`, translation `(A^f−I)(A−I)⁻¹t`), no complex eig needed.
-  All 9 golden tests (the mirror of `test_pose.py`) pass under ASan/UBSan.
-- ✅ **Robust PnP → `solve_pnp` (EPnP + Gauss-Newton)** — `solve_pnp_numpy` (DLT +
-  RANSAC, via the 12×12 `AᵀA` nullspace) is kept as the asset-free golden-test
-  reference, but on real scenes the DLT inherits the textbook **planar/mirror
-  degeneracy** (seed-dependent: ~3/5 RANSAC seeds near cv2, ~2/5 a ~135–152°
-  flip). The shipped real-data solver is `solve_pnp`: **EPnP** (planar-robust,
-  non-iterative, uses all points so there is no random minimal-sample flip —
-  reuses the 12×12 `MᵀM` Jacobi eigendecomposition) for the init, then a
-  **Huber-robust Gauss-Newton** reprojection refine. On the real scene dump it is
-  **deterministic** (identical across all RANSAC seeds) and lands at **0.73° of
-  the upstream cv2/SQPNP** relative rotation (0.74° translation direction) — i.e.
-  **cv2-parity on real data with no OpenCV dependency**, versus the numpy DLT's
-  175° miss on the same dump. Golden tests cover exact recovery, a near-planar
-  config (where DLT flips), and 15% gross-outlier rejection.
-- ✅ **Accumulation chaining → `Accumulator` (accumulate.py)** — the live
-  sliding-window loop: feed each consecutive pair's `[2,H,W,gc]` engine output,
-  it recovers the pair's cameras (`estimate_poses`), fits the cross-run Sim(3) on
-  the shared frame, composes a global chain, and drops every new frame's gaussians
-  into one world (`cloud()`, `camera_path()`). Golden-tested (synthetic clip:
-  trajectory ATE 7.6e-8 of extent, per-link scale to 3e-9). **Real-data parity**
-  on the 13 cached `pair_*.f32` dumps vs the numpy/cv2 prototype: cloud size
-  **bit-exact** (2,633,725) and per-link valid% identical (deterministic mask);
-  per-link Sim(3) scale agrees to **mean 0.5%** (11/12 links <1%, worst on the
-  documented low-inlier leg); trajectory within **6.6%** of the cv2 chain — the
-  residual is the known RANSAC-RNG + EPnP-vs-SQPNP delta, not a port gap.
-- ✅ **Loop closure → `sim4_invert` + `distribute_drift` (loop_closure.py)** — the
-  even Sim(3) relaxation: from an accumulated drift `D` (the loop-closure
-  measurement `P_n_loop · P_last⁻¹`), distribute `D^(k/n)` over the chain.
-  `sim_frac_power` (the closed-form one-parameter subgroup) is **numerically
-  identical to numpy's eig-based version (5e-10)** across f ∈ {−0.5 … 1.3};
-  `sim4_invert` is an exact similarity inverse (1e-9); golden test recovers a
-  known drifted loop to 4e-16. **Real-data parity** on the cached loopcache (13
-  chain pairs + `close_0_260`): the recovered drift matches the prototype's loop
-  error (scale 1.09 vs 1.12, **4.6° vs 4.4°**, README "4.4°"), deterministic
-  valid% identical; the corrected-trajectory delta is the (already-characterized)
-  EPnP-vs-cv2 PnP backend feeding `D`, since the distribution math itself is
-  bit-identical to numpy. Confirms the prototype's finding: on this short clip the
-  chain already closes, so loop closure is a near-identity correction.
-- ✅ **Consensus fusion → `consensus_fuse` (fuse.py)** — voxelize the accumulated
-  cloud at `voxel_frac · extent` and keep only voxels corroborated by ≥ K distinct
-  source frames, averaging the agreeing predictions (the per-point `frame` tag the
-  `Accumulator` carries makes this a single hash-grid pass). Golden test pins the
-  counts exactly on controlled synthetic support. **Real-data parity** vs `fuse.py`
-  on the 13 `acc` dumps (voxel 0.02, K≥2): raw points **bit-exact** (2,633,725),
-  per-point floater drop **14.0% vs 14%**, raw→fused reduction 93.9% vs 94%,
-  kept-voxel fraction 53.8% vs 54% — the sub-1% voxel-count delta is the chaining
-  RANSAC-RNG shifting positions at voxel boundaries, not the fusion math. The
-  prototype's "remove the 14% single-frame edge-haze floaters" result reproduces.
-- ⬜ Not yet ported: the CLI / C-API surface (then this prototype is deleted).
-
-## Why it's needed
-
-Each `free_splatter_run` over a different photo pair produces gaussians in its
-**own** coordinate system: positions are predicted in the first view's camera
-frame, then the whole scene is rescaled by `s = 1/mean(‖point‖)` (see upstream
-`estimate_poses`). To stitch successive splats into one world you need:
-
-1. **PnP** — recover each view's camera from the per-pixel 2D↔3D correspondences
-   the gaussians hand you for free.
-2. **Cross-run alignment** — because two runs normalize by a *different* `1/d`,
-   the shared content comes out at a different **scale**. Aligning run B into run
-   A is therefore a **similarity** (rotation + translation + one uniform scale,
-   7-DoF), not a rigid transform.
-
-## Upstream recipe being mirrored (TencentARC/FreeSplatter)
-
-`freesplatter/models/model.py::estimate_poses` → DUSt3R-derived `fast_pnp`
-(`utils/recon_util.py`):
-
-Verified against the cached upstream source (`.cache/upstream/{model,runner}.py`)
-and the fetched `recon_util.py`, the **scene** recipe (the live use case) is:
-
-| step | upstream (scene path) | here |
-|---|---|---|
-| correspondences | pixel ↔ gaussian center `X` (ch 0:3) | `pnp.pixel_grid` (integer px) + `points` |
-| validity mask (PnP) | `sigmoid(opacity) > 0.05` | `estimate_poses(opacity_threshold=)` (engine output is already activated) |
-| focal | Weiszfeld, **view 0 only** (`use_first_focal=True`), **all pixels** (`masks=None`) | `estimate_poses(use_first_focal=True)` → `focal.estimate_focal` |
-| pose | `cv2.solvePnPRansac(reprojErr=5, SOLVEPNP_SQPNP, iters=10)`, `cam2world=inv(world2cam)` | `pnp.solve_pnp_cv2` (exact) / `solve_pnp_numpy` (reference) |
-| anchor | none — view 0 is ~identity by construction (gaussians live in view 0's frame) | `estimate_poses` returns raw poses |
-| scene scale | runner rescales **camera centers** so the view0→view1 **baseline = 1**: `s = 1/(‖t₁−t₀‖+1e-2)` | `pnp.rescale_to_baseline` / `estimate_poses(normalize=True)` |
-
-> An earlier draft had three of these wrong (focal averaged over all views, an
-> `inv(c2w[0])` re-anchor, and a `1/mean‖pts‖` scene scale). Driving the *actual*
-> upstream `estimate_poses` on real engine output (see `check_upstream_parity.py`)
-> surfaced and fixed all three — the focal one mattered: averaging a low-overlap
-> second view dragged focal 507 vs the correct 596, a 15% error → 1.35° pose error.
-
-`solve_pnp_cv2` is the byte-for-byte upstream call (needs `opencv`, in the CUDA
-container or the nix shell). `solve_pnp_numpy` is a DLT + RANSAC reference with no
-third-party deps, verified against analytic ground truth so the pipeline runs
-anywhere numpy is.
-
-## Files
-
-- `focal.py` — Weiszfeld shared-focal estimation (mirrors upstream `estimate_focal`).
-- `pnp.py` — DLT/RANSAC + cv2 PnP backends, `estimate_poses` orchestration, and
-  `rescale_to_baseline` (the runner's scene normalization).
-- `align.py` — Umeyama similarity fit, RANSAC variant, the **residual ladder**
-  (`diagnose`: rigid→similarity→affine + depth-correlation) that detects whether a
-  single uniform scale suffices or the mismatch is non-uniform, and similarity
-  chaining / loop-closure metrics.
-- `_upstream.py` — vendored upstream kernels (`fast_pnp` verbatim, `estimate_focal`
-  numpy-transcribed) used ONLY by the parity check; not our code.
-- `test_pose.py` — asset-free golden tests (no model, no fixtures, no cv2).
-- `check_cv2_parity.py` — `solve_pnp_numpy` vs the exact `cv2.solvePnPRansac` on
-  synthetic ground truth (needs cv2).
-- `check_upstream_parity.py` — our whole orchestration vs the upstream recipe on
-  REAL engine output (needs cv2 + a dumped `[N,H,W,23]` `.f32`).
-- `empirical.py` — cross-run residual-ladder on real engine output (the scale test).
-
-### Dense GT-posed control (is it us, or the data/model?)
-
-- `tt_control.py` / `tt_experiment.py` — Tanks-and-Temples (NSVF) loader + engine
-  vs GT-pose check. **Verdict: T&T is OUT OF DISTRIBUTION** for FreeSplatter-scene
-  (narrow-FOV object orbits): opacity confident on only 8–17% of pixels, pose error
-  28–145°. Kept as the harness + the negative result.
-- `re10k_control.py` / `re10k_fetch.py` / `re10k_experiment.py` — RealEstate10K
-  loader (parser + GT geometry), yt-dlp/ffmpeg frame fetch, and the engine vs
-  GT-pose check. **In distribution → the control works**: relative pose recovered
-  to **0.4–1.5°** vs GT, opacity confident on 68–75% of pixels.
-- `re10k_crossrun.py` — cross-run consistency vs baseline (the accumulation
-  question): a shared frame's two reconstructions agree on **98% of pixels within
-  10%** of scene extent at small baseline.
-
-### Accumulation prototype (the live idea, assembled)
-
-- `accumulate.py` — slide a window over a clip, recover each pair (PnP), chain a
-  Sim(3) between consecutive runs via their shared-frame correspondences
-  (`align.compose`), drop every frame's gaussians into one world, and measure the
-  recovered camera trajectory vs GT. `render_ply.py` — pinhole projection of the
-  colored cloud to a PNG (visual coherence check).
-- **Result (13 pairs, stride 20):** per-link Sim(3) residual ~1.0–1.4%; trajectory
-  ATE **11%** of extent with drift growing 7%→13% (monotone monocular scale drift,
-  0.755 over 12 links); the 2.6M-point cloud renders as a **coherent room** matching
-  the input. The accumulating-reconstruction idea is **proven** end-to-end.
-- `find_loop.py` — search re10k poses for a clip that revisits its start (the loop
-  substrate). `loop_closure.py` — chain open-loop, measure the loop error via a
-  closing pair, then distribute it (even Sim(3) relaxation, `align.sim_frac_power`)
-  and report ATE before/after. **Finding:** loop closure helps only when drift
-  *accumulates* into a large loop error. On the real loop clip tested it did NOT
-  help — the open-loop chain already closes (loop error 4.4° / scale 1.12 / 8%
-  trans), so the ~34% ATE is per-link **odometry noise** (inlier% as low as 17%
-  on fast legs) + the focal-bias warp, *self-consistent but distorted vs GT*, which
-  loop closure can't fix. The correction machinery itself is verified to recover
-  synthetic accumulated drift to ~0 (`test_pose.py::test_loop_correction`).
-- `fuse.py` — **consensus fusion** (the edge-noise answer): voxelize the accumulated
-  cloud at the consistency scale and keep only voxels corroborated by ≥K distinct
-  frames, averaging the agreeing predictions. On the forward clip: **14% of points
-  are single-frame floaters** that render as incoherent edge-haze, while the ≥2-frame
-  consensus renders as a **clean, crisp room** with the haze gone. Tradeoff: dropping
-  single-frame points also trims the single-view periphery (coverage vs cleanliness).
-
-## Run the tests
-
-```sh
-nix develop -c python3 pose/test_pose.py                      # golden, numpy only
-nix develop -c python3 pose/check_cv2_parity.py               # solver vs cv2 (needs cv2)
-EMP_DIR=/path/to/dumps nix develop -c python3 \
-    pose/check_upstream_parity.py A_scn.f32                   # orchestration vs upstream
-# dense control (needs the engine + a GGUF + network for re10k frames):
-nix develop -c python3 pose/re10k_fetch.py CLIP.txt OUTDIR 10
-FS_DEVICE=cpu nix develop -c python3 \
-    pose/re10k_experiment.py CLIP.txt OUTDIR 40 20,40,80,160
-```
-
-## Status
-
-- ✅ **Solver parity** — `solve_pnp_numpy` ≡ `cv2.solvePnPRansac` (~1e-7 clean).
-- ✅ **Orchestration parity** — our `estimate_poses` is **bit-exact** to upstream's
-  scene recipe on real engine output (focal/PnP/relative-pose all 0.00° at matched
-  precision; the only residual is upstream's float32-K vs our float64, a RANSAC
-  inlier-boundary effect of ≤0.5° on near-degenerate object data, 0° on scene).
-- ✅ **Empirical scale test** — cross-run mismatch is a uniform-scale similarity
-  (scene ~11% scale drift), no nonlinear warp; `diagnose` classifies it.
-- ✅ **Dense GT-posed control** — pipeline validated against INDEPENDENT GT on
-  in-distribution re10k (relative pose to 0.4–1.5°); T&T confirmed OOD. Finding:
-  the model has a constant wide-FOV focal bias (recovers ~274 vs GT ~439) — benign
-  for relative accumulation, off for metric scale.
-- ✅ **Accumulation prototype** — sliding-window Sim(3) chaining over a clip builds
-  one coherent world; trajectory tracks GT to ~11% ATE with bounded-able drift. The
-  idea is proven; per CLAUDE.md the next implementation step is the **C++ port**.
-- ✅ **Loop closure** — implemented + machinery verified on synthetic drift (recovers
-  to ~0). On a real loop clip it did NOT help: the chain already closes, so the error
-  is odometry noise + focal warp, not accumulated drift. Lesson: better odometry
-  (smaller baselines / fusion / focal) is the lever for short loops; loop closure
-  pays off on long trajectories with consistent accumulated drift.
-- ✅ **Consensus fusion** — gating the accumulated cloud on ≥2-frame agreement removes
-  the 14% single-frame floaters (incoherent edge-haze) and yields a clean surface —
-  the definitive "yes, accumulation removes the edge noise." Trades single-view
-  periphery for a cleaner core.
-
-## Not done yet (honest)
-
-- The **focal bias** (recovered ~274 vs GT ~440+) — the remaining geometry-distortion
-  source; worth understanding whether it's fixable.
-- A **higher-motion clip** wide-baseline sweep, and a **long** trajectory where loop
-  closure demonstrably pays off.
-- **Fuse-then-align**: feed the consensus surface back into the odometry (lift the
-  low-inlier links the loop-closure diagnosis flagged), not just the final cloud.
-- The **C++ port** (CLI + C API, no Python) once the design is locked — then this
-  prototype is deleted.
diff --git a/pose/_upstream.py b/pose/_upstream.py
deleted file mode 100644
index 97f7199..0000000
--- a/pose/_upstream.py
+++ /dev/null
@@ -1,107 +0,0 @@
-"""Vendored upstream reference for the parity check -- NOT our code.
-
-These are the two numerical kernels FreeSplatter's `estimate_poses` calls, copied
-from TencentARC/FreeSplatter `freesplatter/utils/recon_util.py`
-(https://raw.githubusercontent.com/TencentARC/FreeSplatter/main/freesplatter/utils/recon_util.py,
-fetched 2026-06-26). They exist here ONLY so `check_upstream_parity.py` can run
-the original algorithm against our `focal.py`/`pnp.py` on real engine output.
-
-  * `fast_pnp`  -- copied VERBATIM (it is cv2 + numpy only; runs as-is in nix).
-  * `estimate_focal_np` -- a faithful numpy transcription of upstream
-    `estimate_focal` (the original is torch; the math is identical -- integer
-    `xy_grid` pixels, nan_to_num, mean/mean init, 10 IRLS reweights, the
-    `focal_base` clip which is a no-op at the default min/max). Kept line-by-line
-    so any divergence is a real algorithmic difference, not a re-derivation.
-
-Do not "improve" or refactor this file; its value is being unmodified upstream.
-"""
-import cv2
-import numpy as np
-
-
-def xy_grid(W, H):
-    """Upstream xy_grid (numpy branch): (H,W,2) int, [...,0]=col(x), [...,1]=row(y)."""
-    tw, th = np.arange(W), np.arange(H)
-    gx, gy = np.meshgrid(tw, th, indexing="xy")     # both (H,W)
-    return np.stack([gx, gy], axis=-1)
-
-
-def estimate_focal_np(pts3d, pp=None, mask=None, min_focal=0.0, max_focal=np.inf):
-    """Faithful numpy transcription of upstream recon_util.estimate_focal."""
-    H, W, THREE = pts3d.shape
-    assert THREE == 3
-    if pp is None:
-        pp = np.array([W / 2.0, H / 2.0], dtype=float)
-    pp = np.asarray(pp, float)
-
-    pixels = xy_grid(W, H).reshape(-1, 2).astype(float) - pp.reshape(1, 2)   # (HW,2)
-    pts = pts3d.reshape(H * W, 3).astype(float)
-
-    if mask is not None:
-        m = np.asarray(mask).ravel().astype(bool)
-        assert len(m) == pts.shape[0]
-        pts = pts[m]
-        pixels = pixels[m]
-
-    xy_over_z = pts[:, :2] / pts[:, 2:3]
-    xy_over_z = np.nan_to_num(xy_over_z, posinf=0.0, neginf=0.0)
-
-    dot_xy_px = (xy_over_z * pixels).sum(axis=-1)
-    dot_xy_xy = (xy_over_z ** 2).sum(axis=-1)
-    focal = dot_xy_px.mean() / dot_xy_xy.mean()
-
-    for _ in range(10):
-        dis = np.linalg.norm(pixels - focal * xy_over_z, axis=-1)
-        w = 1.0 / np.clip(dis, 1e-8, None)
-        focal = (w * dot_xy_px).mean() / (w * dot_xy_xy).mean()
-
-    focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2))
-    focal = float(np.clip(focal, min_focal * focal_base, max_focal * focal_base))
-    return focal
-
-
-def fast_pnp(pts3d, mask, focal=None, pp=None, niter_PnP=10, k_dtype=np.float32):
-    """VERBATIM copy of upstream recon_util.fast_pnp (cv2 + numpy).
-
-    The default `k_dtype=np.float32` reproduces upstream byte-for-byte (upstream
-    writes `K = np.float32(...)`). `check_upstream_parity.py` passes float64 to
-    isolate the one precision choice that differs from our wrapper: on near-
-    threshold-dense object data the float32 K nudges a few hundred points across
-    the 5px inlier boundary, shifting the pose ~0.5deg. Matched precision -> exact."""
-    H, W, _ = pts3d.shape
-    pixels = np.mgrid[:W, :H].T.astype(float)
-
-    if focal is None:
-        S = max(W, H)
-        tentative_focals = np.geomspace(S / 2, S * 3, 21)
-    else:
-        tentative_focals = [focal]
-
-    if pp is None:
-        pp = (W / 2, H / 2)
-
-    best = 0,
-    for focal in tentative_focals:
-        K = np.array([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)], dtype=k_dtype)
-
-        success, R, T, inliers = cv2.solvePnPRansac(
-            pts3d[mask], pixels[mask], K, None,
-            iterationsCount=niter_PnP, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP)
-        if not success:
-            continue
-
-        score = len(inliers)
-        if success and score > best[0]:
-            best = score, R, T, focal
-
-    if not best[0]:
-        return None
-
-    _, R, T, best_focal = best
-    R = cv2.Rodrigues(R)[0]      # world to cam
-    world2cam = np.eye(4).astype(float)
-    world2cam[:3, :3] = R
-    world2cam[:3, 3] = T.reshape(3)
-    cam2world = np.linalg.inv(world2cam)
-
-    return best_focal, cam2world
diff --git a/pose/accumulate.py b/pose/accumulate.py
deleted file mode 100644
index decab3a..0000000
--- a/pose/accumulate.py
+++ /dev/null
@@ -1,157 +0,0 @@
-"""Sliding-window accumulation prototype -- assemble the validated pieces.
-
-Slide a window of overlapping pairs across a re10k clip:
-    pair k = (f_k, f_{k+1})   ->  gaussians in f_k's camera frame (run k's world)
-Consecutive runs share a frame (f_{k+1} is view1 of pair k and view0 of pair k+1),
-so their per-pixel gaussians for that frame are exact correspondences. Fit a Sim(3)
-S_k mapping run k's frame into run k-1's frame, and COMPOSE into a global transform
-    T_0 = I ,  T_k = T_{k-1} . S_k
-that drops every run's gaussians into ONE world (frame f_0's). Accumulate them,
-and measure the recovered camera trajectory against ground truth (drift).
-
-This is the live pipeline minus realtime: PnP (pnp.py) + Sim(3) align (align.py) +
-chaining (align.compose) over a real sequence, justified by re10k_crossrun.py.
-
-    FS_DEVICE=cpu nix develop -c python3 pose/accumulate.py CLIP.txt FRAME_DIR \\
-        --start 0 --stride 20 --count 13 --cache DIR --ply out.ply
-
-Engine dumps are cached (pair_*.f32); re-runs skip inference.
-"""
-import os
-import argparse
-import numpy as np
-
-import re10k_control as rc
-import re10k_fetch as rf
-import re10k_experiment as ex          # run_engine
-import pnp
-import align
-
-C0 = 0.28209479177387814
-THR = 0.05
-
-
-def load_dump(path):
-    a = np.fromfile(path, np.float32).reshape(2, 512, 512, 23)
-    pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)]
-    op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)]
-    rgb = [np.clip(a[v, ..., 3:6] * C0 + 0.5, 0, 1) for v in (0, 1)]
-    return pts, op, rgb
-
-
-def write_ply(path, xyz, rgb):
-    n = len(xyz)
-    hdr = ("ply\nformat binary_little_endian 1.0\n"
-           f"element vertex {n}\n"
-           "property float x\nproperty float y\nproperty float z\n"
-           "property uchar red\nproperty uchar green\nproperty uchar blue\n"
-           "end_header\n")
-    rec = np.zeros(n, dtype=[("x", "<f4"), ("y", "<f4"), ("z", "<f4"),
-                             ("r", "u1"), ("g", "u1"), ("b", "u1")])
-    rec["x"], rec["y"], rec["z"] = xyz[:, 0], xyz[:, 1], xyz[:, 2]
-    c = (np.clip(rgb, 0, 1) * 255).astype(np.uint8)
-    rec["r"], rec["g"], rec["b"] = c[:, 0], c[:, 1], c[:, 2]
-    with open(path, "wb") as f:
-        f.write(hdr.encode())
-        f.write(rec.tobytes())
-
-
-def main():
-    ap = argparse.ArgumentParser()
-    ap.add_argument("clip"); ap.add_argument("frame_dir")
-    ap.add_argument("--start", type=int, default=0)
-    ap.add_argument("--stride", type=int, default=20)
-    ap.add_argument("--count", type=int, default=13)        # number of pairs
-    ap.add_argument("--cache", default=os.environ.get("SCRATCH", "/tmp"))
-    ap.add_argument("--ply", default=None)
-    ap.add_argument("--max-points", type=int, default=2_000_000)
-    a = ap.parse_args()
-
-    url, frames = rc.parse_clip(a.clip)
-    idx = [a.start + k * a.stride for k in range(a.count + 1)]      # n+1 frame indices
-    assert idx[-1] < len(frames), f"frame {idx[-1]} beyond clip ({len(frames)})"
-    # make sure the frames exist (fetch on demand)
-    missing = [i for i in idx if not os.path.exists(os.path.join(a.frame_dir, f"f{i:04d}.png"))]
-    if missing:
-        rf.fetch(a.clip, a.frame_dir, indices=idx)
-    os.makedirs(a.cache, exist_ok=True)
-    fp = lambda i: os.path.join(a.frame_dir, f"f{i:04d}.png")
-
-    print(f"=== accumulate: {os.path.basename(a.clip)}  frames {idx[0]}..{idx[-1]} "
-          f"stride {a.stride}  ({a.count} pairs)  device={ex.DEVICE} ===\n{url}")
-
-    # --- run / load each pair, recover its relative pose ---
-    runs = []           # per pair: dict(pts, op, rgb, c2w)
-    for k in range(a.count):
-        dump = os.path.join(a.cache, f"pair_{idx[k]}_{idx[k+1]}.f32")
-        if not os.path.exists(dump):
-            ex.run_engine(fp(idx[k]), fp(idx[k + 1]), dump)
-            print(f"  ran pair {k}: ({idx[k]},{idx[k+1]})")
-        pts, op, rgb = load_dump(dump)
-        res = pnp.estimate_poses(pts, op, normalize=False)
-        runs.append({"pts": pts, "op": op, "rgb": rgb, "c2w": res["cam2world"]})
-
-    # --- chain Sim(3): T_k maps run k's frame -> global (frame f_0) ---
-    T = [align.identity()]
-    chain_info = []
-    for k in range(1, a.count):
-        XA, oA = runs[k]["pts"][0].reshape(-1, 3), runs[k]["op"][0].reshape(-1)     # f_k as view0 of run k
-        XB, oB = runs[k-1]["pts"][1].reshape(-1, 3), runs[k-1]["op"][1].reshape(-1) # f_k as view1 of run k-1
-        m = (oA > THR) & (oB > THR)
-        scene = align._rms(XB[m] - XB[m].mean(0))
-        s, R, t, inl = align.fit_similarity_ransac(XA[m], XB[m], thresh=0.02 * scene, iters=300)
-        S_k = (s, R, t)
-        T.append(align.compose(T[k-1], S_k))
-        resid = align._rms(align.apply_sim(S_k, XA[m][inl]) - XB[m][inl]) / scene
-        chain_info.append((k, s, 100*inl.mean(), 100*m.mean(), 100*resid))
-
-    # --- accumulate gaussians into the global frame (each frame once) ---
-    XYZ, RGB = [], []
-    def add(run_idx, view, Tk):
-        pts = runs[run_idx]["pts"][view].reshape(-1, 3)
-        op = runs[run_idx]["op"][view].reshape(-1)
-        rgb = runs[run_idx]["rgb"][view].reshape(-1, 3)
-        keep = op > THR
-        XYZ.append(align.apply_sim(Tk, pts[keep])); RGB.append(rgb[keep])
-    add(0, 0, T[0])                                  # f_0 (view0 of pair 0)
-    for k in range(a.count):                         # f_{k+1} = view1 of pair k
-        add(k, 1, T[k])
-    XYZ = np.concatenate(XYZ); RGB = np.concatenate(RGB)
-
-    # --- recovered camera trajectory (global) vs GT ---
-    rec = [align.apply_sim(T[k], runs[k]["c2w"][0][:3, 3]) for k in range(a.count)]
-    rec.append(align.apply_sim(T[a.count-1], runs[a.count-1]["c2w"][1][:3, 3]))
-    rec = np.array(rec)
-    gt = np.array([frames[i]["c2w"][:3, 3] for i in idx])
-    sA, RA, tA = align.fit_similarity(rec, gt)        # align recovered traj -> GT
-    rec_al = align.apply_sim((sA, RA, tA), rec)
-    perr = np.linalg.norm(rec_al - gt, axis=1)
-    gt_ext = align._rms(gt - gt.mean(0))
-
-    # --- report ---
-    print("\nchain (per shared frame): step  scale  inlier%  valid%  resid%scene")
-    for k, s, inlp, vp, rp in chain_info:
-        print(f"   {k:>3}   {s:>6.3f}  {inlp:>6.1f}  {vp:>6.1f}  {rp:>7.2f}")
-    total_scale = np.prod([ci[1] for ci in chain_info]) if chain_info else 1.0
-    print(f"cumulative scale drift over {a.count-1} links: {total_scale:.3f} "
-          f"(per-link mean {total_scale**(1/max(a.count-1,1)):.4f})")
-
-    print(f"\ntrajectory vs GT ({len(idx)} frames, after global Sim(3) align):")
-    print(f"   ATE rms = {align._rms(rec_al-gt):.4f}  ({100*align._rms(rec_al-gt)/gt_ext:.1f}% "
-          f"of GT extent {gt_ext:.4f})")
-    print(f"   per-frame error % of extent: " +
-          " ".join(f"{100*e/gt_ext:.0f}" for e in perr))
-    print(f"   drift: first-half {100*perr[:len(perr)//2].mean()/gt_ext:.1f}%  "
-          f"second-half {100*perr[len(perr)//2:].mean()/gt_ext:.1f}%")
-
-    print(f"\naccumulated cloud: {len(XYZ):,} points (opacity>{THR})")
-    if a.ply:
-        if len(XYZ) > a.max_points:
-            sel = np.random.default_rng(0).choice(len(XYZ), a.max_points, replace=False)
-            XYZ, RGB = XYZ[sel], RGB[sel]
-        write_ply(a.ply, XYZ, RGB)
-        print(f"   wrote {a.ply} ({len(XYZ):,} points)")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/pose/align.py b/pose/align.py
deleted file mode 100644
index 0beec07..0000000
--- a/pose/align.py
+++ /dev/null
@@ -1,234 +0,0 @@
-"""Coordinate-system normalization: align one inference's world into another's.
-
-DOWNSTREAM of the engine seam. Each free_splatter_run over a different photo
-pair invents its OWN coordinate system (gaussians in the first view's camera
-frame, then rescaled by s = 1/mean-point-distance -- see FreeSplatter
-estimate_poses). Two runs that share a photo therefore describe the SAME surface
-at a DIFFERENT scale, because each run's 1/d normalization is computed over a
-different point support. So aligning run B into run A is a SIMILARITY transform
-(rotation + translation + ONE uniform scale, 7 DoF), not a rigid one.
-
-This module fits that similarity in closed form (Umeyama 1991), provides a
-RANSAC variant for floaters, and -- crucially -- a residual ladder that *detects*
-whether a single uniform scale actually suffices or whether the mismatch is
-non-uniform (focal error / affine depth ambiguity), which is the open question.
-
-Pure numpy. A similarity is represented as the tuple T = (s, R, t) acting on
-column-stacked points by  x -> s * R @ x + t.
-"""
-import numpy as np
-
-
-def apply_sim(T, X):
-    """Apply T=(s,R,t) to X (N,3) -> (N,3)."""
-    s, R, t = T
-    return (s * (R @ X.T)).T + t
-
-
-def _rms(resid):
-    """Root-mean-square point error of an (N,3) residual array."""
-    return float(np.sqrt((resid ** 2).sum(axis=1).mean()))
-
-
-def fit_similarity(X, Y, with_scale=True):
-    """Umeyama: best (s,R,t) minimizing ||s R X + t - Y||^2 in closed form.
-
-    with_scale=False degenerates to rigid Kabsch (s fixed at 1). Reflections are
-    rejected (proper rotation, det R = +1) exactly as Umeyama prescribes.
-    """
-    X = np.asarray(X, float)
-    Y = np.asarray(Y, float)
-    n = X.shape[0]
-    mx = X.mean(0)
-    my = Y.mean(0)
-    Xc = X - mx
-    Yc = Y - my
-    Sigma = (Yc.T @ Xc) / n               # 3x3 cross-covariance, maps X -> Y
-    U, D, Vt = np.linalg.svd(Sigma)
-    S = np.eye(3)
-    if np.linalg.det(U) * np.linalg.det(Vt) < 0:   # reflection guard
-        S[-1, -1] = -1.0
-    R = U @ S @ Vt
-    if with_scale:
-        var_x = (Xc ** 2).sum() / n
-        s = float((D * np.diag(S)).sum() / var_x) if var_x > 0 else 1.0
-    else:
-        s = 1.0
-    t = my - s * (R @ mx)
-    return s, R, t
-
-
-def fit_rigid(X, Y):
-    """Rigid (6-DoF) fit -- similarity with the scale DoF removed."""
-    return fit_similarity(X, Y, with_scale=False)
-
-
-def fit_affine(X, Y):
-    """Full affine (12-DoF) fit: Y ~= A X + b. Returns ((A,b), prediction).
-
-    Used only as the next rung of the residual ladder: how much of the leftover
-    error after a similarity is explained by *non-uniform* (shear/anisotropic)
-    deformation -- the signature of focal error or affine-invariant depth.
-    """
-    n = X.shape[0]
-    Xh = np.hstack([X, np.ones((n, 1))])          # (n,4)
-    M, *_ = np.linalg.lstsq(Xh, Y, rcond=None)    # (4,3)
-    pred = Xh @ M
-    A = M[:3].T
-    b = M[3]
-    return (A, b), pred
-
-
-def fit_similarity_ransac(X, Y, thresh, iters=300, with_scale=True, seed=0):
-    """RANSAC similarity: robust to gross outliers (floaters in the overlap).
-
-    Minimal sample is 3 point pairs (a similarity has 7 DoF; 3 pairs give 9
-    constraints). Returns (s,R,t, inlier_mask).
-    """
-    rng = np.random.default_rng(seed)
-    n = len(X)
-    best = np.zeros(n, bool)
-    for _ in range(iters):
-        idx = rng.choice(n, 3, replace=False)
-        try:
-            T = fit_similarity(X[idx], Y[idx], with_scale)
-        except np.linalg.LinAlgError:
-            continue
-        res = np.linalg.norm(apply_sim(T, X) - Y, axis=1)
-        inl = res < thresh
-        if inl.sum() > best.sum():
-            best = inl
-    if best.sum() < 3:                            # fall back to plain fit
-        best = np.ones(n, bool)
-    T = fit_similarity(X[best], Y[best], with_scale)
-    return (*T, best)
-
-
-def residual_ladder(X, Y):
-    """Fit rigid -> similarity -> affine and report each RMS residual.
-
-    The DROP from rigid->similarity is how much was pure uniform scale; the drop
-    from similarity->affine is how much non-uniform warp remains. depth_corr is
-    the Pearson correlation between the per-point similarity residual magnitude
-    and the point's distance from the centroid -- a positive value means the
-    leftover error GROWS with depth, the fingerprint of focal/projective error.
-    All residuals are absolute (same units as Y); compare against `scene`.
-    """
-    scene = _rms(Y - Y.mean(0))                   # overall extent of the target
-
-    Tr = fit_rigid(X, Y)
-    r_rigid = _rms(apply_sim(Tr, X) - Y)
-
-    Ts = fit_similarity(X, Y, with_scale=True)
-    res_sim = apply_sim(Ts, X) - Y
-    r_sim = _rms(res_sim)
-
-    _, pred_aff = fit_affine(X, Y)
-    r_aff = _rms(pred_aff - Y)
-
-    depth = np.linalg.norm(X - X.mean(0), axis=1)
-    rmag = np.linalg.norm(res_sim, axis=1)
-    depth_corr = float(np.corrcoef(depth, rmag)[0, 1]) if rmag.std() > 1e-12 else 0.0
-
-    return {
-        "scene": scene,
-        "rigid": r_rigid,
-        "similarity": r_sim,
-        "affine": r_aff,
-        "scale": float(Ts[0]),
-        "depth_corr": depth_corr,
-    }
-
-
-def diagnose(X, Y, tol=1e-3, corr_tol=0.3):
-    """Classify the A<->B mismatch from the residual ladder.
-
-    Residuals are judged relative to scene extent. Verdicts:
-      rigid_ok    : a rigid transform already fits -- same scale (rare across runs)
-      needs_scale : a SIMILARITY fits but rigid does not -- the expected case
-      needs_affine: even similarity fails but affine fits -- non-uniform warp
-      nonlinear   : affine fails too -- genuinely nonlinear (e.g. quadratic depth)
-    `depth_corr` is reported as corroborating evidence of structure.
-    """
-    L = residual_ladder(X, Y)
-    sc = L["scene"] or 1.0
-    rr, rs, ra = L["rigid"] / sc, L["similarity"] / sc, L["affine"] / sc
-    # Is the leftover after a similarity STRUCTURED (a real warp) or just noise?
-    # Two independent signatures: affine meaningfully beats similarity, OR the
-    # residual correlates with depth. Neither => unstructured noise floor.
-    aff_gain = (L["similarity"] - L["affine"]) / L["similarity"] if L["similarity"] > 0 else 0.0
-    structured = abs(L["depth_corr"]) > corr_tol or aff_gain > 0.25
-    if rr < tol:
-        v = "rigid_ok"                       # same scale AND pose: rare across runs
-    elif rs < tol:
-        v = "needs_scale"                    # a clean similarity fits: the expected case
-    elif ra < tol:
-        v = "needs_affine"                   # similarity fails, affine fits: non-uniform
-    elif structured:
-        v = "nonlinear"                      # even affine fails AND residual is structured
-    else:
-        v = "similarity_plus_noise"          # similarity is the model; rest is noise floor
-    L["verdict"] = v
-    L["structured"] = bool(structured)
-    L["aff_gain"] = float(aff_gain)
-    return L
-
-
-# --- chaining similarities along the photo stream --------------------------
-
-def compose(T2, T1):
-    """Return T2 . T1 (apply T1 then T2)."""
-    s1, R1, t1 = T1
-    s2, R2, t2 = T2
-    return (s2 * s1, R2 @ R1, s2 * (R2 @ t1) + t2)
-
-
-def invert(T):
-    """Inverse similarity."""
-    s, R, t = T
-    si = 1.0 / s
-    Ri = R.T
-    return (si, Ri, -si * (Ri @ t))
-
-
-def identity():
-    return (1.0, np.eye(3), np.zeros(3))
-
-
-# --- Sim(3) as 4x4 matrices (for pose-graph relaxation) --------------------
-
-def sim_matrix(s, R, t):
-    """Sim(3) as a 4x4 homogeneous matrix [[sR, t],[0,1]] (compose == matmul)."""
-    M = np.eye(4)
-    M[:3, :3] = s * R
-    M[:3, 3] = t
-    return M
-
-
-def sim_frac_power(M, f):
-    """Fractional power M^f of a Sim(3) 4x4, via eigendecomposition (real part).
-
-    The even loop-closure relaxation: distribute an accumulated drift D over n
-    nodes by applying D^(k/n) at node k. M^0=I, M^1=M, (M^0.5)^2=M; valid while the
-    rotation is < 180deg (principal branch). Tested in test_pose.py."""
-    w, V = np.linalg.eig(M)
-    return (V @ np.diag(w ** f) @ np.linalg.inv(V)).real
-
-
-def loop_closure_error(transforms):
-    """Compose a closed loop of similarities and measure deviation from identity.
-
-    transforms: list [T_{0->1}, T_{1->2}, ..., T_{k->0}]. With drift-free links
-    the product is identity; the returned dict quantifies residual scale, rotation
-    (deg) and translation drift -- a direct scale-drift / loop-closure metric.
-    """
-    T = identity()
-    for Ti in transforms:
-        T = compose(Ti, T)
-    s, R, t = T
-    ang = np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1)))
-    return {
-        "scale_err": abs(np.log(s)),          # 0 == no scale drift
-        "rot_deg": float(ang),
-        "trans": float(np.linalg.norm(t)),
-    }
diff --git a/pose/check_cv2_parity.py b/pose/check_cv2_parity.py
deleted file mode 100644
index 4f9e8f2..0000000
--- a/pose/check_cv2_parity.py
+++ /dev/null
@@ -1,73 +0,0 @@
-"""Verify the numpy DLT/RANSAC PnP against the EXACT upstream solver.
-
-FreeSplatter uses cv2.solvePnPRansac(reprojErr=5, SOLVEPNP_SQPNP). This compares
-our dependency-free reference backend to that exact call on synthetic scenes with
-known ground truth -- both must recover the true pose AND agree with each other.
-Needs cv2 (nix develop, after opencv was added to the flake).
-
-    nix develop -c python3 pose/check_cv2_parity.py
-"""
-import numpy as np
-import pnp
-
-RNG = np.random.default_rng(7)
-fails = []
-
-
-def rot_angle(R):
-    return np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1)))
-
-
-def rand_small_pose(rng):
-    ax = rng.normal(size=3)
-    ax /= np.linalg.norm(ax)
-    ang = np.radians(rng.uniform(3, 20))
-    Kx = np.array([[0, -ax[2], ax[1]], [ax[2], 0, -ax[0]], [-ax[1], ax[0], 0]])
-    R = np.eye(3) + np.sin(ang) * Kx + (1 - np.cos(ang)) * Kx @ Kx
-    t = rng.uniform(-0.6, 0.6, size=3)
-    return R, t
-
-
-def run(label, with_outliers):
-    f = 400.0
-    K = np.array([[f, 0, 256.0], [0, f, 256.0], [0, 0, 1.0]])
-    worst = {"np_R": 0, "np_t": 0, "cv_R": 0, "cv_t": 0, "agree_R": 0, "agree_t": 0}
-    for _ in range(10):
-        Pw = RNG.uniform([-2, -2, 4], [2, 2, 8], size=(500, 3))
-        R, t = rand_small_pose(RNG)
-        Xc = (R @ Pw.T).T + t
-        if np.any(Xc[:, 2] <= 0):
-            continue
-        px = (K @ Xc.T).T
-        px = px[:, :2] / px[:, 2:3]
-        if with_outliers:
-            oi = RNG.choice(len(px), int(0.2 * len(px)), replace=False)
-            px[oi] += RNG.uniform(-120, 120, size=(len(oi), 2))
-
-        c2w_np, _ = pnp.solve_pnp_numpy(Pw, px, K, thresh_px=4.0, iters=300)
-        c2w_cv, _ = pnp.solve_pnp_cv2(Pw, px, K)
-        w_np, w_cv = np.linalg.inv(c2w_np), np.linalg.inv(c2w_cv)
-        worst["np_R"] = max(worst["np_R"], rot_angle(w_np[:3, :3] @ R.T))
-        worst["np_t"] = max(worst["np_t"], np.linalg.norm(w_np[:3, 3] - t))
-        worst["cv_R"] = max(worst["cv_R"], rot_angle(w_cv[:3, :3] @ R.T))
-        worst["cv_t"] = max(worst["cv_t"], np.linalg.norm(w_cv[:3, 3] - t))
-        worst["agree_R"] = max(worst["agree_R"], rot_angle(w_np[:3, :3] @ w_cv[:3, :3].T))
-        worst["agree_t"] = max(worst["agree_t"], np.linalg.norm(w_np[:3, 3] - w_cv[:3, 3]))
-
-    tolR, tolt = (0.5, 0.05) if with_outliers else (1e-3, 1e-4)
-    print(f"{label}: worst-of-10  "
-          f"numpy(R={worst['np_R']:.2e}deg t={worst['np_t']:.2e})  "
-          f"cv2(R={worst['cv_R']:.2e}deg t={worst['cv_t']:.2e})  "
-          f"agree(R={worst['agree_R']:.2e}deg t={worst['agree_t']:.2e})")
-    ok = (worst["np_R"] < tolR and worst["cv_R"] < tolR and
-          worst["agree_R"] < tolR and worst["agree_t"] < tolt)
-    if not ok:
-        fails.append(label)
-    return ok
-
-
-print("PnP backend parity: numpy DLT/RANSAC  vs  upstream cv2.solvePnPRansac")
-run("clean   ", with_outliers=False)
-run("outliers", with_outliers=True)
-print("PARITY OK" if not fails else f"PARITY FAILED: {fails}")
-raise SystemExit(1 if fails else 0)
diff --git a/pose/check_upstream_parity.py b/pose/check_upstream_parity.py
deleted file mode 100644
index 6fceb65..0000000
--- a/pose/check_upstream_parity.py
+++ /dev/null
@@ -1,154 +0,0 @@
-"""Verify our WHOLE pose orchestration against upstream, on REAL engine output.
-
-`check_cv2_parity.py` already pinned our solver wrapper to cv2.solvePnPRansac on
-synthetic ground truth. This goes one level up: it drives the *entire* upstream
-`estimate_poses` recipe (FreeSplatter scene path: use_first_focal, opacity mask
-for PnP, no focal mask, pnp_iter=10 -- see .cache/upstream/{model,runner}.py)
-using the vendored-verbatim upstream kernels in `_upstream.py`, and compares it
-to our `focal.py`/`pnp.py` on the gaussians the C++ engine actually produced.
-
-Three levels, each isolating one stage so a divergence points at one file:
-  1. focal kernel   -- focal.estimate_focal           vs _upstream.estimate_focal_np
-  2. PnP solver     -- pnp.solve_pnp_cv2              vs _upstream.fast_pnp   (shared focal)
-  3. full pipeline  -- pnp.estimate_poses             vs the upstream recipe replica
-Plus the scene baseline-rescale (runner.run_freesplatter_scene) for reference.
-
-    nix develop -c python3 pose/check_upstream_parity.py A_scn.f32   # scene (live case)
-
-Defaults to the scratchpad dumps if no path is given. Opacity in the engine
-output is ALREADY sigmoid-activated, so mask = (opacity > thr) reproduces
-upstream's sigmoid(raw) > thr exactly.
-"""
-import os
-import sys
-import numpy as np
-import cv2
-
-import focal
-import pnp
-import _upstream as up
-
-C, OPACITY = 23, 15
-THR = 0.05
-SEED = 0           # cv2.solvePnPRansac draws from the shared global cv::theRNG();
-                   # reseed identically before each call so RANSAC picks the SAME
-                   # minimal samples -> isolates algorithmic parity from RNG state.
-SCRATCH = os.environ.get("EMP_DIR", "").rstrip("/")
-_fails = []
-
-
-def rot_angle(R):
-    return float(np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1))))
-
-
-def load(path, H=512, W=512):
-    a = np.fromfile(path, np.float32)
-    N = a.size // (H * W * C)
-    a = a.reshape(N, H, W, C)
-    pts = [a[i, ..., 0:3].astype(np.float64) for i in range(N)]      # (H,W,3) each
-    op = [a[i, ..., OPACITY].astype(np.float64) for i in range(N)]   # (H,W) each
-    return N, H, W, pts, op
-
-
-def check(name, cond, detail=""):
-    tag = "PASS" if cond else "FAIL"
-    print(f"  [{tag}] {name}" + (f"   ({detail})" if detail else ""))
-    if not cond:
-        _fails.append(name)
-
-
-def upstream_estimate_poses(pts, op, H, W, pnp_iter=10):
-    """Replicate FreeSplatter scene estimate_poses with vendored kernels.
-
-    use_first_focal=True, masks=None: focal from view0 over ALL pixels; per-view
-    PnP masked by opacity>THR. Returns (c2ws list, focal)."""
-    pp = np.array([W / 2.0, H / 2.0])
-    f = up.estimate_focal_np(pts[0], pp=pp, mask=None)              # view0, all pixels
-    c2ws = []
-    cv2.setRNGSeed(SEED)
-    for i in range(len(pts)):
-        mask = op[i] > THR                                          # (H,W) bool
-        res = up.fast_pnp(pts[i], mask, focal=f, niter_PnP=pnp_iter,
-                          k_dtype=np.float64)   # matched precision (our wrapper is f64)
-        c2ws.append(res[1])
-    return c2ws, f
-
-
-def main():
-    path = sys.argv[1] if len(sys.argv) > 1 else os.path.join(SCRATCH, "A_scn.f32")
-    if not os.path.isabs(path) and SCRATCH and not os.path.exists(path):
-        path = os.path.join(SCRATCH, path)
-    N, H, W, pts, op = load(path)
-    pp = np.array([W / 2.0, H / 2.0])
-    print(f"=== upstream-parity on {os.path.basename(path)}  ({N} views) ===")
-    for i in range(N):
-        print(f"  view{i}: valid(opacity>{THR}) = {(op[i] > THR).mean()*100:.1f}%")
-
-    # ---- Level 1: focal kernel (view0, all pixels) ----
-    print("\n[1] focal kernel  focal.estimate_focal  vs  upstream.estimate_focal_np")
-    grid = pnp.pixel_grid(H, W)                                     # our pixel convention
-    f_up = up.estimate_focal_np(pts[0], pp=pp, mask=None)
-    f_ours = focal.estimate_focal(pts[0].reshape(-1, 3), grid, pp)
-    rel = abs(f_ours - f_up) / f_up
-    check("focal matches upstream (all pixels, view0)", rel < 1e-6,
-          f"ours {f_ours:.4f} vs up {f_up:.4f}  rel {rel:.2e}")
-
-    # ---- Level 2: PnP solver, shared upstream focal ----
-    print("\n[2] PnP solver  pnp.solve_pnp_cv2  vs  upstream.fast_pnp  (focal fixed)")
-    K = np.array([[f_up, 0, pp[0]], [0, f_up, pp[1]], [0, 0, 1.0]])
-    for i in range(N):
-        mask = op[i] > THR
-        Xw = pts[i].reshape(-1, 3)[mask.ravel()]
-        px = grid[mask.ravel()]
-        cv2.setRNGSeed(SEED)
-        c2w_up = up.fast_pnp(pts[i], mask, focal=f_up, niter_PnP=10,
-                             k_dtype=np.float64)[1]      # matched precision
-        cv2.setRNGSeed(SEED)
-        c2w_ours, _ = pnp.solve_pnp_cv2(Xw, px, K, iters=10, reproj=5.0)
-        dR = rot_angle(c2w_ours[:3, :3] @ c2w_up[:3, :3].T)
-        dt = float(np.linalg.norm(c2w_ours[:3, 3] - c2w_up[:3, 3]))
-        check(f"view{i} cam2world matches upstream", dR < 1e-3 and dt < 1e-4,
-              f"dR {dR:.2e} deg  dt {dt:.2e}")
-        # diagnostic: upstream's native float32 K (precision-only effect)
-        cv2.setRNGSeed(SEED)
-        c2w_f32 = up.fast_pnp(pts[i], mask, focal=f_up, niter_PnP=10)[1]
-        dR32 = rot_angle(c2w_ours[:3, :3] @ c2w_f32[:3, :3].T)
-        print(f"        (upstream native float32 K: dR {dR32:.2e} deg -- "
-              f"RANSAC inlier-boundary precision, not orchestration)")
-
-    # ---- Level 3: full pipeline ----
-    print("\n[3] full pipeline  pnp.estimate_poses  vs  upstream recipe")
-    up_c2ws, f_up2 = upstream_estimate_poses(pts, op, H, W, pnp_iter=10)
-    cv2.setRNGSeed(SEED)
-    out = pnp.estimate_poses(pts, op, opacity_threshold=THR, backend="cv2",
-                             normalize=False, cam_frame_points=pts)
-    our_c2ws, f_our2 = out["cam2world"], out["focal"]
-    relf = abs(f_our2 - f_up2) / f_up2
-    check("pipeline focal matches", relf < 1e-6, f"ours {f_our2:.4f} vs up {f_up2:.4f}")
-    # anchor-invariant relative pose view0->view1 (handles any re-anchor convention)
-    def rel_pose(c):
-        return np.linalg.inv(c[0]) @ c[1]
-    Rup, Rours = rel_pose(up_c2ws), rel_pose(our_c2ws)
-    dR = rot_angle(Rours[:3, :3] @ Rup[:3, :3].T)
-    # compare translation DIRECTION (scale/anchor independent)
-    tup, tour = Rup[:3, 3], Rours[:3, 3]
-    cos_t = float(tup @ tour / (np.linalg.norm(tup) * np.linalg.norm(tour) + 1e-12))
-    check("relative-pose rotation matches", dR < 1e-2, f"dR {dR:.2e} deg")
-    check("relative-pose translation direction matches", cos_t > 1 - 1e-6,
-          f"cos {cos_t:.8f}")
-
-    # ---- scene baseline rescale (runner), reference only ----
-    baseline = float(np.linalg.norm(up_c2ws[1][:3, 3] - up_c2ws[0][:3, 3])) + 1e-2
-    print(f"\n[ref] scene baseline rescale: 1/baseline = {1.0/baseline:.4f} "
-          f"(runner applies this to camera centers; our estimate_poses returns raw "
-          f"poses, normalize=False)")
-
-    print()
-    if _fails:
-        print(f"UPSTREAM PARITY FAILED ({len(_fails)}): " + ", ".join(_fails))
-        raise SystemExit(1)
-    print("UPSTREAM PARITY OK")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/pose/empirical.py b/pose/empirical.py
deleted file mode 100644
index bd91498..0000000
--- a/pose/empirical.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""Empirical cross-run test on REAL engine output.
-
-A photo that appears in two separate inferences (an overlapping pair sharing it)
-is reconstructed twice, in two different coordinate systems. Its per-pixel
-gaussian centers from the two runs are the SAME physical points described two
-ways -- exact correspondences. Fitting a similarity between them and reading the
-residual ladder answers the open question on real data: is the cross-run mismatch
-a clean uniform-scale similarity, or does it carry nonlinear warp?
-
-Usage:
-    nix develop -c python3 pose/empirical.py A.f32 viewA B.f32 viewB [label]
-
-A.f32/B.f32 are raw engine outputs [N,H,W,23]; viewA/viewB pick the shared photo
-within each (e.g. it's view 1 of pair (000,004) and view 0 of pair (004,013)).
-"""
-import sys
-import numpy as np
-import align
-
-C = 23
-OPACITY = 15
-
-
-def load_view(path, view, H=512, W=512):
-    a = np.fromfile(path, np.float32)
-    N = a.size // (H * W * C)
-    a = a.reshape(N, H, W, C)
-    g = a[view]
-    return g[..., 0:3].reshape(-1, 3).astype(np.float64), g[..., OPACITY].reshape(-1)
-
-
-def main():
-    pA, vA, pB, vB = sys.argv[1], int(sys.argv[2]), sys.argv[3], int(sys.argv[4])
-    label = sys.argv[5] if len(sys.argv) > 5 else ""
-    XA, oA = load_view(pA, vA)
-    XB, oB = load_view(pB, vB)
-
-    print(f"=== {label} ===")
-    print(f"opacity A: [{oA.min():.3f},{oA.max():.3f}] mean {oA.mean():.3f} | "
-          f"B: [{oB.min():.3f},{oB.max():.3f}] mean {oB.mean():.3f}")
-
-    thr = 0.05
-    mask = (oA > thr) & (oB > thr)
-    A, B = XA[mask], XB[mask]
-    print(f"valid (opacity>{thr} in both): {mask.sum()}/{mask.size} "
-          f"({100*mask.mean():.1f}%)")
-    if mask.sum() < 100:
-        print("too few valid correspondences"); return
-
-    scene = align._rms(A - A.mean(0))
-    # robust fit B -> A; threshold = 2% of scene extent
-    s, R, t, inl = align.fit_similarity_ransac(B, A, thresh=0.02 * scene, iters=400)
-    print(f"scene extent (rms): {scene:.4f}")
-    print(f"robust similarity B->A:  scale={s:.4f}  inliers={inl.sum()}/{len(A)} "
-          f"({100*inl.mean():.1f}%)")
-
-    # how cross-run-consistent is the WHOLE valid set under the robust fit?
-    res_all = np.linalg.norm(align.apply_sim((s, R, t), B) - A, axis=1)
-    print("cross-run consistency (all valid pixels, under robust similarity):")
-    for q in (0.01, 0.02, 0.05, 0.10):
-        print(f"   within {100*q:4.0f}% of scene extent: "
-              f"{100*(res_all < q*scene).mean():5.1f}% of pixels")
-
-    # residual ladder on the inlier (well-corresponded) surface
-    L = align.diagnose(B[inl], A[inl])
-    sc = L["scene"]
-    print("residual ladder (inliers, RMS  |  % of scene extent):")
-    for k in ("rigid", "similarity", "affine"):
-        print(f"   {k:11s} {L[k]:.5f}   {100*L[k]/sc:6.2f}%")
-    print(f"   recovered scale  {L['scale']:.4f}")
-    print(f"   depth_corr       {L['depth_corr']:+.3f}  (structured={L['structured']})")
-    print(f"   VERDICT          {L['verdict']}")
-
-    # interpretation
-    drop_scale = (L["rigid"] - L["similarity"]) / sc
-    drop_aff = (L["similarity"] - L["affine"]) / sc
-    print(f"interpretation: rigid->similarity drop {100*drop_scale:.2f}% "
-          f"(=how much was uniform scale); similarity->affine drop "
-          f"{100*drop_aff:.2f}% (=non-uniform warp).")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/pose/find_loop.py b/pose/find_loop.py
deleted file mode 100644
index ab732cf..0000000
--- a/pose/find_loop.py
+++ /dev/null
@@ -1,70 +0,0 @@
-"""Search RealEstate10K poses for a LOOPING clip -- camera wanders away from its
-start then returns near it (with a similar viewing direction, so the first and last
-frames overlap). That overlap is the loop-closure constraint we need to test drift
-correction. Pose-only: no video downloads until a candidate is chosen.
-
-A good loop: large max distance from the start (real motion) but small end-to-start
-distance (returns), and small first<->last rotation (same view => the closing pair
-(f_0, f_n) is a valid stereo pair the engine can reconstruct).
-
-    nix develop -c python3 pose/find_loop.py [split] [limit] [--live N]
-"""
-import sys
-import numpy as np
-
-import re10k_control as rc
-import re10k_fetch as rf
-
-
-def score(fr):
-    C = np.array([f["c2w"][:3, 3] for f in fr])
-    d = np.linalg.norm(C - C[0], axis=1)
-    maxd = d.max()
-    if maxd < 1e-2:
-        return None
-    rotend = rc.rot_deg(rc.rel_pose(fr[0]["c2w"], fr[-1]["c2w"]))
-    return {"n": len(fr), "maxd": float(maxd), "endd": float(d[-1]),
-            "return_ratio": float(d[-1] / maxd), "rotend": rotend,
-            "loopiness": float(maxd / (d[-1] + 0.05 * maxd))}
-
-
-def search(split="test", limit=None, min_frames=80,
-           max_return_ratio=0.25, max_rotend=45.0):
-    rows = []
-    clips = rc.list_clips(split)
-    if limit:
-        clips = clips[:limit]
-    for cp in clips:
-        try:
-            url, fr = rc.parse_clip(cp)
-        except Exception:
-            continue
-        if len(fr) < min_frames:
-            continue
-        s = score(fr)
-        if not s or s["return_ratio"] > max_return_ratio or s["rotend"] > max_rotend:
-            continue
-        s["clip"], s["id"] = cp, rc.clip_video_id(url)
-        rows.append(s)
-    rows.sort(key=lambda r: r["loopiness"], reverse=True)
-    return rows
-
-
-if __name__ == "__main__":
-    split = sys.argv[1] if len(sys.argv) > 1 and not sys.argv[1].startswith("-") else "test"
-    limit = int(sys.argv[2]) if len(sys.argv) > 2 and not sys.argv[2].startswith("-") else None
-    n_live = int(sys.argv[sys.argv.index("--live") + 1]) if "--live" in sys.argv else 0
-    rows = search(split, limit)
-    print(f"{len(rows)} loop candidates (split={split}, limit={limit})")
-    print(f"  {'loopiness':>9} {'n':>4} {'maxd':>6} {'ret%':>5} {'rotEnd':>6}  id")
-    shown = 0
-    for r in rows[:30]:
-        live = ""
-        if shown < n_live:
-            try:
-                live = "  LIVE" if rf.is_live(rc.parse_clip(r["clip"])[0]) else "  dead"
-            except Exception:
-                live = "  ?"
-            shown += 1
-        print(f"  {r['loopiness']:>9.1f} {r['n']:>4} {r['maxd']:>6.2f} "
-              f"{100*r['return_ratio']:>5.0f} {r['rotend']:>6.1f}  {r['id']}{live}")
diff --git a/pose/focal.py b/pose/focal.py
deleted file mode 100644
index 1e2ac27..0000000
--- a/pose/focal.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""Shared focal-length estimation (Weiszfeld), faithful to FreeSplatter.
-
-Upstream (freesplatter/models/model.py::estimate_focal, via DUSt3R) assumes a
-single shared focal across views, centered principal point, square pixels. Per
-view it solves
-
-    f* = argmin_f  sum_ij || (i',j') - f * (X_ij0/X_ij2, X_ij1/X_ij2) ||
-
-where (i',j') are pixel coords relative to the principal point and X is that
-view's camera-frame point map (gaussian xyz). The robust minimizer is found by
-the Weiszfeld algorithm (iteratively reweighted least squares, ~L1). Per-view
-focals are then averaged into one shared value.
-
-Pure numpy -- no cv2 needed for this stage.
-"""
-import numpy as np
-
-
-def estimate_focal(pts3d_cam, pixels, pp, weiszfeld_iters=10):
-    """Weiszfeld focal from camera-frame points and their pixels.
-
-    pts3d_cam : (...,3) points in the view's OWN camera frame (Z>0 in front).
-    pixels    : (...,2) pixel coordinates, same leading shape.
-    pp        : (2,) principal point (image center).
-    """
-    X = np.asarray(pts3d_cam, float).reshape(-1, 3)
-    P = np.asarray(pixels, float).reshape(-1, 2) - np.asarray(pp, float)
-    # ray directions; Z==0 -> 0 exactly as upstream estimate_focal (nan_to_num)
-    u = np.nan_to_num(X[:, :2] / X[:, 2:3], posinf=0.0, neginf=0.0)   # (N,2)
-
-    pu = (P * u).sum(1)                            # per-point p . u
-    uu = (u * u).sum(1)                            # per-point ||u||^2
-    f = pu.sum() / uu.sum()                        # least-squares init
-    for _ in range(weiszfeld_iters):
-        r = np.linalg.norm(P - f * u, axis=1)
-        w = 1.0 / np.maximum(r, 1e-8)
-        f = (w * pu).sum() / (w * uu).sum()
-    return float(f)
-
-
-def estimate_shared_focal(views, pp, weiszfeld_iters=10):
-    """Average per-view focals into one shared focal.
-
-    views: list of (pts3d_cam, pixels) pairs. Mirrors upstream's
-    `focals = focals.mean().repeat(N)`.
-    """
-    fs = [estimate_focal(p3, px, pp, weiszfeld_iters) for (p3, px) in views]
-    return float(np.mean(fs)), fs
diff --git a/pose/fuse.py b/pose/fuse.py
deleted file mode 100644
index df7b1f7..0000000
--- a/pose/fuse.py
+++ /dev/null
@@ -1,134 +0,0 @@
-"""Consensus fusion -- remove the partner-dependent floaters (the edge noise).
-
-Each per-pixel gaussian is conditioned on its partner view, so occlusion-edge /
-depth-ambiguous points land differently per run = floaters. A REAL surface point,
-though, is reconstructed by several overlapping frames and they agree in the global
-frame. So: voxelize the accumulated cloud at the consistency scale and keep only
-voxels corroborated by >= K DISTINCT frames, averaging the agreeing predictions
-(which also denoises the surface). Singletons / scattered floaters are dropped.
-
-This is the direct answer to "does accumulation remove the edge noise?" -- yes, once
-you gate on cross-frame consensus. Reuses cached engine dumps from accumulate.py
-(no new inference).
-
-    nix develop -c python3 pose/fuse.py CLIP FRAME_DIR --cache DIR \\
-        --stride 20 --count 13 --voxel 0.02 --k 2 --ply-raw raw.ply --ply-fused fused.ply
-"""
-import os
-import argparse
-import numpy as np
-
-import align
-
-THR = 0.05
-C0 = 0.28209479177387814
-
-
-def load(dump):
-    a = np.fromfile(dump, np.float32).reshape(2, 512, 512, 23)
-    pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)]
-    op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)]
-    rgb = [np.clip(a[v, ..., 3:6] * C0 + 0.5, 0, 1) for v in (0, 1)]
-    return pts, op, rgb
-
-
-def fit_sim(X, Y):
-    scene = align._rms(Y - Y.mean(0))
-    s, R, t, _ = align.fit_similarity_ransac(X, Y, thresh=0.02 * scene, iters=300)
-    return (s, R, t)
-
-
-def write_ply(path, xyz, rgb):
-    n = len(xyz)
-    hdr = ("ply\nformat binary_little_endian 1.0\n"
-           f"element vertex {n}\nproperty float x\nproperty float y\nproperty float z\n"
-           "property uchar red\nproperty uchar green\nproperty uchar blue\nend_header\n")
-    rec = np.zeros(n, dtype=[("x", "<f4"), ("y", "<f4"), ("z", "<f4"),
-                             ("r", "u1"), ("g", "u1"), ("b", "u1")])
-    rec["x"], rec["y"], rec["z"] = xyz[:, 0], xyz[:, 1], xyz[:, 2]
-    c = (np.clip(rgb, 0, 1) * 255).astype(np.uint8)
-    rec["r"], rec["g"], rec["b"] = c[:, 0], c[:, 1], c[:, 2]
-    with open(path, "wb") as f:
-        f.write(hdr.encode()); f.write(rec.tobytes())
-
-
-def main():
-    ap = argparse.ArgumentParser()
-    ap.add_argument("clip"); ap.add_argument("frame_dir")
-    ap.add_argument("--start", type=int, default=0)
-    ap.add_argument("--stride", type=int, default=20)
-    ap.add_argument("--count", type=int, default=13)
-    ap.add_argument("--cache", default=os.environ.get("SCRATCH", "/tmp"))
-    ap.add_argument("--voxel", type=float, default=0.02)     # fraction of scene extent
-    ap.add_argument("--k", type=int, default=2)              # min distinct frames
-    ap.add_argument("--ply-raw", default=None)
-    ap.add_argument("--ply-fused", default=None)       # averaged consensus (denoised)
-    ap.add_argument("--ply-kept", default=None)        # raw points in consensus voxels
-    ap.add_argument("--ply-floaters", default=None)    # raw points dropped (support < K)
-    a = ap.parse_args()
-
-    idx = [a.start + k * a.stride for k in range(a.count + 1)]
-    runs = []
-    for k in range(a.count):
-        dump = os.path.join(a.cache, f"pair_{idx[k]}_{idx[k+1]}.f32")
-        pts, op, rgb = load(dump)
-        runs.append({"pts": pts, "op": op, "rgb": rgb})
-
-    # chain Sim(3) -> global (same as accumulate.py)
-    T = [align.identity()]
-    for k in range(1, a.count):
-        XA = runs[k]["pts"][0].reshape(-1, 3); oA = runs[k]["op"][0].reshape(-1)
-        XB = runs[k-1]["pts"][1].reshape(-1, 3); oB = runs[k-1]["op"][1].reshape(-1)
-        m = (oA > THR) & (oB > THR)
-        T.append(align.compose(T[k-1], fit_sim(XA[m], XB[m])))
-
-    # accumulate every frame's gaussians, tagged with a source-frame id
-    XYZ, RGB, FR = [], [], []
-    def add(run, view, Tk, fid):
-        op = runs[run]["op"][view].reshape(-1)
-        keep = op > THR
-        XYZ.append(align.apply_sim(Tk, runs[run]["pts"][view].reshape(-1, 3)[keep]))
-        RGB.append(runs[run]["rgb"][view].reshape(-1, 3)[keep])
-        FR.append(np.full(int(keep.sum()), fid, np.int64))
-    add(0, 0, T[0], 0)
-    for k in range(a.count):
-        add(k, 1, T[k], k + 1)
-    XYZ = np.concatenate(XYZ); RGB = np.concatenate(RGB); FR = np.concatenate(FR)
-
-    # voxel multi-frame consensus
-    ext = align._rms(XYZ - XYZ.mean(0))
-    v = a.voxel * ext
-    ijk = np.floor((XYZ - XYZ.min(0)) / v).astype(np.int64)
-    uniq, inv = np.unique(ijk, axis=0, return_inverse=True)
-    G = len(uniq)
-    # distinct source-frames per voxel
-    pairs = np.unique(np.stack([inv, FR], 1), axis=0)
-    support = np.bincount(pairs[:, 0], minlength=G)
-    cnt = np.bincount(inv, minlength=G).astype(float)
-    mean = np.stack([np.bincount(inv, XYZ[:, c], minlength=G) / cnt for c in range(3)], 1)
-    mrgb = np.stack([np.bincount(inv, RGB[:, c], minlength=G) / cnt for c in range(3)], 1)
-    keep = support >= a.k
-
-    print(f"=== consensus fusion  voxel={a.voxel:.3f}*extent  K>={a.k} ===")
-    print(f"raw points: {len(XYZ):,}   voxels: {G:,}")
-    hist = np.bincount(support)[:6]
-    print("voxels by distinct-frame support: " +
-          "  ".join(f"{i}f:{hist[i] if i < len(hist) else 0:,}" for i in range(1, 6)) +
-          f"  6+f:{(support >= 6).sum():,}")
-    print(f"FUSED points (>= {a.k} frames): {keep.sum():,}  "
-          f"({100*keep.sum()/G:.0f}% of voxels, {100*(1-keep.sum()/len(XYZ)):.0f}% point reduction)")
-    pkeep = keep[inv]                                  # per-point consensus mask
-    print(f"per-point: kept {pkeep.sum():,}  floaters dropped {(~pkeep).sum():,} "
-          f"({100*(~pkeep).mean():.0f}%)")
-    if a.ply_raw:
-        write_ply(a.ply_raw, XYZ, RGB); print(f"   raw      -> {a.ply_raw}")
-    if a.ply_fused:
-        write_ply(a.ply_fused, mean[keep], mrgb[keep]); print(f"   fused    -> {a.ply_fused}")
-    if a.ply_kept:
-        write_ply(a.ply_kept, XYZ[pkeep], RGB[pkeep]); print(f"   kept     -> {a.ply_kept}")
-    if a.ply_floaters:
-        write_ply(a.ply_floaters, XYZ[~pkeep], RGB[~pkeep]); print(f"   floaters -> {a.ply_floaters}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/pose/loop_closure.py b/pose/loop_closure.py
deleted file mode 100644
index 961af61..0000000
--- a/pose/loop_closure.py
+++ /dev/null
@@ -1,150 +0,0 @@
-"""Loop-closure drift correction on a LOOPING re10k clip (see find_loop.py).
-
-accumulate.py chains Sim(3) open-loop and the drift grows to the endpoint (~11%
-ATE). When the camera RETURNS near its start, frame 0 and frame n overlap -- a
-loop-closure constraint. Here we:
-  1. chain open-loop  -> per-frame global poses P_k, recovered trajectory, ATE vs GT.
-  2. measure the loop -> run the closing pair (f_0, f_n), scale-aligned to run 0,
-     giving f_n's pose via the loop (P_n_loop), INDEPENDENT of the chain.
-  3. the discrepancy D (P_n_loop = D . P_n) IS the accumulated drift (scale/rot/trans).
-  4. distribute D over the chain (even Sim(3) relaxation, D^(k/n)) and show the
-     trajectory ATE and endpoint error collapse.
-
-Closes the drift story accumulate.py left open; exercises align's chaining on a
-real loop. Poses are 4x4 similarity matrices [[sR,t],[0,1]] (compose = matmul).
-
-    FS_DEVICE=cpu nix develop -c python3 pose/loop_closure.py CLIP FRAME_DIR \\
-        --stride 20 --count 13 --cache DIR
-"""
-import os
-import argparse
-import numpy as np
-
-import re10k_control as rc
-import re10k_fetch as rf
-import re10k_experiment as ex
-import pnp
-import align
-
-THR = 0.05
-simmat = align.sim_matrix          # Sim(3) 4x4
-matfrac = align.sim_frac_power     # M^f for even loop-closure relaxation
-
-
-def fit_sim_M(X, Y):
-    scene = align._rms(Y - Y.mean(0))
-    s, R, t, inl = align.fit_similarity_ransac(X, Y, thresh=0.02 * scene, iters=300)
-    return simmat(s, R, t), inl
-
-
-def decompose(M):
-    A = M[:3, :3]
-    s = float(np.linalg.det(A) ** (1.0 / 3.0))
-    R = A / s
-    ang = float(np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1))))
-    return s, ang, M[:3, 3]
-
-
-def load(dump):
-    a = np.fromfile(dump, np.float32).reshape(2, 512, 512, 23)
-    pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)]
-    op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)]
-    return pts, op
-
-
-def ate(C, gt):
-    sA, RA, tA = align.fit_similarity(C, gt)
-    Cal = align.apply_sim((sA, RA, tA), C)
-    return align._rms(Cal - gt), np.linalg.norm(Cal - gt, axis=1)
-
-
-def main():
-    ap = argparse.ArgumentParser()
-    ap.add_argument("clip"); ap.add_argument("frame_dir")
-    ap.add_argument("--start", type=int, default=0)
-    ap.add_argument("--stride", type=int, default=20)
-    ap.add_argument("--count", type=int, default=13)
-    ap.add_argument("--cache", default=os.environ.get("SCRATCH", "/tmp"))
-    a = ap.parse_args()
-
-    url, frames = rc.parse_clip(a.clip)
-    idx = [a.start + k * a.stride for k in range(a.count + 1)]
-    os.makedirs(a.cache, exist_ok=True)
-    fp = lambda i: os.path.join(a.frame_dir, f"f{i:04d}.png")
-    if any(not os.path.exists(fp(i)) for i in idx):
-        rf.fetch(a.clip, a.frame_dir, indices=idx)
-    print(f"=== loop closure: {os.path.basename(a.clip)}  frames {idx[0]}..{idx[-1]} "
-          f"stride {a.stride}  device={ex.DEVICE} ===\n{url}")
-
-    # --- run / load chain pairs ---
-    runs = []
-    for k in range(a.count):
-        dump = os.path.join(a.cache, f"pair_{idx[k]}_{idx[k+1]}.f32")
-        if not os.path.exists(dump):
-            ex.run_engine(fp(idx[k]), fp(idx[k + 1]), dump)
-        pts, op = load(dump)
-        c2w = pnp.estimate_poses(pts, op, normalize=False)["cam2world"]
-        runs.append({"pts": pts, "op": op, "c2w": c2w})
-
-    # --- open-loop chain: T[k] maps run k -> global ---
-    T = [np.eye(4)]; chain_diag = []
-    for k in range(1, a.count):
-        XA = runs[k]["pts"][0].reshape(-1, 3); oA = runs[k]["op"][0].reshape(-1)
-        XB = runs[k-1]["pts"][1].reshape(-1, 3); oB = runs[k-1]["op"][1].reshape(-1)
-        m = (oA > THR) & (oB > THR)
-        S, inlk = fit_sim_M(XA[m], XB[m]); T.append(T[k-1] @ S)
-        sk, _, _ = decompose(S)
-        chain_diag.append((sk, 100 * inlk.mean(), 100 * m.mean()))
-    P = [T[k] @ runs[k]["c2w"][0] for k in range(a.count)]
-    P.append(T[a.count-1] @ runs[a.count-1]["c2w"][1])      # f_n from last run's view1
-    centers = np.array([p[:3, 3] for p in P])
-    gt = np.array([frames[i]["c2w"][:3, 3] for i in idx])
-    ext = align._rms(gt - gt.mean(0))
-
-    # --- loop measurement: closing pair (f_0, f_n) ---
-    cdump = os.path.join(a.cache, f"close_{idx[0]}_{idx[-1]}.f32")
-    if not os.path.exists(cdump):
-        ex.run_engine(fp(idx[0]), fp(idx[-1]), cdump)
-    cpts, cop = load(cdump)
-    cc2w = pnp.estimate_poses(cpts, cop, normalize=False)["cam2world"]
-    # scale/pose-align the closing run's f_0 (view0) to run 0's f_0 (view0, == global)
-    XA = cpts[0].reshape(-1, 3); oA = cop[0].reshape(-1)
-    XB = runs[0]["pts"][0].reshape(-1, 3); oB = runs[0]["op"][0].reshape(-1)
-    m = (oA > THR) & (oB > THR)
-    close_valid = 100 * m.mean()
-    G, inl = fit_sim_M(XA[m], XB[m])
-    P_n_loop = G @ cc2w[1]                                  # f_n via the loop, in global
-
-    # --- drift D (P_n_loop = D . P_n) and even distribution ---
-    D = P_n_loop @ np.linalg.inv(P[-1])
-    s, ang, t = decompose(D)
-    n = a.count
-    centers_c = np.array([(matfrac(D, k / n) @ np.append(centers[k], 1.0))[:3]
-                          for k in range(len(centers))])
-
-    # --- report ---
-    ate0, perr0 = ate(centers, gt)
-    ate1, perr1 = ate(centers_c, gt)
-    # diagnostics: is the open-loop chain a clean drift, and does it even return?
-    sA, RA, tA = align.fit_similarity(centers, gt)
-    pnl_al = align.apply_sim((sA, RA, tA), P_n_loop[:3, 3])
-    print("\nchain per-link scale/inlier%/valid%: " +
-          "  ".join(f"{c[0]:.2f}/{c[1]:.0f}/{c[2]:.0f}" for c in chain_diag))
-    print(f"return-to-start dist: recovered {np.linalg.norm(centers[-1]-centers[0]):.3f}  "
-          f"GT {np.linalg.norm(gt[-1]-gt[0]):.3f}  (GT extent {ext:.3f})")
-    print(f"loop measurement P_n_loop endpoint err vs GT: "
-          f"{100*np.linalg.norm(pnl_al-gt[-1])/ext:.0f}% of extent")
-    print(f"\nclosing pair (f_{idx[0]}, f_{idx[-1]}): valid%={close_valid:.1f}  "
-          f"inliers={100*inl.mean():.0f}%")
-    print(f"LOOP-CLOSURE ERROR (open-loop drift at the loop point):")
-    print(f"   scale {s:.3f} ({100*abs(np.log(s)):.1f}% log)  rotation {ang:.1f} deg  "
-          f"translation {np.linalg.norm(t):.3f} ({100*np.linalg.norm(t)/ext:.0f}% of extent)")
-    print(f"\ntrajectory ATE vs GT (after global Sim(3) align):")
-    print(f"   open-loop : {100*ate0/ext:5.1f}% of extent   endpoint {100*perr0[-1]/ext:.0f}%")
-    print(f"   closed    : {100*ate1/ext:5.1f}% of extent   endpoint {100*perr1[-1]/ext:.0f}%")
-    print(f"   per-frame % (open ): " + " ".join(f"{100*e/ext:.0f}" for e in perr0))
-    print(f"   per-frame % (closed): " + " ".join(f"{100*e/ext:.0f}" for e in perr1))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/pose/pnp.py b/pose/pnp.py
deleted file mode 100644
index 85f3dd8..0000000
--- a/pose/pnp.py
+++ /dev/null
@@ -1,245 +0,0 @@
-"""PnP camera recovery from the engine's per-pixel gaussians.
-
-Mirrors FreeSplatter's estimate_poses (freesplatter/models/model.py, which calls
-the DUSt3R-derived fast_pnp in utils/recon_util.py):
-
-  * correspondences : every pixel -> its predicted 3D gaussian center X (channels
-    0:3) <-> its 2D pixel coordinate.
-  * validity mask   : opacity > 0.05 (drops floaters before PnP).
-  * focal           : Weiszfeld per view, averaged into one shared focal (focal.py).
-  * pose            : cv2.solvePnPRansac(reprojectionError=5, SOLVEPNP_SQPNP,
-                      iterationsCount=10); world2cam, then cam2world = inv(.);
-                      OpenCV convention; view 0 comes out ~identity by construction
-                      (gaussians are predicted in view 0's frame), no re-anchor.
-  * scene scale     : upstream's `estimate_poses` does NOT normalize; the runner
-                      (run_freesplatter_scene) rescales camera centers so the
-                      view0->view1 baseline is 1.0 (s = 1/(||t1-t0||+1e-2)). See
-                      `rescale_to_baseline`. (An earlier 1/mean||pts|| guess was
-                      wrong -- verified against .cache/upstream/{model,runner}.py.)
-
-Two solver backends, selected by `backend`:
-  - "cv2"  : the exact upstream call (needs opencv; available in the CUDA
-             container or after adding opencv to the nix shell). Use this for
-             numerical parity against the original.
-  - "numpy": a dependency-free DLT + RANSAC reference solver, so the pipeline and
-             its golden tests run anywhere numpy is present. Verified against
-             analytic ground truth in test_pose.py.
-  - "auto" : cv2 if importable, else numpy.
-
-Gaussian channel layout (scene, 23 ch): xyz[0:3] SH[3:15] opacity[15] scale[16:19]
-rotation[19:23]; opacity is assumed already activated (sigmoid) in [0,1].
-"""
-import numpy as np
-
-OPACITY_CHANNEL = 15
-
-
-# --- numpy reference PnP (known intrinsics) --------------------------------
-
-def _dlt(Xw, xn):
-    """Direct Linear Transform for world2cam [R|t] from normalized image coords.
-
-    Xw : (N,3) world points. xn : (N,2) normalized coords ( K^-1 [u v 1] ).
-    Returns the raw 3x4 matrix p (defined up to sign/scale); decode with _decode.
-    """
-    N = len(Xw)
-    A = np.zeros((2 * N, 12))
-    Xh = np.hstack([Xw, np.ones((N, 1))])         # (N,4)
-    A[0::2, 0:4] = Xh
-    A[0::2, 8:12] = -xn[:, 0:1] * Xh
-    A[1::2, 4:8] = Xh
-    A[1::2, 8:12] = -xn[:, 1:2] * Xh
-    _, _, Vt = np.linalg.svd(A)
-    return Vt[-1].reshape(3, 4)
-
-
-def _decode(p, Xw, xn):
-    """Turn a DLT matrix into a proper (R,t) world2cam, resolving sign by
-    reprojection error and cheirality (points must sit in front of the camera)."""
-    best = None
-    for sgn in (1.0, -1.0):
-        M = sgn * p[:, :3]
-        tau = sgn * p[:, 3]
-        U, Sv, Vt = np.linalg.svd(M)
-        d = np.linalg.det(U @ Vt)
-        R = U @ np.diag([1, 1, d]) @ Vt           # nearest proper rotation
-        lam = Sv.mean()                            # M ~= lam * R
-        if lam < 1e-12:
-            continue
-        t = tau / lam
-        Xc = (R @ Xw.T).T + t
-        if np.any(Xc[:, 2] <= 0):
-            err = np.inf
-        else:
-            proj = Xc[:, :2] / Xc[:, 2:3]
-            err = float(np.sqrt(((proj - xn) ** 2).sum(1)).mean())
-        if best is None or err < best[0]:
-            best = (err, R, t)
-    if best is None:
-        raise np.linalg.LinAlgError("PnP decode failed (all-behind camera)")
-    return best[1], best[2], best[0]
-
-
-def solve_pnp_numpy(Xw, pixels, K, thresh_px=5.0, iters=100, seed=0):
-    """RANSAC DLT PnP. Returns (cam2world 4x4, inlier_mask)."""
-    Kinv = np.linalg.inv(K)
-    uv1 = np.hstack([pixels, np.ones((len(pixels), 1))])
-    xn = (Kinv @ uv1.T).T[:, :2]                  # normalized image coords
-    thresh = thresh_px / K[0, 0]                   # px tol -> normalized tol
-
-    rng = np.random.default_rng(seed)
-    n = len(Xw)
-    best = np.zeros(n, bool)
-    for _ in range(iters):
-        idx = rng.choice(n, 6, replace=False)
-        try:
-            R, t, _ = _decode(_dlt(Xw[idx], xn[idx]), Xw[idx], xn[idx])
-        except np.linalg.LinAlgError:
-            continue
-        Xc = (R @ Xw.T).T + t
-        good = Xc[:, 2] > 0
-        proj = np.full((n, 2), 1e9)
-        proj[good] = Xc[good, :2] / Xc[good, 2:3]
-        inl = np.linalg.norm(proj - xn, axis=1) < thresh
-        if inl.sum() > best.sum():
-            best = inl
-    if best.sum() < 6:
-        best = np.ones(n, bool)
-    R, t, _ = _decode(_dlt(Xw[best], xn[best]), Xw[best], xn[best])
-    world2cam = np.eye(4)
-    world2cam[:3, :3] = R
-    world2cam[:3, 3] = t
-    return np.linalg.inv(world2cam), best
-
-
-def solve_pnp_cv2(Xw, pixels, K, iters=20, reproj=5.0):
-    """Exact upstream call (needs cv2). Returns (cam2world, inlier_mask)."""
-    import cv2
-    ok, rvec, tvec, inliers = cv2.solvePnPRansac(
-        Xw.astype(np.float64), pixels.astype(np.float64), K.astype(np.float64),
-        None, iterationsCount=iters, reprojectionError=reproj,
-        flags=cv2.SOLVEPNP_SQPNP)
-    if not ok:
-        raise RuntimeError("cv2.solvePnPRansac failed")
-    R, _ = cv2.Rodrigues(rvec)
-    world2cam = np.eye(4)
-    world2cam[:3, :3] = R
-    world2cam[:3, 3] = tvec[:, 0]
-    mask = np.zeros(len(Xw), bool)
-    if inliers is not None:
-        mask[inliers[:, 0]] = True
-    return np.linalg.inv(world2cam), mask
-
-
-def _have_cv2():
-    try:
-        import cv2  # noqa: F401
-        return True
-    except Exception:
-        return False
-
-
-def solve_pnp(Xw, pixels, K, backend="auto", **kw):
-    if backend == "auto":
-        backend = "cv2" if _have_cv2() else "numpy"
-    if backend == "cv2":
-        return solve_pnp_cv2(Xw, pixels, K, **kw)
-    return solve_pnp_numpy(Xw, pixels, K, **kw)
-
-
-# --- orchestration ---------------------------------------------------------
-
-def pixel_grid(H, W):
-    """(H*W, 2) INTEGER pixel coords [x=col, y=row], row-major -- matches upstream
-    xy_grid / np.mgrid[:W,:H].T exactly (NO half-pixel offset). Row-major so it
-    lines up with a C-order (H,W,3) gaussian map flattened by reshape(-1,3)."""
-    xs, ys = np.meshgrid(np.arange(W), np.arange(H))   # both (H,W): xs=col, ys=row
-    return np.stack([xs.ravel(), ys.ravel()], axis=1).astype(float)
-
-
-def rescale_to_baseline(cam2world):
-    """Scene normalization from runner.run_freesplatter_scene: rescale camera
-    centers so the view0->view(last) baseline is 1.0. Returns (list, scale_factor).
-
-    Mirrors upstream exactly: baseline = ||t_last - t_0|| + 1e-2; s = 1/baseline;
-    every camera translation *= s (rotations untouched)."""
-    t0 = cam2world[0][:3, 3]
-    baseline = float(np.linalg.norm(cam2world[-1][:3, 3] - t0)) + 1e-2
-    s = 1.0 / baseline
-    out = []
-    for c in cam2world:
-        c = c.copy()
-        c[:3, 3] *= s
-        out.append(c)
-    return out, s
-
-
-def estimate_poses(points, opacities=None, focal=None, pp=None,
-                   opacity_threshold=0.05, backend="auto", cam_frame_points=None,
-                   use_first_focal=True, focal_use_opacity_mask=False,
-                   pnp_iter=10, normalize=False):
-    """Faithful port of FreeSplatter's scene estimate_poses (.cache/upstream/model.py)
-    plus the runner's optional baseline rescale.
-
-    Defaults mirror the scene recipe: focal from view 0 over ALL pixels
-    (use_first_focal=True, masks=None), per-view PnP masked by opacity>thr,
-    pnp_iter=10, cam2world=inv(world2cam). estimate_poses returns the RAW poses
-    (view 0 comes out ~identity because the gaussians live in view 0's frame -- no
-    explicit re-anchor, matching upstream). Set normalize=True for the runner's
-    1/baseline camera rescale (the scene viewer normalization).
-
-    points          : list of (H,W,3) gaussian-center maps (view 0 frame).
-    opacities       : list of (H,W) activated opacities for the PnP mask (or None).
-    cam_frame_points: per-view points in each view's OWN frame for focal. With
-                      use_first_focal only view 0 is used, and view 0's frame IS the
-                      world frame, so points[0] suffices; pass None to use `points`.
-    focal_use_opacity_mask: scene=False (all pixels, faithful); object would pass an
-                      alpha mask -- approximated here by the opacity mask if True.
-    Returns dict: cam2world (list of 4x4), focal, scale.
-    """
-    from focal import estimate_focal
-
-    backend_cv2 = backend == "cv2" or (backend == "auto" and _have_cv2())
-
-    N = len(points)
-    H, W = points[0].shape[:2]
-    if pp is None:
-        pp = np.array([W / 2.0, H / 2.0])
-    grid = pixel_grid(H, W)
-
-    masks = []
-    for i in range(N):
-        if opacities is None:
-            masks.append(np.ones(H * W, bool))
-        else:
-            masks.append(opacities[i].ravel() > opacity_threshold)
-
-    # focal: mirror estimate_focals -- view 0 only (use_first_focal), all pixels
-    # unless an opacity mask is explicitly requested.
-    if focal is None:
-        cfp = cam_frame_points if cam_frame_points is not None else points
-        n_focal = 1 if use_first_focal else N
-        fs = []
-        for i in range(n_focal):
-            m = masks[i] if (focal_use_opacity_mask and opacities is not None) \
-                else np.ones(H * W, bool)
-            fs.append(estimate_focal(cfp[i].reshape(-1, 3)[m], grid[m], pp))
-        focal = float(np.mean(fs))
-
-    K = np.array([[focal, 0, pp[0]], [0, focal, pp[1]], [0, 0, 1.0]])
-
-    cam2world = []
-    for i in range(N):
-        Xw = points[i].reshape(-1, 3)[masks[i]]
-        px = grid[masks[i]]
-        if backend_cv2:
-            c2w, _ = solve_pnp_cv2(Xw, px, K, iters=pnp_iter, reproj=5.0)
-        else:
-            c2w, _ = solve_pnp_numpy(Xw, px, K)
-        cam2world.append(c2w)
-
-    scale = 1.0
-    if normalize:
-        cam2world, scale = rescale_to_baseline(cam2world)
-
-    return {"cam2world": cam2world, "focal": focal, "scale": scale}
diff --git a/pose/re10k_control.py b/pose/re10k_control.py
deleted file mode 100644
index fe389ea..0000000
--- a/pose/re10k_control.py
+++ /dev/null
@@ -1,105 +0,0 @@
-"""RealEstate10K loader -- the dense, GT-posed, IN-DISTRIBUTION control.
-
-FreeSplatter-scene is trained on RealEstate10K-style wide-FOV walkthroughs, so
-(unlike Tanks-and-Temples, which is out of distribution) the model is known-good
-here. Each clip .txt is:
-
-    line 1 : YouTube URL
-    line k : timestamp(us)  fx fy cx cy  0 0  m00 m01 m02 m03 m10 ... m23
-             ^int            ^normalized intrinsics  ^3x4 world->camera, row-major
-
-Intrinsics are resolution-independent (fx normalized by width, fy by height).
-Frames come from the YouTube video at the given timestamps (yt-dlp + ffmpeg).
-
-Camera convention: the pose is world->camera extrinsic P=[R|t] (KP maps a world
-point to the image), OpenCV-style. c2w = inv([[R,t],[0,0,0,1]]). We VALIDATE this
-convention empirically by checking that GT relative rotation agrees with the pose
-FreeSplatter recovers on the same pair (the model is accurate in-distribution).
-
-GT focal after our 512x512 preprocessing (center-crop to height, resize): because
-fy is normalized by height and we crop to the full height then scale to 512,
-    focal_512 = fy * 512    (independent of source resolution).
-
-Pure numpy for parsing; frame extraction shells out to yt-dlp/ffmpeg.
-"""
-import os
-import subprocess
-import numpy as np
-
-HERE = os.path.dirname(os.path.abspath(__file__))
-RE10K = os.environ.get("RE10K_ROOT", os.path.join(HERE, "..", ".cache", "re10k", "RealEstate10K"))
-
-
-def parse_clip(path):
-    """Return (url, frames). frames: list of dicts {ts, fx,fy,cx,cy, w2c(3x4), c2w(4x4)}."""
-    with open(path) as f:
-        lines = [ln.strip() for ln in f if ln.strip()]
-    url = lines[0]
-    frames = []
-    for ln in lines[1:]:
-        v = ln.split()
-        ts = int(v[0])
-        fx, fy, cx, cy = (float(x) for x in v[1:5])
-        # v[5], v[6] are zeros (distortion); v[7:19] are the 3x4 world->cam
-        P = np.array([float(x) for x in v[7:19]], float).reshape(3, 4)
-        w2c = np.eye(4)
-        w2c[:3, :4] = P
-        c2w = np.linalg.inv(w2c)
-        frames.append({"ts": ts, "fx": fx, "fy": fy, "cx": cx, "cy": cy,
-                       "w2c": w2c, "c2w": c2w})
-    return url, frames
-
-
-def gt_focal_512(frame):
-    """GT focal in the engine's 512x512 input (center-crop to height + resize)."""
-    return frame["fy"] * 512.0
-
-
-def rel_pose(c2w_a, c2w_b):
-    return np.linalg.inv(c2w_a) @ c2w_b
-
-
-def rot_deg(R):
-    return float(np.degrees(np.arccos(np.clip((np.trace(R[:3, :3]) - 1) / 2, -1, 1))))
-
-
-def list_clips(split="test"):
-    d = os.path.join(RE10K, split)
-    return sorted(os.path.join(d, f) for f in os.listdir(d) if f.endswith(".txt"))
-
-
-def clip_video_id(url):
-    """YouTube id from the clip header URL."""
-    for key in ("watch?v=", "youtu.be/", "/embed/"):
-        if key in url:
-            return url.split(key)[1][:11]
-    return url.rsplit("/", 1)[-1][:11]
-
-
-def baseline_report(clip_path):
-    url, fr = parse_clip(clip_path)
-    n = len(fr)
-    cen = np.array([f["c2w"][:3, 3] for f in fr])
-    span = float(np.linalg.norm(cen - cen.mean(0), axis=1).mean())
-    print(f"=== {os.path.basename(clip_path)}  ({n} frames)  {url} ===")
-    print(f"video id: {clip_video_id(url)}   duration: "
-          f"{(fr[-1]['ts']-fr[0]['ts'])/1e6:.1f}s   GT focal(512)~{gt_focal_512(fr[0]):.1f}")
-    print(f"camera-center spread (mean dist to centroid): {span:.4f}")
-    print(f"  {'stride':>6} {'pairs':>6} {'rot(deg) med':>13} {'p10..p90':>16} "
-          f"{'baseline med':>13}")
-    for s in (1, 2, 5, 10, 20, 40):
-        rots, bl = [], []
-        for i in range(0, n - s):
-            T = rel_pose(fr[i]["c2w"], fr[i + s]["c2w"])
-            rots.append(rot_deg(T)); bl.append(np.linalg.norm(T[:3, 3]))
-        if not rots:
-            continue
-        rots, bl = np.array(rots), np.array(bl)
-        print(f"  {s:>6} {len(rots):>6} {np.median(rots):>13.2f} "
-              f"{np.percentile(rots,10):>7.2f}..{np.percentile(rots,90):<7.2f} "
-              f"{np.median(bl):>13.4f}")
-
-
-if __name__ == "__main__":
-    import sys
-    baseline_report(sys.argv[1])
diff --git a/pose/re10k_crossrun.py b/pose/re10k_crossrun.py
deleted file mode 100644
index 422c1d5..0000000
--- a/pose/re10k_crossrun.py
+++ /dev/null
@@ -1,69 +0,0 @@
-"""Cross-run consistency on IN-DISTRIBUTION re10k -- the accumulation question.
-
-A shared frame m is reconstructed by two overlapping pairs:
-    run A = (m-s, m)  -> frame m is view 1 (gaussians in frame (m-s)'s world)
-    run B = (m, m+s)  -> frame m is view 0 (gaussians in frame m's world)
-Frame m's per-pixel gaussian centers from A and B are the SAME physical points in
-two coordinate systems -- exact correspondences. Fit a robust similarity B->A and
-read the residual ladder + how many pixels actually agree. High agreement => clean
-accumulation; low => floaters dominate and we need consensus fusion.
-
-This mirrors empirical.py but on data the model handles well (re10k), and sweeps
-the baseline stride s so we can see whether a sweet spot maximizes consistency.
-
-    FS_DEVICE=cpu nix develop -c python3 pose/re10k_crossrun.py CLIP.txt FRAME_DIR 120 20,40,80
-"""
-import os
-import sys
-import numpy as np
-
-import re10k_control as rc
-import re10k_experiment as ex          # run_engine
-import align
-
-SCRATCH = os.environ.get("SCRATCH", "/tmp")
-THR = 0.05
-
-
-def crossrun(clip, frame_dir, m, strides):
-    _, frames = rc.parse_clip(clip)
-    print(f"=== cross-run consistency: {os.path.basename(clip)}  shared frame #{m} ===")
-    print(f"  {'stride':>6} {'GTrot(i-j)':>10} {'valid%':>6} {'scale':>6} "
-          f"{'<1%':>5} {'<2%':>5} {'<5%':>5} {'<10%':>6} {'rigid%':>7} {'sim%':>6} "
-          f"{'aff%':>6} {'verdict':>20}")
-    fp = lambda i: os.path.join(frame_dir, f"f{i:04d}.png")
-    for s in strides:
-        i, j = m - s, m + s
-        if not all(os.path.exists(fp(k)) for k in (i, m, j)):
-            print(f"  {s:>6}  (missing a frame among {i},{m},{j})"); continue
-        ptsA, opA = ex.run_engine(fp(i), fp(m), os.path.join(SCRATCH, f"xrA_{m}_{s}.f32"))
-        ptsB, opB = ex.run_engine(fp(m), fp(j), os.path.join(SCRATCH, f"xrB_{m}_{s}.f32"))
-        XA, oA = ptsA[1].reshape(-1, 3), opA[1].reshape(-1)    # m as view1 of A
-        XB, oB = ptsB[0].reshape(-1, 3), opB[0].reshape(-1)    # m as view0 of B
-        mask = (oA > THR) & (oB > THR)
-        A, B = XA[mask], XB[mask]
-        if mask.sum() < 200:
-            print(f"  {s:>6}  (too few valid: {mask.sum()})"); continue
-        scene = align._rms(A - A.mean(0))
-        sc, R, t, inl = align.fit_similarity_ransac(B, A, thresh=0.02 * scene, iters=400)
-        res = np.linalg.norm(align.apply_sim((sc, R, t), B) - A, axis=1)
-        within = {q: 100 * (res < q * scene).mean() for q in (0.01, 0.02, 0.05, 0.10)}
-        L = align.diagnose(B[inl], A[inl])
-        gtrot = rc.rot_deg(rc.rel_pose(frames[i]["c2w"], frames[j]["c2w"]))
-        print(f"  {s:>6} {gtrot:>10.2f} {100*mask.mean():>6.1f} {sc:>6.3f} "
-              f"{within[0.01]:>5.0f} {within[0.02]:>5.0f} {within[0.05]:>5.0f} "
-              f"{within[0.10]:>6.0f} {100*L['rigid']/L['scene']:>7.1f} "
-              f"{100*L['similarity']/L['scene']:>6.1f} {100*L['affine']/L['scene']:>6.1f} "
-              f"{L['verdict']:>20}")
-    print("\n<q% = fraction of shared-frame pixels whose two reconstructions agree "
-          "within q% of scene extent under the best similarity. High => accumulation\n"
-          "is clean; low => most per-pixel gaussians are partner-dependent floaters "
-          "(need consensus fusion).")
-
-
-if __name__ == "__main__":
-    clip, frame_dir = sys.argv[1], sys.argv[2]
-    m = int(sys.argv[3]) if len(sys.argv) > 3 else 120
-    strides = tuple(int(x) for x in sys.argv[4].split(",")) if len(sys.argv) > 4 \
-        else (20, 40, 80)
-    crossrun(clip, frame_dir, m, strides)
diff --git a/pose/re10k_experiment.py b/pose/re10k_experiment.py
deleted file mode 100644
index a2712d3..0000000
--- a/pose/re10k_experiment.py
+++ /dev/null
@@ -1,76 +0,0 @@
-"""Run FreeSplatter-scene on RealEstate10K pairs (IN-distribution) and check the
-recovered relative pose + focal against ground truth, vs frame stride (baseline).
-
-Unlike Tanks-and-Temples (out of distribution -> garbage), the model is known-good
-here, so this is a real control: low poseErr confirms our PnP against an INDEPENDENT
-GT (re10k poses), and agreement validates the re10k camera convention.
-
-Frames must already be fetched into frame_dir as fNNNN.png (re10k_fetch.py); both
-anchor and anchor+stride must be among the fetched indices.
-
-    FS_DEVICE=cpu nix develop -c python3 pose/re10k_experiment.py CLIP.txt FRAME_DIR 40 20,40,80,160
-"""
-import os
-import sys
-import subprocess
-import numpy as np
-
-import re10k_control as rc
-import pnp
-
-HERE = os.path.dirname(os.path.abspath(__file__))
-GGUF = os.environ.get("FS_GGUF", os.path.join(HERE, "..", ".cache", "freesplatter-scene-f16.gguf"))
-CLI = os.environ.get("FS_CLI", os.path.join(HERE, "..", "build", "release", "bin", "free_splatter-cli"))
-DEVICE = os.environ.get("FS_DEVICE", "cpu")
-SCRATCH = os.environ.get("SCRATCH", "/tmp")
-
-
-def run_engine(img0, img1, out):
-    r = subprocess.run([CLI, "--device", DEVICE, "--out", out, GGUF, img0, img1],
-                       capture_output=True, text=True)
-    if r.returncode != 0:
-        raise RuntimeError(f"CLI failed: {r.stderr[-500:]}")
-    a = np.fromfile(out, np.float32).reshape(2, 512, 512, 23)
-    pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)]
-    op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)]
-    return pts, op
-
-
-def cos(a, b):
-    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))
-
-
-def experiment(clip, frame_dir, anchor, strides):
-    url, frames = rc.parse_clip(clip)
-    f_gt = rc.gt_focal_512(frames[0])
-    print(f"=== {os.path.basename(clip)}  anchor #{anchor}  GT focal(512)={f_gt:.1f}  "
-          f"device={DEVICE} ===\n{url}")
-    print(f"  {'stride':>6} {'GTrot':>6} {'valid%':>6} {'poseErr':>8} "
-          f"{'transDir':>8} {'scaleRat':>8} {'focal':>7} {'fErr%':>6}")
-    for s in strides:
-        j = anchor + s
-        img0 = os.path.join(frame_dir, f"f{anchor:04d}.png")
-        img1 = os.path.join(frame_dir, f"f{j:04d}.png")
-        if not (os.path.exists(img0) and os.path.exists(img1)):
-            print(f"  {s:>6}  (missing frame {anchor} or {j})"); continue
-        out = os.path.join(SCRATCH, f"re_{anchor}_{s}.f32")
-        pts, op = run_engine(img0, img1, out)
-        res = pnp.estimate_poses(pts, op, normalize=False)
-        c2w, f_est = res["cam2world"], res["focal"]
-        rel = np.linalg.inv(c2w[0]) @ c2w[1]
-        G = rc.rel_pose(frames[anchor]["c2w"], frames[j]["c2w"])
-        poseErr = rc.rot_deg(rel[:3, :3] @ G[:3, :3].T)
-        tdir = cos(rel[:3, 3], G[:3, 3])
-        srat = np.linalg.norm(rel[:3, 3]) / (np.linalg.norm(G[:3, 3]) + 1e-12)
-        validp = min((op[0] > 0.05).mean(), (op[1] > 0.05).mean()) * 100
-        ferr = abs(f_est - f_gt) / f_gt * 100
-        print(f"  {s:>6} {rc.rot_deg(G):>6.2f} {validp:>6.1f} {poseErr:>8.2f} "
-              f"{tdir:>8.4f} {srat:>8.3f} {f_est:>7.1f} {ferr:>6.1f}")
-
-
-if __name__ == "__main__":
-    clip, frame_dir = sys.argv[1], sys.argv[2]
-    anchor = int(sys.argv[3]) if len(sys.argv) > 3 else 40
-    strides = tuple(int(x) for x in sys.argv[4].split(",")) if len(sys.argv) > 4 \
-        else (20, 40, 80, 160)
-    experiment(clip, frame_dir, anchor, strides)
diff --git a/pose/re10k_fetch.py b/pose/re10k_fetch.py
deleted file mode 100644
index 94b1770..0000000
--- a/pose/re10k_fetch.py
+++ /dev/null
@@ -1,84 +0,0 @@
-"""Fetch a RealEstate10K clip's frames via yt-dlp + ffmpeg -- the standard re10k
-workflow, since the dataset ships only poses + YouTube URLs (frames can't be
-redistributed). Gives us a dense, in-distribution, GT-posed sequence to drive the
-control. Frames are written as fNNNN.png (NNNN = pose-line index) so they pair 1:1
-with re10k_control.parse_clip()'s frames.
-
-    nix develop -c python3 pose/re10k_fetch.py CLIP.txt OUT_DIR [step]
-
-yt-dlp comes from `nix run nixpkgs#yt-dlp`; ffmpeg is on PATH. Dead/blocked videos
-raise -- use find_live_clip to skip them.
-"""
-import os
-import subprocess
-import re10k_control as rc
-
-
-def _ytdlp(*a):
-    return ["nix", "run", "nixpkgs#yt-dlp", "--", *a]
-
-
-def is_live(url, timeout=60):
-    r = subprocess.run(_ytdlp("--no-warnings", "--simulate", "--get-duration", url),
-                       capture_output=True, text=True, timeout=timeout)
-    return r.returncode == 0
-
-
-def download_video(url, out_dir, height=720, timeout=900):
-    os.makedirs(out_dir, exist_ok=True)
-    vid = os.path.join(out_dir, "video.mp4")
-    if os.path.exists(vid) and os.path.getsize(vid) > 0:
-        return vid
-    fmt = (f"bv*[height<={height}][ext=mp4]/bv*[height<={height}]/"
-           f"b[height<={height}]/best")
-    r = subprocess.run(_ytdlp("-f", fmt, "--no-warnings", "--no-playlist",
-                              "-o", vid, url),
-                       capture_output=True, text=True, timeout=timeout)
-    if not (os.path.exists(vid) and os.path.getsize(vid) > 0):
-        raise RuntimeError("yt-dlp failed:\n" + r.stderr[-600:])
-    return vid
-
-
-def extract_frames(vid, frames, indices, out_dir):
-    imgs = {}
-    for i in indices:
-        ts = frames[i]["ts"] / 1e6                     # absolute video time (s)
-        p = os.path.join(out_dir, f"f{i:04d}.png")
-        if not os.path.exists(p):
-            subprocess.run(["ffmpeg", "-nostdin", "-loglevel", "error",
-                            "-ss", f"{ts:.6f}", "-i", vid, "-frames:v", "1",
-                            "-q:v", "2", "-y", p], check=True, timeout=120)
-        imgs[i] = p
-    return imgs
-
-
-def fetch(clip_path, out_dir, indices=None, step=10, height=720):
-    url, frames = rc.parse_clip(clip_path)
-    if indices is None:
-        indices = list(range(0, len(frames), step))
-    vid = download_video(url, out_dir, height)
-    imgs = extract_frames(vid, frames, indices, out_dir)
-    return url, frames, imgs
-
-
-def find_live_clip(split="test", start=0, limit=300, min_frames=120):
-    """First clip in [start, start+limit) that has >= min_frames AND a live video."""
-    for cp in rc.list_clips(split)[start:start + limit]:
-        _, fr = rc.parse_clip(cp)
-        if len(fr) < min_frames:
-            continue
-        try:
-            if is_live(rc.parse_clip(cp)[0]):
-                return cp
-        except subprocess.TimeoutExpired:
-            continue
-    return None
-
-
-if __name__ == "__main__":
-    import sys
-    clip, out_dir = sys.argv[1], sys.argv[2]
-    step = int(sys.argv[3]) if len(sys.argv) > 3 else 10
-    url, frames, imgs = fetch(clip, out_dir, step=step)
-    print(f"got {len(imgs)} frames from {url}")
-    print("indices:", sorted(imgs)[:6], "...", sorted(imgs)[-3:])
diff --git a/pose/render_ply.py b/pose/render_ply.py
deleted file mode 100644
index 1cde6c7..0000000
--- a/pose/render_ply.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""Quick pinhole projection of a colored PLY to a PNG (z-buffered point splat) --
-a visual coherence check for the accumulated cloud, no 3D viewer needed.
-
-    nix develop -c python3 pose/render_ply.py world.ply out.png [focal] [radius]
-"""
-import sys
-import numpy as np
-import cv2
-
-
-def read_ply(path):
-    with open(path, "rb") as f:
-        n = 0
-        while True:
-            l = f.readline()
-            if l.startswith(b"element vertex"):
-                n = int(l.split()[-1])
-            if l.strip() == b"end_header":
-                break
-        rec = np.frombuffer(f.read(n * 15),
-                            dtype=[("x", "<f4"), ("y", "<f4"), ("z", "<f4"),
-                                   ("r", "u1"), ("g", "u1"), ("b", "u1")])
-    xyz = np.stack([rec["x"], rec["y"], rec["z"]], 1).astype(np.float64)
-    rgb = np.stack([rec["r"], rec["g"], rec["b"]], 1)
-    return xyz, rgb
-
-
-def render(xyz, rgb, focal, H=512, W=512, c2w=None, radius=1):
-    w2c = np.eye(4) if c2w is None else np.linalg.inv(c2w)
-    Xc = (w2c[:3, :3] @ xyz.T).T + w2c[:3, 3]
-    good = Xc[:, 2] > 1e-3
-    Xc, col = Xc[good], rgb[good]
-    u = focal * Xc[:, 0] / Xc[:, 2] + W / 2
-    v = focal * Xc[:, 1] / Xc[:, 2] + H / 2
-    ui, vi = np.round(u).astype(int), np.round(v).astype(int)
-    img = np.zeros((H, W, 3), np.uint8)
-    order = np.argsort(-Xc[:, 2])                       # far first; near overwrites
-    ui, vi, col = ui[order], vi[order], col[order]
-    for dy in range(-radius, radius + 1):
-        for dx in range(-radius, radius + 1):
-            uu, vv = ui + dx, vi + dy
-            inb = (uu >= 0) & (uu < W) & (vv >= 0) & (vv < H)
-            img[vv[inb], uu[inb]] = col[inb]
-    return img
-
-
-if __name__ == "__main__":
-    ply, out = sys.argv[1], sys.argv[2]
-    focal = float(sys.argv[3]) if len(sys.argv) > 3 else 274.0
-    radius = int(sys.argv[4]) if len(sys.argv) > 4 else 1
-    xyz, rgb = read_ply(ply)
-    img = render(xyz, rgb, focal, radius=radius)
-    cv2.imwrite(out, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
-    print(f"rendered {len(xyz):,} pts -> {out}  ({(img.any(2)).mean()*100:.0f}% filled)")
diff --git a/pose/test_pose.py b/pose/test_pose.py
deleted file mode 100644
index 3ad0c80..0000000
--- a/pose/test_pose.py
+++ /dev/null
@@ -1,258 +0,0 @@
-"""Asset-free golden tests for the pose / coordinate-normalization prototype.
-
-No model, no fixtures, no cv2: synthetic geometry with KNOWN ground truth, the
-way CLAUDE.md wants new ops verified ("pinned to hand-computed references... a
-wrong op should fail with zero fixtures"). Run:
-
-    nix develop -c python3 pose/test_pose.py     # from repo root
-    cd pose && nix develop -c python3 test_pose.py
-
-Each test prints PASS/FAIL with the numbers it checked; exit code is nonzero on
-any failure.
-"""
-import numpy as np
-
-import align
-import focal
-import pnp
-
-RNG = np.random.default_rng(12345)
-_fails = []
-
-
-def check(name, cond, detail: object = ""):
-    tag = "PASS" if cond else "FAIL"
-    print(f"  [{tag}] {name}" + (f"  ({detail})" if detail else ""))
-    if not cond:
-        _fails.append(name)
-
-
-def rand_rotation(rng):
-    Q, _ = np.linalg.qr(rng.normal(size=(3, 3)))
-    if np.linalg.det(Q) < 0:
-        Q[:, 0] = -Q[:, 0]
-    return Q
-
-
-def rot_angle(R):
-    return np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1)))
-
-
-# ===========================================================================
-# Coordinate-system normalization (the scale question)
-# ===========================================================================
-
-def test_similarity_roundtrip():
-    """Recover a KNOWN (s,R,t) exactly -- the fundamental correctness golden."""
-    print("test_similarity_roundtrip")
-    X = RNG.normal(size=(200, 3)) * np.array([3, 2, 5])
-    s_t, R_t, t_t = 1.7, rand_rotation(RNG), np.array([4.0, -1.0, 2.0])
-    Y = align.apply_sim((s_t, R_t, t_t), X)
-    s, R, t = align.fit_similarity(X, Y)
-    check("scale", abs(s - s_t) < 1e-9, f"{s:.12f} vs {s_t}")
-    check("rotation", rot_angle(R @ R_t.T) < 1e-7, f"{rot_angle(R @ R_t.T):.2e} deg")
-    check("translation", np.linalg.norm(t - t_t) < 1e-7)
-    check("residual~0", align._rms(align.apply_sim((s, R, t), X) - Y) < 1e-9)
-
-
-def test_scale_detection():
-    """B differs from A by PURE uniform scale: rigid fails, similarity nails it.
-    This is the detector for 'you need the scale DoF' -- the expected cross-run case."""
-    print("test_scale_detection")
-    X = RNG.normal(size=(300, 3)) * 2.0
-    s_t = 2.5
-    Y = align.apply_sim((s_t, rand_rotation(RNG), np.array([1.0, 2.0, -3.0])), X)
-    L = align.diagnose(X, Y)
-    check("recovered scale", abs(L["scale"] - s_t) < 1e-8, f"{L['scale']:.10f}")
-    check("rigid residual large", L["rigid"] / L["scene"] > 0.1, f"{L['rigid']/L['scene']:.3f}")
-    check("similarity residual ~0", L["similarity"] / L["scene"] < 1e-9)
-    check("verdict needs_scale", L["verdict"] == "needs_scale", L["verdict"])
-
-
-def test_nonlinear_detection():
-    """Affine and genuinely-nonlinear warps are distinguished from pure scale."""
-    print("test_nonlinear_detection")
-    X = RNG.normal(size=(400, 3)) * 2.0 + np.array([0, 0, 6.0])
-
-    # (a) anisotropic (affine) warp: stretch axes unequally -> not a similarity,
-    #     but affine fits exactly.
-    A = np.diag([1.0, 1.6, 0.7]) @ rand_rotation(RNG)
-    Yaff = (A @ X.T).T + np.array([1.0, 0.0, 2.0])
-    L = align.diagnose(X, Yaff)
-    check("affine: similarity fails", L["similarity"] / L["scene"] > 1e-3,
-          f"{L['similarity']/L['scene']:.3f}")
-    check("affine: affine fits", L["affine"] / L["scene"] < 1e-9)
-    check("affine: verdict needs_affine", L["verdict"] == "needs_affine", L["verdict"])
-
-    # (b) nonlinear warp: depth-quadratic stretch (focal-error fingerprint) ->
-    #     even affine cannot remove it, and the residual correlates with depth.
-    Z = X[:, 2]
-    Ynl = X.copy()
-    Ynl[:, 0] *= (1.0 + 0.15 * (Z - Z.mean()) ** 2 / Z.var())
-    L2 = align.diagnose(X, Ynl)
-    check("nonlinear: affine fails", L2["affine"] / L2["scene"] > 1e-3,
-          f"{L2['affine']/L2['scene']:.3f}")
-    check("nonlinear: verdict nonlinear", L2["verdict"] == "nonlinear", L2["verdict"])
-    check("nonlinear: depth-correlated residual", L2["structured"],
-          f"corr={L2['depth_corr']:.2f}")
-
-
-def test_outlier_robustness():
-    """RANSAC similarity survives 30% gross floaters in the overlap."""
-    print("test_outlier_robustness")
-    n = 400
-    X = RNG.normal(size=(n, 3)) * 2.0
-    s_t, R_t, t_t = 1.3, rand_rotation(RNG), np.array([0.5, -2.0, 1.0])
-    Y = align.apply_sim((s_t, R_t, t_t), X)
-    n_out = int(0.3 * n)
-    out_idx = RNG.choice(n, n_out, replace=False)
-    Y[out_idx] += RNG.normal(size=(n_out, 3)) * 10.0           # floaters
-    s, R, t, inl = align.fit_similarity_ransac(X, Y, thresh=0.05)
-    check("scale recovered", abs(s - s_t) < 1e-3, f"{s:.6f} vs {s_t}")
-    check("rotation recovered", rot_angle(R @ R_t.T) < 0.1)
-    check("translation recovered", np.linalg.norm(t - t_t) < 1e-2)
-    check("outliers excluded", (~inl[out_idx]).mean() > 0.9,
-          f"{(~inl[out_idx]).mean()*100:.0f}% of outliers rejected")
-
-
-def test_loop_correction():
-    """Sim(3) even-distribution loop closure recovers a loop from uniform drift."""
-    print("test_loop_correction")
-    R = rand_rotation(RNG)
-    M = align.sim_matrix(1.07, R, np.array([0.3, -0.2, 0.1]))
-    check("M^0 == I", np.allclose(align.sim_frac_power(M, 0.0), np.eye(4), atol=1e-9))
-    check("M^1 == M", np.allclose(align.sim_frac_power(M, 1.0), M, atol=1e-9))
-    check("(M^.5)^2 == M",
-          np.allclose(align.sim_frac_power(M, .5) @ align.sim_frac_power(M, .5), M, atol=1e-9))
-    # distributing a known accumulated drift D^(k/n) cancels it to ~0
-    n = 12
-    D = align.sim_matrix(1.15, R, np.array([0.4, -0.2, 0.1]))
-    clean = np.array([[np.cos(t), np.sin(t), 0.1 * t]
-                      for t in np.linspace(0, 2 * np.pi, n + 1)])
-    drift = np.array([(align.sim_frac_power(np.linalg.inv(D), k / n) @
-                       np.append(clean[k], 1))[:3] for k in range(n + 1)])
-    corr = np.array([(align.sim_frac_power(D, k / n) @
-                      np.append(drift[k], 1))[:3] for k in range(n + 1)])
-    check("drift present before", align._rms(drift - clean) > 0.1)
-    check("recovers clean loop", align._rms(corr - clean) < 1e-9,
-          f"ATE {align._rms(corr - clean):.1e}")
-
-
-def test_loop_closure():
-    """A closed loop of consistent links ~= identity; injected scale drift shows."""
-    print("test_loop_closure")
-    links = []
-    for _ in range(4):
-        links.append((RNG.uniform(0.8, 1.2), rand_rotation(RNG), RNG.normal(size=3)))
-    # close the loop exactly: last link = inverse of the composed first three
-    T = align.identity()
-    for Ti in links[:3]:
-        T = align.compose(Ti, T)
-    links_closed = links[:3] + [align.invert(T)]
-    e = align.loop_closure_error(links_closed)
-    check("closed loop ~ identity", e["scale_err"] < 1e-9 and e["rot_deg"] < 1e-6
-          and e["trans"] < 1e-9, f"scale_err={e['scale_err']:.1e}")
-    # now inject a 12% scale error into one link
-    bad = links_closed[:]
-    bad[0] = (bad[0][0] * 1.12, bad[0][1], bad[0][2])
-    e2 = align.loop_closure_error(bad)
-    check("scale drift detected", abs(e2["scale_err"] - np.log(1.12)) < 1e-9,
-          f"scale_err={e2['scale_err']:.4f} (expected {np.log(1.12):.4f})")
-
-
-# ===========================================================================
-# PnP (verified against analytic ground truth, numpy backend)
-# ===========================================================================
-
-def make_intrinsics(f=400.0, H=512, W=512):
-    pp = np.array([W / 2.0, H / 2.0])
-    K = np.array([[f, 0, pp[0]], [0, f, pp[1]], [0, 0, 1.0]])
-    return K, pp, f, H, W
-
-
-def project(Pw, R, t, K):
-    Xc = (R @ Pw.T).T + t
-    proj = (K @ Xc.T).T
-    return proj[:, :2] / proj[:, 2:3], Xc[:, 2]
-
-
-def test_focal_recovery():
-    """Weiszfeld recovers a known focal from exact (point, pixel) pairs."""
-    print("test_focal_recovery")
-    K, pp, f, H, W = make_intrinsics(f=437.0)
-    Xc = RNG.uniform([-2, -2, 4], [2, 2, 9], size=(500, 3))   # camera-frame, Z>0
-    px = f * Xc[:, :2] / Xc[:, 2:3] + pp
-    f_est = focal.estimate_focal(Xc, px, pp)
-    check("exact focal", abs(f_est - f) < 1e-6, f"{f_est:.9f} vs {f}")
-    px_noisy = px + RNG.normal(size=px.shape) * 0.5
-    f_n = focal.estimate_focal(Xc, px_noisy, pp)
-    check("focal under noise", abs(f_n - f) / f < 0.02, f"{f_n:.2f} vs {f}")
-
-
-def test_pnp_recovery():
-    """numpy DLT+RANSAC PnP recovers known per-view extrinsics (cheirality ok)."""
-    print("test_pnp_recovery")
-    K, pp, f, H, W = make_intrinsics()
-    Pw = RNG.uniform([-2, -2, 4], [2, 2, 8], size=(500, 3))   # world == view-0 cam frame
-    worst_R, worst_t = 0.0, 0.0
-    for _ in range(8):
-        # small motion so all points stay in front of the moved camera
-        ax = RNG.normal(size=3)
-        ax /= np.linalg.norm(ax)
-        ang = np.radians(RNG.uniform(3, 20))
-        K_ = np.array([[0, -ax[2], ax[1]], [ax[2], 0, -ax[0]], [-ax[1], ax[0], 0]])
-        R = np.eye(3) + np.sin(ang) * K_ + (1 - np.cos(ang)) * K_ @ K_
-        t = RNG.uniform(-0.6, 0.6, size=3)
-        px, z = project(Pw, R, t, K)
-        if np.any(z <= 0):
-            continue
-        c2w, inl = pnp.solve_pnp_numpy(Pw, px, K)
-        w2c = np.linalg.inv(c2w)
-        worst_R = max(worst_R, rot_angle(w2c[:3, :3] @ R.T))
-        worst_t = max(worst_t, np.linalg.norm(w2c[:3, 3] - t))
-    check("rotation recovered", worst_R < 1e-4, f"worst {worst_R:.2e} deg")
-    check("translation recovered", worst_t < 1e-5, f"worst {worst_t:.2e}")
-
-
-def test_pnp_outliers():
-    """PnP RANSAC tolerates corrupted correspondences (floaters)."""
-    print("test_pnp_outliers")
-    K, pp, f, H, W = make_intrinsics()
-    Pw = RNG.uniform([-2, -2, 4], [2, 2, 8], size=(600, 3))
-    R = rand_rotation(RNG)
-    # keep rotation modest so points stay in front
-    R = np.eye(3) + 0.15 * (R - R.T)
-    R, _ = np.linalg.qr(R)
-    t = np.array([0.3, -0.2, 0.4])
-    px, z = project(Pw, R, t, K)
-    keep = z > 0
-    Pw, px = Pw[keep], px[keep]
-    n_out = int(0.25 * len(px))
-    oi = RNG.choice(len(px), n_out, replace=False)
-    px[oi] += RNG.uniform(-150, 150, size=(n_out, 2))          # corrupt pixels
-    c2w, inl = pnp.solve_pnp_numpy(Pw, px, K, thresh_px=2.0, iters=300)
-    w2c = np.linalg.inv(c2w)
-    check("rotation under outliers", rot_angle(w2c[:3, :3] @ R.T) < 0.2,
-          f"{rot_angle(w2c[:3, :3] @ R.T):.3f} deg")
-    check("translation under outliers", np.linalg.norm(w2c[:3, 3] - t) < 0.02)
-    check("outliers rejected", (~inl[oi]).mean() > 0.85,
-          f"{(~inl[oi]).mean()*100:.0f}% rejected")
-
-
-def main():
-    tests = [
-        test_similarity_roundtrip, test_scale_detection, test_nonlinear_detection,
-        test_outlier_robustness, test_loop_correction, test_loop_closure,
-        test_focal_recovery, test_pnp_recovery, test_pnp_outliers,
-    ]
-    for t in tests:
-        t()
-    print()
-    if _fails:
-        print(f"FAILED ({len(_fails)}): " + ", ".join(_fails))
-        raise SystemExit(1)
-    print("ALL POSE TESTS OK")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/pose/tt_control.py b/pose/tt_control.py
deleted file mode 100644
index fdd402f..0000000
--- a/pose/tt_control.py
+++ /dev/null
@@ -1,124 +0,0 @@
-"""Tanks-and-Temples (NSVF release) loader -- the dense, GT-posed control.
-
-A known-good, ground-truth-posed sequence so that when the live FreeSplatter path
-misbehaves we can tell whether it's the data/model or our code. NSVF layout per
-scene (.cache/tt/TanksAndTemple/<Scene>/):
-    rgb/{split}_{idx}_{origframe}.png   1920x1080
-    pose/{split}_{idx}_{origframe}.txt  4x4 camera-to-world, OpenGL convention
-    intrinsics.txt                       4x4 (fx 0 cx 0 / 0 fy cy 0 / ...)
-split 0 = train, 1 = test (held-out orbit views). Frames sorted by origframe give
-true capture order.
-
-What this gives the control:
-  * GT relative pose between any two frames  -> check our PnP relative pose.
-  * GT focal after 512x512 preprocessing     -> check FreeSplatter's recovered focal.
-  * GT baseline / parallax per frame-stride   -> drive the baseline sweep.
-
-OpenGL->OpenCV: negate the camera y,z axes (columns 1,2 of the c2w rotation),
-matching the runner's `c2ws[:, :, 1:3] *= -1`. cam2world, OpenCV (x right, y down,
-z forward) -- the same convention our pnp.estimate_poses returns.
-
-Usage:
-    nix develop -c python3 pose/tt_control.py [Scene]      # default Truck
-"""
-import os
-import re
-import sys
-import numpy as np
-
-TT_ROOT = os.environ.get(
-    "TT_ROOT",
-    os.path.join(os.path.dirname(__file__), "..", ".cache", "tt", "TanksAndTemple"))
-
-_GL2CV = np.diag([1.0, -1.0, -1.0, 1.0])      # negate camera y,z axes (right-mul)
-
-
-def gl_to_cv(c2w):
-    """OpenGL camera-to-world -> OpenCV camera-to-world."""
-    return c2w @ _GL2CV
-
-
-def load_intrinsics(scene_dir):
-    M = np.loadtxt(os.path.join(scene_dir, "intrinsics.txt"))
-    return {"fx": float(M[0, 0]), "fy": float(M[1, 1]),
-            "cx": float(M[0, 2]), "cy": float(M[1, 2])}
-
-
-def _origframe(name):
-    # {split}_{idx}_{origframe}.ext -> int(origframe)
-    return int(re.split(r"[_.]", name)[2])
-
-
-def load_scene(scene, split=None):
-    """Return (frames, intr). frames: list of dicts {name, idx, orig, img, c2w(OpenCV)}
-    sorted by capture order. split=0/1 to filter train/test, None for all."""
-    scene_dir = scene if os.path.isdir(scene) else os.path.join(TT_ROOT, scene)
-    intr = load_intrinsics(scene_dir)
-    pose_dir, rgb_dir = os.path.join(scene_dir, "pose"), os.path.join(scene_dir, "rgb")
-    frames = []
-    for pf in os.listdir(pose_dir):
-        if not pf.endswith(".txt"):
-            continue
-        sp = int(pf.split("_")[0])
-        if split is not None and sp != split:
-            continue
-        c2w_gl = np.loadtxt(os.path.join(pose_dir, pf))
-        img = os.path.join(rgb_dir, pf[:-4] + ".png")
-        frames.append({"name": pf[:-4], "split": sp, "orig": _origframe(pf),
-                       "img": img, "c2w": gl_to_cv(c2w_gl)})
-    frames.sort(key=lambda f: f["orig"])
-    return frames, intr
-
-
-def rel_pose(c2w_a, c2w_b):
-    """Camera b expressed in camera a's frame: inv(c2w_a) @ c2w_b (OpenCV)."""
-    return np.linalg.inv(c2w_a) @ c2w_b
-
-
-def rot_deg(R):
-    return float(np.degrees(np.arccos(np.clip((np.trace(R[:3, :3]) - 1) / 2, -1, 1))))
-
-
-def scene_radius(frames):
-    """Mean camera-center distance to the centroid of all camera centers."""
-    cen = np.array([f["c2w"][:3, 3] for f in frames])
-    return float(np.linalg.norm(cen - cen.mean(0), axis=1).mean()), cen.mean(0)
-
-
-def baseline_report(scene="Truck"):
-    frames, intr = load_scene(scene)
-    n = len(frames)
-    radius, _ = scene_radius(frames)
-    f512 = intr["fx"] * 512.0 / 1080.0            # after center-crop 1080 + resize 512
-    print(f"=== {scene}: {n} frames "
-          f"(train {sum(f['split']==0 for f in frames)}, "
-          f"test {sum(f['split']==1 for f in frames)}) ===")
-    print(f"intrinsics 1920x1080: fx={intr['fx']:.1f} fy={intr['fy']:.1f} "
-          f"cx={intr['cx']:.1f} cy={intr['cy']:.1f}")
-    print(f"GT focal after 512x512 preprocess (center-crop 1080 -> resize): "
-          f"{f512:.1f} px  (pp=256)")
-    print(f"orbit radius (mean cam dist to center): {radius:.4f}")
-    print("\nframe-stride sweep -- viewpoint change between frame i and i+stride:")
-    print(f"  {'stride':>6} {'pairs':>6} {'rot(deg) med':>13} {'p10..p90':>16} "
-          f"{'baseline/r med':>15} {'parallax(deg)':>14}")
-    for s in (1, 2, 3, 5, 10, 20, 40):
-        rots, bl = [], []
-        for i in range(0, n - s):
-            T = rel_pose(frames[i]["c2w"], frames[i + s]["c2w"])
-            rots.append(rot_deg(T))
-            bl.append(np.linalg.norm(T[:3, 3]))
-        if not rots:
-            continue
-        rots, bl = np.array(rots), np.array(bl)
-        blr = bl / radius
-        # small-angle parallax ~ baseline / radius (rad) -> deg
-        par = np.degrees(np.median(blr))
-        print(f"  {s:>6} {len(rots):>6} {np.median(rots):>13.2f} "
-              f"{np.percentile(rots,10):>7.2f}..{np.percentile(rots,90):<7.2f} "
-              f"{np.median(blr):>15.3f} {par:>14.2f}")
-    print("\nUse this to pick pair strides for the sweet-spot sweep: too small a "
-          "stride -> ~0 parallax (degenerate depth); too large -> low overlap.")
-
-
-if __name__ == "__main__":
-    baseline_report(sys.argv[1] if len(sys.argv) > 1 else "Truck")
diff --git a/pose/tt_experiment.py b/pose/tt_experiment.py
deleted file mode 100644
index 8f9449a..0000000
--- a/pose/tt_experiment.py
+++ /dev/null
@@ -1,79 +0,0 @@
-"""Run FreeSplatter-scene on Tanks-and-Temples pairs; check recovered pose + focal
-against GROUND TRUTH as a function of frame stride (baseline). The control: GT
-poses are known-good, so error here is OUR pipeline or the model being out of
-distribution -- not unknown data.
-
-Metrics per pair (view0 -> view1):
-  GTrot    : GT relative rotation (deg) = the viewpoint change we asked for.
-  valid%   : min over views of opacity>0.05 fraction (PnP support).
-  poseErr  : angle between our recovered relative rotation and GT (deg) -- the
-             scale-free "is the pose right" check.
-  transDir : cos angle between our relative-translation direction and GT (1=perfect;
-             FreeSplatter's metric scale is arbitrary, so only direction is checked).
-  scaleRat : ||our t|| / ||GT t||  -- the monocular scale FreeSplatter chose vs metric.
-  focalErr%: |recovered focal - GT focal(512)| / GT.
-
-    FS_DEVICE=cpu nix develop -c python3 pose/tt_experiment.py Truck 120 2,5,10,20
-"""
-import os
-import sys
-import subprocess
-import numpy as np
-
-import tt_control as tt
-import pnp
-
-HERE = os.path.dirname(os.path.abspath(__file__))
-GGUF = os.environ.get("FS_GGUF", os.path.join(HERE, "..", ".cache", "freesplatter-scene-f16.gguf"))
-CLI = os.environ.get("FS_CLI", os.path.join(HERE, "..", "build", "release", "bin", "free_splatter-cli"))
-DEVICE = os.environ.get("FS_DEVICE", "cpu")
-SCRATCH = os.environ.get("SCRATCH", "/tmp")
-
-
-def run_engine(img0, img1, out):
-    r = subprocess.run([CLI, "--device", DEVICE, "--out", out, GGUF, img0, img1],
-                       capture_output=True, text=True)
-    if r.returncode != 0:
-        raise RuntimeError(f"CLI failed: {r.stderr[-500:]}")
-    a = np.fromfile(out, np.float32).reshape(2, 512, 512, 23)
-    pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)]
-    op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)]
-    return pts, op
-
-
-def cos(a, b):
-    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))
-
-
-def experiment(scene="Truck", anchor=120, strides=(2, 5, 10, 20)):
-    frames, intr = tt.load_scene(scene)
-    f_gt = intr["fx"] * 512.0 / 1080.0
-    print(f"=== {scene}  anchor #{anchor} ({frames[anchor]['name']})  "
-          f"GT focal(512)={f_gt:.1f}  device={DEVICE} ===")
-    print(f"  {'stride':>6} {'GTrot':>6} {'valid%':>6} {'poseErr':>8} "
-          f"{'transDir':>8} {'scaleRat':>8} {'focal':>7} {'fErr%':>6}")
-    for s in strides:
-        j = anchor + s
-        if j >= len(frames):
-            continue
-        out = os.path.join(SCRATCH, f"tt_{scene}_{anchor}_{s}.f32")
-        pts, op = run_engine(frames[anchor]["img"], frames[j]["img"], out)
-        res = pnp.estimate_poses(pts, op, normalize=False)     # scene recipe
-        c2w, f_est = res["cam2world"], res["focal"]
-        rel = np.linalg.inv(c2w[0]) @ c2w[1]
-        G = tt.rel_pose(frames[anchor]["c2w"], frames[j]["c2w"])
-        poseErr = tt.rot_deg(rel[:3, :3] @ G[:3, :3].T)
-        tdir = cos(rel[:3, 3], G[:3, 3])
-        srat = np.linalg.norm(rel[:3, 3]) / (np.linalg.norm(G[:3, 3]) + 1e-12)
-        validp = min((op[0] > 0.05).mean(), (op[1] > 0.05).mean()) * 100
-        ferr = abs(f_est - f_gt) / f_gt * 100
-        print(f"  {s:>6} {tt.rot_deg(G):>6.1f} {validp:>6.1f} {poseErr:>8.2f} "
-              f"{tdir:>8.4f} {srat:>8.3f} {f_est:>7.1f} {ferr:>6.1f}")
-
-
-if __name__ == "__main__":
-    scene = sys.argv[1] if len(sys.argv) > 1 else "Truck"
-    anchor = int(sys.argv[2]) if len(sys.argv) > 2 else 120
-    strides = tuple(int(x) for x in sys.argv[3].split(",")) if len(sys.argv) > 3 \
-        else (2, 5, 10, 20)
-    experiment(scene, anchor, strides)
diff --git a/src/linalg.h b/src/linalg.h
index 5705621..a510471 100644
--- a/src/linalg.h
+++ b/src/linalg.h
@@ -6,8 +6,9 @@
 // C++ with no extra runtime deps beyond ggml) — every routine here reduces to one
 // primitive: a symmetric cyclic-Jacobi eigensolver. SVD of a 3x3 is built as the
 // eigendecomposition of MᵀM; the DLT nullspace is the smallest eigenvector of
-// AᵀA. This mirrors the dependency-free numpy reference solver in pose/pnp.py,
-// which is itself verified bit-for-bit (~1e-7) against cv2 in check_cv2_parity.py.
+// AᵀA. This mirrored the dependency-free numpy reference solver of the (now
+// removed; see git history) pose/ prototype, itself verified bit-for-bit (~1e-7)
+// against cv2.
 //
 // Everything is f64, row-major, and header-only. Sizes are tiny (<=12) so the
 // O(n^3) Jacobi sweeps are negligible next to the engine forward pass.
diff --git a/src/pose.cpp b/src/pose.cpp
index 01ea13e..78cc1b3 100644
--- a/src/pose.cpp
+++ b/src/pose.cpp
@@ -1,9 +1,13 @@
-// pose.cpp — C++ port of the pose/ prototype (focal.py + align.py + pnp.py).
+// pose.cpp — C++ port of the (now removed) pose/ research prototype: focal +
+// Umeyama/Sim(3) align + robust PnP + sliding-window accumulation / loop closure
+// / consensus fusion. See git history for the Python prototype it was validated
+// against, layer by layer.
 //
-// Faithful to FreeSplatter's scene recipe; the dependency-free numpy reference
-// solver is what ships here (verified ~1e-7 vs cv2, bit-exact vs upstream
-// estimate_poses on real engine output). All linear algebra goes through the
-// self-contained Jacobi eigensolver in linalg.h — no Eigen, no OpenCV.
+// Faithful to FreeSplatter's scene recipe; the dependency-free solver is what
+// ships here (the prototype's numpy reference was verified ~1e-7 vs cv2 and
+// bit-exact vs upstream estimate_poses on real engine output). All linear algebra
+// goes through the self-contained Jacobi eigensolver in linalg.h — no Eigen, no
+// OpenCV.
 #include "pose.h"
 
 #include <algorithm>
diff --git a/src/pose.h b/src/pose.h
index dbe1a23..7641d1a 100644
--- a/src/pose.h
+++ b/src/pose.h
@@ -1,14 +1,14 @@
 // pose.h — downstream camera-pose recovery + cross-run alignment (C++ port).
 //
-// This is the C++ port of the proven `pose/` research prototype (focal.py +
-// align.py + pnp.py): given the engine's per-pixel gaussians it recovers each
+// This is the C++ port of the (now removed; see git history) proven `pose/`
+// research prototype: given the engine's per-pixel gaussians it recovers each
 // view's camera (PnP), and aligns successive runs into one accumulating world
 // (Sim(3)). It is dependency-free — only the self-contained linalg.h — so it
 // ships from the CLI / C API with no Python and no Eigen/OpenCV (see CLAUDE.md).
 //
 // Faithful to FreeSplatter's scene recipe (estimate_poses -> DUSt3R fast_pnp):
-// the numpy reference solver, verified ~1e-7 against cv2 and bit-exact to upstream
-// estimate_poses on real engine output (pose/check_upstream_parity.py).
+// the prototype's numpy reference solver was verified ~1e-7 against cv2 and
+// bit-exact to upstream estimate_poses on real engine output.
 //
 // Conventions: f64 throughout; a similarity acts as x -> s*(R@x)+t; gaussian
 // channel layout (scene, 23ch) is xyz[0:3] SH[3:15] opacity[15] scale[16:19]
@@ -71,7 +71,7 @@ LoopError loop_closure_error(const std::vector<Sim3> & links);
 Mat4 sim_frac_power(const Mat4 & M, double f);
 
 // ---- PnP -------------------------------------------------------------------
-// RANSAC DLT PnP with known intrinsics K (mirrors pose/pnp.py numpy backend).
+// RANSAC DLT PnP with known intrinsics K (the prototype's numpy backend).
 // Xw is N*3 world points, pixels is N*2 (col,row). Returns cam2world (4x4) and
 // fills `inliers` (size N, 0/1). Kept as the asset-free golden-test reference; on
 // real scenes the DLT inherits the planar/mirror degeneracy — use solve_pnp.

From 831a13db1157ccfde0ff32589363adf8c2994c40 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 16:10:48 +0100
Subject: [PATCH 15/33] fuzz: pose C-API + image-decode harnesses; fix a SIGFPE
 the fuzzer found

Fuzz the untrusted user-facing surfaces opened by the pose work (GGUF stays
trusted, not fuzzed):

- fuzz_pose: the public pose C-API (free_splatter_estimate_poses + the
  accumulator add_pair/cloud/fuse/camera_path) on arbitrary float gaussian
  buffers (NaN/Inf/denormals) and fuzz-chosen geometry.
- fuzz_decode: the image-FILE path (arbitrary bytes -> stb_image -> crop/resize
  -> CHW), the surface a user photo crosses in the CLI/demo. stb is third-party;
  per CLAUDE.md we fuzz the boundary and would guard rather than patch a
  stb-internal trip -- none seen (31k+ runs clean).

Fixes found by fuzz_pose (our code, so fixed not guarded):
- SIGFPE: fit_similarity_ransac sampled `% N` with N=0 (an image pair with no
  overlapping valid pixels). Guard N<3 (RANSAC's minimal sample): all-inlier,
  plain fit when N>=1, else identity.
- Latent float-cast UB: consensus_fuse now skips non-finite cloud points and
  clamps the voxel-coordinate cast, so an Inf point can't make (int32)floor(NaN).

All four fuzzers clean; asset-free ctest tier (incl. test_pose fusion goldens)
still green under ASan/UBSan.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt       |  9 ++++++-
 fuzz/README.md       | 21 ++++++++++++---
 fuzz/fuzz_decode.cpp | 39 ++++++++++++++++++++++++++++
 fuzz/fuzz_pose.cpp   | 62 ++++++++++++++++++++++++++++++++++++++++++++
 src/pose.cpp         | 35 +++++++++++++++++++++----
 5 files changed, 156 insertions(+), 10 deletions(-)
 create mode 100644 fuzz/fuzz_decode.cpp
 create mode 100644 fuzz/fuzz_pose.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ae59312..2e757af 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -145,12 +145,19 @@ if (FREE_SPLATTER_BUILD_TOOLS AND NOT EMSCRIPTEN)
 endif()
 
 if (FREE_SPLATTER_FUZZ)
-    foreach(target fuzz_image fuzz_options)
+    foreach(target fuzz_image fuzz_options fuzz_pose)
         add_executable(${target} fuzz/${target}.cpp)
         target_link_libraries(${target} PRIVATE free_splatter)
         target_include_directories(${target} PRIVATE src)
         target_link_options(${target} PRIVATE -fsanitize=fuzzer)  # libFuzzer main
     endforeach()
+    # fuzz_decode also drives the vendored (third-party) stb decoder, so it needs
+    # the stb implementation TU + headers (built warning-quiet like the CLI).
+    add_executable(fuzz_decode fuzz/fuzz_decode.cpp tools/stb_impl.cpp)
+    target_link_libraries(fuzz_decode PRIVATE free_splatter)
+    target_include_directories(fuzz_decode PRIVATE src ${CMAKE_SOURCE_DIR}/third_party/stb)
+    target_link_options(fuzz_decode PRIVATE -fsanitize=fuzzer)
+    set_source_files_properties(tools/stb_impl.cpp PROPERTIES COMPILE_OPTIONS "-w")
 endif()
 
 # --- WASM (Emscripten) -------------------------------------------------------
diff --git a/fuzz/README.md b/fuzz/README.md
index 8789e0e..e36f47d 100644
--- a/fuzz/README.md
+++ b/fuzz/README.md
@@ -6,8 +6,10 @@ libFuzzer harnesses for the **untrusted** input surfaces, built with
 ```sh
 nix develop .#fuzz
 cmake --preset fuzz && cmake --build --preset fuzz
-mkdir -p corpus_img && ./build/fuzz/fuzz_image -max_total_time=60 corpus_img
-mkdir -p corpus_opt && ./build/fuzz/fuzz_options -max_total_time=60 corpus_opt
+mkdir -p corpus_img    && ./build/fuzz/fuzz_image   -max_total_time=60 corpus_img
+mkdir -p corpus_opt    && ./build/fuzz/fuzz_options -max_total_time=60 corpus_opt
+mkdir -p corpus_pose   && ./build/fuzz/fuzz_pose    -max_total_time=60 corpus_pose
+mkdir -p corpus_decode && ./build/fuzz/fuzz_decode  -max_total_time=60 corpus_decode
 ```
 
 ## Targets
@@ -18,9 +20,20 @@ mkdir -p corpus_opt && ./build/fuzz/fuzz_options -max_total_time=60 corpus_opt
   crashes, reads OOB, or returns a populated buffer on a rejected input.
 - **`fuzz_options`** — the options-builder / device-string setters with
   arbitrary byte strings; asserts no crash and NULL-safe frees.
+- **`fuzz_pose`** — the public pose C-API: `free_splatter_estimate_poses` and the
+  accumulator (`add_pair` → `cloud` / `fuse` / `camera_path`) driven with
+  arbitrary float gaussian buffers (NaN/Inf/denormals) and fuzz-chosen geometry.
+  Caught a real SIGFPE (the RANSAC sampler's `% N` with N=0 valid
+  correspondences) and motivated the non-finite guards in `consensus_fuse`
+  (the float→int voxel-coordinate cast).
+- **`fuzz_decode`** — the untrusted image-FILE path: arbitrary bytes →
+  `stb_image` → center-crop + resize → CHW (the surface a user photo crosses in
+  the CLI / demo). stb is vendored THIRD-PARTY; per CLAUDE.md we fuzz the boundary
+  and would guard (not patch) any stb-internal sanitizer trip — none seen so far.
 
 ## Trust boundary (intentional)
 
 The **GGUF model file is TRUSTED** and is deliberately **not** fuzzed: it is our
-own converter's output, loaded by us. Image inputs and the public C-API string
-arguments are **untrusted** and fuzzed. Keep this asymmetry — see `CLAUDE.md`.
+own converter's output, loaded by us. Image inputs (encoded files and decoded
+pixels) and the public C-API arguments are **untrusted** and fuzzed. Keep this
+asymmetry — see `CLAUDE.md`.
diff --git a/fuzz/fuzz_decode.cpp b/fuzz/fuzz_decode.cpp
new file mode 100644
index 0000000..dcb0a5c
--- /dev/null
+++ b/fuzz/fuzz_decode.cpp
@@ -0,0 +1,39 @@
+// libFuzzer harness for the UNTRUSTED image-FILE decode path: arbitrary bytes ->
+// stb_image -> the CLI's center-crop + resize -> [0,1] CHW. This is the actual
+// surface a user photo crosses in `free_splatter-cli` / the demo. stb_image is a
+// vendored THIRD-PARTY decoder; per CLAUDE.md we fuzz the boundary and, where stb
+// itself trips a sanitizer on malformed input, GUARD it rather than patch stb.
+#include "stb_image.h"
+#include "stb_image_resize2.h"
+
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) {
+    int w = 0, h = 0, n = 0;
+    // Decode arbitrary bytes as an image, forcing RGB (the CLI's contract).
+    unsigned char * px = stbi_load_from_memory(data, (int) size, &w, &h, &n, 3);
+    if (!px) return 0;                      // rejected -> nothing to do
+
+    // Guard against a decoded size that would overflow the crop/resize math (stb
+    // can report large dims on crafted headers); the CLI works on sane photos.
+    if ((int64_t) w * h > 64ll * 1024 * 1024 || w <= 0 || h <= 0) { stbi_image_free(px); return 0; }
+
+    // center-crop to a square, then resize to a small size (mirror the CLI).
+    const int s = w < h ? w : h, left = (w - s) / 2, top = (h - s) / 2;
+    std::vector<unsigned char> sq((size_t) s * s * 3);
+    for (int y = 0; y < s; y++)
+        std::memcpy(&sq[(size_t) y * s * 3], &px[((size_t)(top + y) * w + left) * 3], (size_t) s * 3);
+    stbi_image_free(px);
+
+    const int outsz = 32;
+    std::vector<unsigned char> rz((size_t) outsz * outsz * 3);
+    stbir_resize_uint8_linear(sq.data(), s, s, 0, rz.data(), outsz, outsz, 0, STBIR_RGB);
+
+    std::vector<float> chw((size_t) 3 * outsz * outsz);
+    for (int c = 0; c < 3; c++)
+        for (int i = 0; i < outsz * outsz; i++)
+            chw[(size_t) c * outsz * outsz + i] = rz[(size_t) i * 3 + c] / 255.0f;
+    return 0;
+}
diff --git a/fuzz/fuzz_pose.cpp b/fuzz/fuzz_pose.cpp
new file mode 100644
index 0000000..4d737c5
--- /dev/null
+++ b/fuzz/fuzz_pose.cpp
@@ -0,0 +1,62 @@
+// libFuzzer harness for the public pose C-API surface (free_splatter_estimate_poses
+// + the accumulator). These now take caller-supplied float buffers across the ABI,
+// so a binding that feeds NaN/Inf/degenerate geometry must not crash, read OOB, or
+// trip UBSan (e.g. a non-finite float -> int voxel-coord cast). The gaussian buffer
+// is engine output, but the C-API is a public boundary; keep it robust.
+#include "free_splatter.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) {
+    if (size < 8) return 0;
+
+    // small, fuzz-driven geometry so many inputs are explored cheaply
+    const int gc = 23;
+    const int H = 4 + (data[0] % 5) * 4;          // 4,8,12,16,20
+    const int W = 4 + (data[1] % 5) * 4;
+    const int P = H * W;
+
+    auto fill = [&](std::vector<float> & buf, size_t off) {
+        // reinterpret the fuzz bytes as floats (NaN/Inf/denormals all appear),
+        // tiling deterministically to fill the whole byte range.
+        uint8_t * b = (uint8_t *) buf.data();
+        const size_t nbytes = buf.size() * sizeof(float);
+        for (size_t i = 0; i < nbytes; i++) b[i] = data[(off + i) % size];
+    };
+
+    // 1) estimate_poses on a 2-view buffer
+    {
+        std::vector<float> g((size_t) 2 * P * gc);
+        fill(g, 2);
+        std::vector<float> c2w((size_t) 2 * 16);
+        float focal = 0;
+        free_splatter_estimate_poses(g.data(), 2, H, W, gc, 0.05f, c2w.data(), &focal);
+    }
+
+    // 2) accumulator: add a few pairs, then cloud / fuse / camera_path
+    free_splatter_accumulator * acc = free_splatter_accumulator_new(H, W, 0.05f);
+    if (acc) {
+        const int npairs = 1 + (data[2] % 3);     // 1..3 pairs
+        std::vector<float> g((size_t) 2 * P * gc);
+        for (int k = 0; k < npairs; k++) {
+            fill(g, (size_t) 3 + k);
+            free_splatter_accumulator_add_pair(acc, g.data(), gc);
+        }
+        free_splatter_point * cloud = nullptr; size_t nc = 0;
+        if (free_splatter_accumulator_cloud(acc, &cloud, &nc) == 0) free_splatter_buf_free(cloud);
+
+        const float voxel = 0.005f + (data[3] % 16) * 0.01f;   // 0.005..0.155
+        const int k = 1 + (data[4] % 4);                       // 1..4
+        free_splatter_point * fused = nullptr; size_t nf = 0;
+        if (free_splatter_accumulator_fuse(acc, voxel, k, &fused, &nf) == 0) free_splatter_buf_free(fused);
+
+        float * path = nullptr; int32_t nfr = 0;
+        if (free_splatter_accumulator_camera_path(acc, &path, &nfr) == 0) free_splatter_buf_free(path);
+
+        free_splatter_accumulator_free(acc);
+    }
+    return 0;
+}
diff --git a/src/pose.cpp b/src/pose.cpp
index 78cc1b3..f71a8ac 100644
--- a/src/pose.cpp
+++ b/src/pose.cpp
@@ -218,6 +218,14 @@ Sim3 fit_similarity(const double * X, const double * Y, int N, bool with_scale)
 Sim3 fit_similarity_ransac(const double * X, const double * Y, int N, double thresh,
                            int iters, std::vector<char> & inliers,
                            bool with_scale, uint64_t seed) {
+    // RANSAC's minimal sample is 3 pairs; with fewer there is nothing to sample
+    // (and the sampler would divide by N). Degenerate but valid at the public
+    // boundary (e.g. an image pair with no overlapping valid pixels): take all
+    // points as inliers, and a plain fit when there is at least one.
+    if (N < 3) {
+        inliers.assign(N, 1);
+        return (N >= 1) ? fit_similarity(X, Y, N, with_scale) : sim_identity();
+    }
     Rng rng(seed);
     std::vector<char> best(N, 0);
     int best_cnt = 0;
@@ -962,19 +970,30 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
     st.raw_points = (int64_t) cloud.size();
     if (cloud.empty()) return st;
 
+    // A non-finite point (NaN/Inf — possible since the cloud is engine output and
+    // this is a public boundary) would poison the extent and, worse, make the
+    // float->int voxel-coord cast UB. Skip them throughout; fuse only finite points.
+    auto finite = [](const AccumPoint & p) {
+        return std::isfinite(p.x) && std::isfinite(p.y) && std::isfinite(p.z);
+    };
     // extent = rms distance to centroid; voxel = voxel_frac * extent; coords from min.
     double mean[3] = {0,0,0}, lo[3] = {1e300,1e300,1e300};
+    int64_t nf = 0;
     for (const auto & p : cloud) {
+        if (!finite(p)) continue;
         mean[0]+=p.x; mean[1]+=p.y; mean[2]+=p.z;
         lo[0]=std::min(lo[0],(double)p.x); lo[1]=std::min(lo[1],(double)p.y); lo[2]=std::min(lo[2],(double)p.z);
+        nf++;
     }
-    for (int c = 0; c < 3; c++) mean[c] /= cloud.size();
+    if (nf == 0) return st;
+    for (int c = 0; c < 3; c++) mean[c] /= nf;
     double ss = 0;
     for (const auto & p : cloud) {
+        if (!finite(p)) continue;
         const double dx=p.x-mean[0], dy=p.y-mean[1], dz=p.z-mean[2];
         ss += dx*dx + dy*dy + dz*dz;
     }
-    const double ext = std::sqrt(ss / cloud.size());
+    const double ext = std::sqrt(ss / nf);
     const double v = voxel_frac * ext;
     if (!(v > 0)) return st;
 
@@ -988,10 +1007,16 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
     std::unordered_map<Key, Vox, KeyHash> grid;
     grid.reserve(cloud.size() / 2 + 1);
 
+    auto vcoord = [&](double x, double lov) -> int32_t {
+        const double c = std::floor((x - lov) / v);     // finite (x finite, v>0)
+        // clamp to int32 so an extreme-but-finite coordinate can't overflow the cast
+        if (c < -2147483640.0) return -2147483640;
+        if (c >  2147483640.0) return  2147483640;
+        return (int32_t) c;
+    };
     for (const auto & p : cloud) {
-        Key key{ (int32_t) std::floor((p.x - lo[0]) / v),
-                 (int32_t) std::floor((p.y - lo[1]) / v),
-                 (int32_t) std::floor((p.z - lo[2]) / v) };
+        if (!finite(p)) continue;
+        Key key{ vcoord(p.x, lo[0]), vcoord(p.y, lo[1]), vcoord(p.z, lo[2]) };
         Vox & vx = grid[key];
         vx.sx+=p.x; vx.sy+=p.y; vx.sz+=p.z; vx.sr+=p.r; vx.sg+=p.g; vx.sb+=p.b; vx.cnt++;
         bool seen = false;

From 9850bf32c8cb21aa00c4c2708ba5c289da172ed8 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 16:24:37 +0100
Subject: [PATCH 16/33] demo: accumulating-reconstruction viewer (cloud grows
 as photos are added)

A web demo of the live idea: feed a photo stream, watch one world assemble.

- web/accumulate.html: forks the index.html EWA splat renderer to play a SEQUENCE
  of clouds (the reconstruction from 2 photos, then 3, then 4, ...), with the
  input photos accumulating in the TOP-RIGHT filmstrip (newest highlighted) as
  each is folded in. Auto-advances + gentle auto-orbit; ?start=/auto=/ms=/spin=
  deep-link params. Seeds an unsorted draw order on each step so the cloud paints
  immediately (the depth-sort worker then refines it).
- scripts/make_accumulate_demo.sh: one engine pass over the frames via
  `free_splatter-cli --accumulate` -> acc_2.splat..acc_N.splat + input thumbnails
  + manifest.json + the viewer, a self-contained servable dir.
- CLI --splat-scale default 0.0015 -> 0.006 of extent (point clouds read as
  surfaces, not grains); documented in web/README.md.

Verified end-to-end on a RealEstate10K clip (8 frames -> 7 steps): headless
chromium/SwiftShader renders a coherent dining-room reconstruction that grows
259,702 -> 413,517 splats across the steps, filmstrip populating as designed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 scripts/make_accumulate_demo.sh |  53 +++++
 tools/free_splatter-cli.cpp     |   2 +-
 web/README.md                   |  22 ++
 web/accumulate.html             | 351 ++++++++++++++++++++++++++++++++
 4 files changed, 427 insertions(+), 1 deletion(-)
 create mode 100755 scripts/make_accumulate_demo.sh
 create mode 100644 web/accumulate.html

diff --git a/scripts/make_accumulate_demo.sh b/scripts/make_accumulate_demo.sh
new file mode 100755
index 0000000..4e6ce94
--- /dev/null
+++ b/scripts/make_accumulate_demo.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# Bake the accumulating-reconstruction demo: run the engine over a stream of
+# photos, fold them into one growing world (free_splatter-cli --accumulate), and
+# lay out a self-contained web demo — the cloud reconstructed from 2 photos, then
+# 3, then 4, …, plus the input thumbnails and the accumulate.html viewer.
+#
+#   MODEL=.cache/freesplatter-scene-f16.gguf \
+#     scripts/make_accumulate_demo.sh OUT_DIR FRAME0 FRAME1 FRAME2 ...
+#
+# Then serve it:  cd OUT_DIR && python3 -m http.server 8080   (open index.html)
+set -euo pipefail
+cd "$(dirname "$0")/.."
+ROOT=$(pwd)
+MODEL=${MODEL:-.cache/freesplatter-scene-f16.gguf}
+MAXSPLATS=${MAXSPLATS:-500000}
+DEVICE=${DEVICE:-cpu}
+CLI=${CLI:-$ROOT/build/release/bin/free_splatter-cli}
+
+[ "$#" -ge 3 ] || { echo "usage: $0 OUT_DIR FRAME0 FRAME1 [FRAME2 ...]  (>=2 frames)" >&2; exit 1; }
+OUT=$1; shift
+FRAMES=("$@")
+mkdir -p "$OUT"
+
+# 1) input thumbnails: center-crop square + resize, mirroring the engine's own
+#    preprocessing so the strip shows exactly the view that was reconstructed.
+for i in "${!FRAMES[@]}"; do
+  ffmpeg -y -loglevel error -i "${FRAMES[$i]}" \
+    -vf "crop='min(iw,ih)':'min(iw,ih)',scale=360:360" "$OUT/view_$i.jpg"
+done
+
+# 2) accumulate: one engine pass over the stream writes acc_2.splat, acc_3.splat,
+#    ... (the cloud after each photo is folded in).
+"$CLI" --device "$DEVICE" --accumulate --splat-prefix "$OUT/acc" \
+  --max-splats "$MAXSPLATS" "$MODEL" "${FRAMES[@]}"
+
+# 3) manifest.json: one step per added photo (acc_n.splat + the n thumbnails,
+#    the last being the one just added).
+n=${#FRAMES[@]}
+{
+  echo '{ "steps": ['
+  for ((k=2;k<=n;k++)); do
+    imgs=""
+    for ((j=0;j<k;j++)); do imgs+="\"view_$j.jpg\""; [ $j -lt $((k-1)) ] && imgs+=", "; done
+    printf '    {"splat":"acc_%d.splat","images":[%s],"n":%d}' "$k" "$imgs" "$k"
+    [ $k -lt $n ] && echo "," || echo ""
+  done
+  echo '  ] }'
+} > "$OUT/manifest.json"
+
+# 4) the viewer, self-contained next to its assets
+cp "$ROOT/web/accumulate.html" "$OUT/index.html"
+echo "demo baked -> $OUT"
+echo "serve: (cd $OUT && python3 -m http.server 8080)  then open http://localhost:8080/"
diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp
index 683db95..0ad110e 100644
--- a/tools/free_splatter-cli.cpp
+++ b/tools/free_splatter-cli.cpp
@@ -151,7 +151,7 @@ int main(int argc, char ** argv) {
     float opac_thr = 5e-3f;
     long  max_splats = 0;
     bool  accumulate = false, fuse = false;
-    float voxel = 0.02f, splat_scale = 0.0015f;
+    float voxel = 0.02f, splat_scale = 0.006f;   // splat radius as a fraction of cloud extent
     int   fuse_k = 2;
     std::vector<std::string> inputs;
 
diff --git a/web/README.md b/web/README.md
index 7ca0ec3..094e258 100644
--- a/web/README.md
+++ b/web/README.md
@@ -24,6 +24,28 @@ python3 -m http.server -d web 8000      # then open http://localhost:8000
 
 Load a different file with `?splat=other.splat`.
 
+## Accumulating-reconstruction demo (`accumulate.html`)
+
+A second viewer that plays the **growing world** the pose pipeline builds: the
+cloud reconstructed from 2 photos, then 3, then 4, … as each new view is folded
+in (`free_splatter-cli --accumulate` → cross-run Sim(3) alignment). The input
+photos appear one-per-step in the top-right filmstrip, newest highlighted; the
+WebGL renderer is the same EWA splatting as `index.html`.
+
+```sh
+# bake the demo (engine pass over a photo stream -> a self-contained demo dir)
+nix develop -c scripts/make_accumulate_demo.sh .cache/demo/accumulate \
+    frames/f0000.png frames/f0020.png frames/f0040.png frames/f0060.png ...
+
+# serve it (the bake copies accumulate.html in as index.html)
+python3 -m http.server -d .cache/demo/accumulate 8080   # open http://localhost:8080
+```
+
+URL params: `?ms=2600` step interval, `?start=N` begin at step N, `?auto=0` no
+auto-advance, `?spin=0` no auto-orbit. The cloud shown is the raw accumulated
+cloud; `--fuse` also writes a consensus-fused `acc_fused.splat` (the edge-haze
+floaters removed).
+
 ## Sanity-checking the data separately
 
 The `.splat` uses the antimatter15 layout, so you can verify the *data* (apart
diff --git a/web/accumulate.html b/web/accumulate.html
new file mode 100644
index 0000000..2fe6b39
--- /dev/null
+++ b/web/accumulate.html
@@ -0,0 +1,351 @@
+<!doctype html>
+<!--
+  free-splatter.cpp — ACCUMULATING reconstruction demo.
+  Plays a sequence of splats produced by `free_splatter-cli --accumulate`: the
+  cloud reconstructed from 2 photos, then 3, then 4, … as each new view is folded
+  into one world (cross-run Sim(3) alignment). The input photos appear, one per
+  step, in the top-right filmstrip — the newest highlighted. The WebGL splat
+  renderer is the same EWA formulation as index.html (3DGS / antimatter15).
+
+  Assets come from a manifest.json next to this page:
+    { "steps": [ {"splat":"acc_2.splat","images":["f0.jpg","f1.jpg"],"n":2}, … ] }
+  Each step's splat is the COMPLETE cloud at n frames; images lists every photo
+  folded in so far (the last entry is the one just added).
+-->
+<html lang="en">
+<head>
+<meta charset="utf-8"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>free-splatter.cpp — accumulating reconstruction</title>
+<style>
+  :root { color-scheme: dark; }
+  * { margin: 0; box-sizing: border-box; }
+  html, body { height: 100%; background: #0b0d10; overflow: hidden; font-family: ui-sans-serif, system-ui, sans-serif; }
+  #gl { position: fixed; inset: 0; width: 100%; height: 100%; display: block; }
+  #panel {
+    position: fixed; top: 16px; left: 16px; z-index: 10; width: 264px;
+    background: rgba(16,19,24,.82); backdrop-filter: blur(8px);
+    border: 1px solid rgba(255,255,255,.08); border-radius: 12px; padding: 14px 16px;
+    color: #e8eaed; font-size: 13px; line-height: 1.45; box-shadow: 0 8px 30px rgba(0,0,0,.4);
+  }
+  #panel h1 { font-size: 14px; font-weight: 650; letter-spacing: .2px; }
+  #panel .sub { color: #8b93a1; font-size: 11.5px; margin-top: 2px; }
+  #bar { height: 6px; background: rgba(255,255,255,.08); border-radius: 99px; overflow: hidden; margin: 12px 0 8px; }
+  #bar > div { height: 100%; width: 0; background: linear-gradient(90deg,#5b8cff,#8a5bff); transition: width .35s ease; }
+  .stat { display: flex; justify-content: space-between; font-variant-numeric: tabular-nums; }
+  .stat span:first-child { color: #8b93a1; }
+  #status { margin-top: 8px; color: #9fd3a0; font-size: 11.5px; min-height: 1.2em; }
+  #status.err { color: #ff8a8a; }
+  #ctl { display: flex; gap: 6px; margin-top: 10px; }
+  #ctl button { flex: 1; padding: 5px 8px; font-size: 12px; color: #e8eaed; cursor: pointer;
+    background: rgba(255,255,255,.06); border: 1px solid rgba(255,255,255,.12); border-radius: 8px; }
+  #ctl button:hover { background: rgba(255,255,255,.12); }
+  /* top-right filmstrip of the input photos, newest on top */
+  #strip { position: fixed; top: 16px; right: 16px; z-index: 10; width: 168px;
+    display: flex; flex-direction: column-reverse; gap: 8px; align-items: flex-end; }
+  #strip .thumb { width: 156px; border-radius: 8px; overflow: hidden; position: relative;
+    border: 1px solid rgba(255,255,255,.12); box-shadow: 0 6px 18px rgba(0,0,0,.45);
+    opacity: .55; transform: scale(.92); transition: opacity .4s ease, transform .4s ease, filter .4s ease;
+    filter: saturate(.8); background: #11141a; }
+  #strip .thumb img { width: 100%; display: block; }
+  #strip .thumb .tag { position: absolute; left: 6px; bottom: 5px; font-size: 10px; padding: 1px 6px;
+    border-radius: 6px; background: rgba(0,0,0,.55); color: #cdd3dc; letter-spacing: .3px; }
+  #strip .thumb.new { opacity: 1; transform: scale(1); filter: saturate(1.05);
+    border-color: rgba(122,162,255,.9); box-shadow: 0 0 0 2px rgba(122,162,255,.5), 0 8px 22px rgba(0,0,0,.5); }
+  #strip .thumb.new .tag { background: rgba(91,140,255,.92); color: #fff; }
+  #hint { position: fixed; bottom: 14px; left: 50%; transform: translateX(-50%); z-index: 10;
+    color: #6b7280; font-size: 11.5px; background: rgba(16,19,24,.7); padding: 6px 12px; border-radius: 8px; }
+</style>
+</head>
+<body>
+<canvas id="gl"></canvas>
+<div id="panel">
+  <h1>accumulating reconstruction</h1>
+  <div class="sub">free-splatter.cpp — one world from a growing photo stream.</div>
+  <div id="bar"><div></div></div>
+  <div class="stat"><span>photos folded in</span><span id="s-frames">0</span></div>
+  <div class="stat"><span>splats</span><span id="s-splats">0</span></div>
+  <div class="stat"><span>fps</span><span id="s-fps">—</span></div>
+  <div id="status">starting…</div>
+  <div id="ctl"><button id="prev">‹ prev</button><button id="play">⏸ pause</button><button id="next">next ›</button></div>
+</div>
+<div id="strip"></div>
+<div id="hint">drag to orbit · scroll to zoom · the cloud grows as each photo is added</div>
+
+<script type="module">
+const ROW = 32;                       // bytes per splat
+const _q = new URLSearchParams(location.search);
+const BASE = (_q.get("dir") || ".").replace(/\/$/, "");
+const STEP_MS = +(_q.get("ms") || 2600);
+const START_STEP = parseInt(_q.get("start") || "0", 10);   // begin at this step
+const AUTOPLAY = _q.get("auto") !== "0";                    // ?auto=0 = no advance
+const $ = (id) => document.getElementById(id);
+const setStatus = (t, err=false) => { const s=$("status"); s.textContent=t; s.classList.toggle("err",err); };
+
+const canvas = $("gl");
+const gl = canvas.getContext("webgl2", { antialias: false, premultipliedAlpha: true, alpha: false });
+if (!gl) { setStatus("WebGL2 not available in this browser.", true); throw new Error("no webgl2"); }
+
+// ---------- shaders (EWA splatting — identical to index.html) ----------
+const VERT = `#version 300 es
+precision highp float; precision highp int;
+uniform highp usampler2D u_tex;
+uniform mat4 u_proj, u_view;
+uniform vec2 u_focal, u_viewport;
+in vec2 a_corner;
+in uint a_index;
+out vec4 v_color;
+out vec2 v_pos;
+void main() {
+  uint t = a_index * 2u;
+  ivec2 wh = textureSize(u_tex, 0);
+  uvec4 A = texelFetch(u_tex, ivec2(int(t % uint(wh.x)), int(t / uint(wh.x))), 0);
+  uvec4 B = texelFetch(u_tex, ivec2(int((t+1u) % uint(wh.x)), int((t+1u) / uint(wh.x))), 0);
+  vec3 center = uintBitsToFloat(A.xyz);
+  vec3 scale  = vec3(uintBitsToFloat(A.w), uintBitsToFloat(B.x), uintBitsToFloat(B.y));
+  uint rgba = B.z, rotp = B.w;
+  v_color = vec4(float(rgba & 0xffu), float((rgba>>8)&0xffu), float((rgba>>16)&0xffu), float((rgba>>24)&0xffu)) / 255.0;
+  vec4 q = (vec4(float(rotp&0xffu), float((rotp>>8)&0xffu), float((rotp>>16)&0xffu), float((rotp>>24)&0xffu)) - 128.0) / 128.0;
+  q = normalize(q);
+  float w=q.x, x=q.y, y=q.z, z=q.w;
+  mat3 R = mat3(
+    1.-2.*(y*y+z*z), 2.*(x*y+w*z),   2.*(x*z-w*y),
+    2.*(x*y-w*z),    1.-2.*(x*x+z*z),2.*(y*z+w*x),
+    2.*(x*z+w*y),    2.*(y*z-w*x),   1.-2.*(x*x+y*y));
+  mat3 S = mat3(scale.x,0.,0., 0.,scale.y,0., 0.,0.,scale.z);
+  mat3 M = R * S;
+  mat3 cov3d = M * transpose(M);
+  vec4 cam = u_view * vec4(center, 1.0);
+  if (cam.z >= -0.2) { gl_Position = vec4(0.0,0.0,2.0,1.0); return; }
+  vec4 clip = u_proj * cam;
+  if (clip.w <= 0.0 || abs(clip.x) > 1.3*clip.w || abs(clip.y) > 1.3*clip.w) {
+    gl_Position = vec4(0.0,0.0,2.0,1.0); return;
+  }
+  mat3 J = mat3(
+    u_focal.x/cam.z, 0.0, -(u_focal.x*cam.x)/(cam.z*cam.z),
+    0.0, -u_focal.y/cam.z, (u_focal.y*cam.y)/(cam.z*cam.z),
+    0.0, 0.0, 0.0);
+  mat3 W = transpose(mat3(u_view));
+  mat3 T = W * J;
+  mat3 cov2 = transpose(T) * cov3d * T;
+  float a = cov2[0][0] + 0.3, b = cov2[0][1], c = cov2[1][1] + 0.3;
+  float det = a*c - b*b;
+  if (det <= 0.0) { gl_Position = vec4(0.0,0.0,2.0,1.0); return; }
+  float mid = 0.5*(a+c);
+  float rad = sqrt(max(0.0, mid*mid - det));
+  float l1 = mid + rad;
+  float l2 = max(mid - rad, 0.1);
+  vec2 d = vec2(b, l1 - a);
+  vec2 e1 = (dot(d,d) > 1e-9) ? normalize(d) : vec2(1.0, 0.0);
+  float MAXPX = 0.3 * min(u_viewport.x, u_viewport.y);
+  vec2 v1 = min(sqrt(2.0*l1), MAXPX) * e1;
+  vec2 v2 = min(sqrt(2.0*l2), MAXPX) * vec2(e1.y, -e1.x);
+  vec2 ndc = clip.xy / clip.w;
+  vec2 offset = (a_corner.x*v1 + a_corner.y*v2) / u_viewport * 2.0;
+  v_pos = a_corner;
+  gl_Position = vec4(ndc + offset, clip.z/clip.w, 1.0);
+}`;
+const FRAG = `#version 300 es
+precision highp float;
+in vec4 v_color; in vec2 v_pos;
+out vec4 o;
+void main() {
+  float p = -dot(v_pos, v_pos);
+  if (p < -4.0) discard;
+  float alpha = exp(p) * v_color.a;
+  o = vec4(v_color.rgb * alpha, alpha);
+}`;
+function shader(type, src){ const s=gl.createShader(type); gl.shaderSource(s,src); gl.compileShader(s);
+  if(!gl.getShaderParameter(s,gl.COMPILE_STATUS)) throw new Error(gl.getShaderInfoLog(s)); return s; }
+const prog = gl.createProgram();
+gl.attachShader(prog, shader(gl.VERTEX_SHADER, VERT));
+gl.attachShader(prog, shader(gl.FRAGMENT_SHADER, FRAG));
+gl.linkProgram(prog);
+if(!gl.getProgramParameter(prog, gl.LINK_STATUS)) { setStatus(gl.getProgramInfoLog(prog), true); throw new Error("link"); }
+gl.useProgram(prog);
+
+const quad = new Float32Array([-2,-2, 2,-2, 2,2, -2,-2, 2,2, -2,2]);
+const quadBuf = gl.createBuffer(); gl.bindBuffer(gl.ARRAY_BUFFER, quadBuf); gl.bufferData(gl.ARRAY_BUFFER, quad, gl.STATIC_DRAW);
+const aCorner = gl.getAttribLocation(prog, "a_corner");
+gl.enableVertexAttribArray(aCorner); gl.vertexAttribPointer(aCorner, 2, gl.FLOAT, false, 0, 0);
+const idxBuf = gl.createBuffer();
+const aIndex = gl.getAttribLocation(prog, "a_index");
+const tex = gl.createTexture();
+const TEX_W = 2048;
+const U = {
+  proj: gl.getUniformLocation(prog,"u_proj"), view: gl.getUniformLocation(prog,"u_view"),
+  focal: gl.getUniformLocation(prog,"u_focal"), viewport: gl.getUniformLocation(prog,"u_viewport"),
+};
+gl.uniform1i(gl.getUniformLocation(prog,"u_tex"), 0);
+gl.disable(gl.DEPTH_TEST);
+gl.enable(gl.BLEND);
+gl.blendFuncSeparate(gl.ONE, gl.ONE_MINUS_SRC_ALPHA, gl.ONE, gl.ONE_MINUS_SRC_ALPHA);
+
+// ---------- splat store ----------
+let count = 0;
+let positions = new Float32Array(0);
+let texData = new Uint32Array(0);
+function ensureCapacity(n){ if (n*8 <= texData.length) return;
+  const cap = Math.max(n, Math.ceil(texData.length/8*1.5), 65536);
+  const td = new Uint32Array(cap*8); td.set(texData); texData = td;
+  const ps = new Float32Array(cap*3); ps.set(positions); positions = ps; }
+function uploadTexture(){
+  const rows = Math.max(1, Math.ceil((count*2)/TEX_W));
+  const buf = new Uint32Array(TEX_W*rows*4);
+  buf.set(texData.subarray(0, count*8));
+  gl.activeTexture(gl.TEXTURE0); gl.bindTexture(gl.TEXTURE_2D, tex);
+  gl.texImage2D(gl.TEXTURE_2D, 0, gl.RGBA32UI, TEX_W, rows, 0, gl.RGBA_INTEGER, gl.UNSIGNED_INT, buf);
+  gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MIN_FILTER, gl.NEAREST);
+  gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MAG_FILTER, gl.NEAREST);
+}
+// load a whole step splat, replacing the store
+function setSplat(bytes){
+  const n = Math.floor(bytes.byteLength / ROW);
+  ensureCapacity(n);
+  const f32 = new Float32Array(bytes.buffer, bytes.byteOffset, n*8);
+  const u32 = new Uint32Array(bytes.buffer, bytes.byteOffset, n*8);
+  texData.set(u32.subarray(0, n*8), 0);
+  for (let i=0;i<n;i++){ positions[i*3]=f32[i*8]; positions[i*3+1]=f32[i*8+1]; positions[i*3+2]=f32[i*8+2]; }
+  count = n;
+  uploadTexture();
+  // Seed an unsorted draw order so the cloud paints immediately; the depth-sort
+  // worker refines it to proper back-to-front asynchronously. Without this there
+  // is a blank gap each step until the worker returns (and a single headless
+  // frame would capture nothing).
+  const seq = new Uint32Array(count);
+  for (let i=0;i<count;i++) seq[i]=i;
+  gl.bindBuffer(gl.ARRAY_BUFFER, idxBuf); gl.bufferData(gl.ARRAY_BUFFER, seq, gl.DYNAMIC_DRAW);
+  sortedIndex = seq; sortedCount = count;
+  $("s-splats").textContent = count.toLocaleString();
+  sortDirty = true;
+}
+
+// ---------- depth-sort worker (identical to index.html) ----------
+const worker = new Worker(URL.createObjectURL(new Blob([`
+let pos=null, n=0;
+onmessage = (e) => {
+  if (e.data.pos) { pos = e.data.pos; n = e.data.n; return; }
+  if (!pos) return;
+  const view = e.data.view, N = Math.min(n, e.data.n);
+  const d = new Float32Array(N); let dmin=Infinity, dmax=-Infinity;
+  for (let i=0;i<N;i++){ const z = view[2]*pos[i*3] + view[6]*pos[i*3+1] + view[10]*pos[i*3+2];
+    d[i]=z; if(z<dmin)dmin=z; if(z>dmax)dmax=z; }
+  const B=65536, cnt=new Uint32Array(B);
+  const scale = dmax>dmin ? (B-1)/(dmax-dmin) : 0;
+  const key=new Uint16Array(N);
+  for(let i=0;i<N;i++){ let k=((d[i]-dmin)*scale)|0; k=k<0?0:k>65535?65535:k; key[i]=k; cnt[k]++; }
+  for(let i=1;i<B;i++) cnt[i]+=cnt[i-1];
+  const order=new Uint32Array(N);
+  for(let i=N-1;i>=0;i--) order[--cnt[key[i]]]=i;
+  postMessage({order, n:N}, [order.buffer]);
+};
+`], {type:"text/javascript"})));
+let sortedIndex = new Uint32Array(0), sortedCount = 0, sorting = false, sortDirty = true;
+worker.onmessage = (e) => { sortedIndex = e.data.order; sortedCount = e.data.n; sorting = false;
+  gl.bindBuffer(gl.ARRAY_BUFFER, idxBuf); gl.bufferData(gl.ARRAY_BUFFER, sortedIndex, gl.DYNAMIC_DRAW); };
+function requestSort(view){ if (sorting || count === 0) return; sorting = true;
+  worker.postMessage({ pos: positions.slice(0, count*3), n: count }); worker.postMessage({ view, n: count }); }
+
+// ---------- camera (orbit + zoom + gentle auto-spin) ----------
+const _p = new URLSearchParams(location.search);
+let target = [0,0,0], yaw = parseFloat(_p.get("yaw") ?? "0.0"), pitch = parseFloat(_p.get("pitch") ?? "0.05"), dist = 2.0;
+let framed = false, autospin = _p.get("spin") !== "0";
+function frameToData(mult){
+  if (count === 0) return;
+  let lo=[1e30,1e30,1e30], hi=[-1e30,-1e30,-1e30];
+  for (let i=0;i<count;i++) for (let k=0;k<3;k++){ const v=positions[i*3+k]; if(v<lo[k])lo[k]=v; if(v>hi[k])hi[k]=v; }
+  const diag = Math.hypot(hi[0]-lo[0], hi[1]-lo[1], hi[2]-lo[2]);
+  target = [(lo[0]+hi[0])/2, (lo[1]+hi[1])/2, (lo[2]+hi[2])/2];
+  dist = Math.max(0.5, (mult||1.0)*diag);
+  framed = true; sortDirty = true;
+}
+let drag=false, lx=0, ly=0;
+canvas.addEventListener("pointerdown", e => { drag=true; autospin=false; lx=e.clientX; ly=e.clientY; canvas.setPointerCapture(e.pointerId); });
+canvas.addEventListener("pointerup",   () => drag=false);
+canvas.addEventListener("pointermove", e => { if(!drag)return; yaw -= (e.clientX-lx)*0.005; pitch -= (e.clientY-ly)*0.005;
+  pitch = Math.max(-1.5, Math.min(1.5, pitch)); lx=e.clientX; ly=e.clientY; sortDirty=true; });
+canvas.addEventListener("wheel", e => { dist *= Math.exp(e.deltaY*0.001); dist=Math.max(0.2,Math.min(10,dist)); sortDirty=true; e.preventDefault(); }, {passive:false});
+
+function perspective(fovy, aspect, n, f){ const t=1/Math.tan(fovy/2); return new Float32Array([t/aspect,0,0,0, 0,t,0,0, 0,0,(f+n)/(n-f),-1, 0,0,(2*f*n)/(n-f),0]); }
+function lookAt(eye, c){ const z=norm(sub(eye,c)), x=norm(cross([0,1,0],z)), y=cross(z,x);
+  return new Float32Array([x[0],y[0],z[0],0, x[1],y[1],z[1],0, x[2],y[2],z[2],0, -dot(x,eye),-dot(y,eye),-dot(z,eye),1]); }
+const sub=(a,b)=>[a[0]-b[0],a[1]-b[1],a[2]-b[2]], dot=(a,b)=>a[0]*b[0]+a[1]*b[1]+a[2]*b[2];
+const cross=(a,b)=>[a[1]*b[2]-a[2]*b[1],a[2]*b[0]-a[0]*b[2],a[0]*b[1]-a[1]*b[0]];
+const norm=(a)=>{const l=Math.hypot(...a)||1;return [a[0]/l,a[1]/l,a[2]/l];};
+function resize(){ const dpr=Math.min(devicePixelRatio,2); canvas.width=innerWidth*dpr; canvas.height=innerHeight*dpr; gl.viewport(0,0,canvas.width,canvas.height); }
+addEventListener("resize", resize); resize();
+
+let frames=0, fpsT=performance.now(), lastT=performance.now();
+function render(){
+  const now=performance.now(), dt=(now-lastT)/1000; lastT=now;
+  if (autospin && !drag) { yaw += dt*0.18; sortDirty=true; }
+  const eye=[target[0]+dist*Math.sin(yaw)*Math.cos(pitch), target[1]+dist*Math.sin(pitch), target[2]+dist*Math.cos(yaw)*Math.cos(pitch)];
+  const view=lookAt(eye,target);
+  const aspect=canvas.width/canvas.height, fovy=1.0;
+  const proj=perspective(fovy, aspect, 0.05, 100);
+  const fy=canvas.height/(2*Math.tan(fovy/2)), fx=fy;
+  if(sortDirty){ requestSort(view); sortDirty=false; }
+  gl.clearColor(0.043,0.051,0.063,1); gl.clear(gl.COLOR_BUFFER_BIT);
+  gl.useProgram(prog);
+  gl.uniformMatrix4fv(U.view,false,view); gl.uniformMatrix4fv(U.proj,false,proj);
+  gl.uniform2f(U.focal,fx,fy); gl.uniform2f(U.viewport,canvas.width,canvas.height);
+  gl.bindBuffer(gl.ARRAY_BUFFER,quadBuf); gl.enableVertexAttribArray(aCorner); gl.vertexAttribPointer(aCorner,2,gl.FLOAT,false,0,0);
+  if(sortedCount>0){
+    gl.bindBuffer(gl.ARRAY_BUFFER,idxBuf); gl.enableVertexAttribArray(aIndex);
+    gl.vertexAttribIPointer(aIndex,1,gl.UNSIGNED_INT,0,0); gl.vertexAttribDivisor(aIndex,1);
+    gl.drawArraysInstanced(gl.TRIANGLES,0,6,sortedCount);
+  }
+  frames++; if(now-fpsT>500){ $("s-fps").textContent=Math.round(frames*1000/(now-fpsT)); frames=0; fpsT=now; }
+  requestAnimationFrame(render);
+}
+
+// ---------- step playback ----------
+let manifest=null, cur=-1, playing=AUTOPLAY, timer=null;
+function renderStrip(step){
+  const strip=$("strip"); strip.innerHTML="";
+  const imgs=step.images||[];
+  const start=Math.max(0, imgs.length-6);            // show at most the latest 6
+  for (let i=start;i<imgs.length;i++){
+    const d=document.createElement("div"); d.className="thumb"+(i===imgs.length-1?" new":"");
+    const im=document.createElement("img"); im.src=`${BASE}/${imgs[i]}`; im.alt="view "+(i+1);
+    const tag=document.createElement("div"); tag.className="tag"; tag.textContent=(i===imgs.length-1?"+ view ":"view ")+(i+1);
+    d.appendChild(im); d.appendChild(tag); strip.appendChild(d);
+  }
+}
+async function loadStep(i){
+  if (!manifest || i<0 || i>=manifest.steps.length) return;
+  cur=i; const step=manifest.steps[i];
+  $("bar").firstElementChild.style.width = Math.round((i+1)/manifest.steps.length*100)+"%";
+  $("s-frames").textContent = step.n;
+  setStatus(`folding view ${step.n} — ${step.splat}`);
+  renderStrip(step);
+  try {
+    const resp = await fetch(`${BASE}/${step.splat}`);
+    if(!resp.ok) throw new Error("HTTP "+resp.status+" for "+step.splat);
+    const bytes = new Uint8Array(await resp.arrayBuffer());
+    setSplat(bytes);
+    if(!framed) frameToData(1.25);                   // frame once; camera stays as it grows
+    setStatus(`${step.n} photos · ${count.toLocaleString()} splats`);
+  } catch(err){ setStatus(err.message, true); console.error(err); }
+}
+function schedule(){ clearTimeout(timer); if(!playing) return;
+  timer=setTimeout(async ()=>{ await loadStep((cur+1)%manifest.steps.length); schedule(); }, STEP_MS); }
+$("prev").onclick = ()=>{ playing=false; $("play").textContent="▶ play"; loadStep(Math.max(0,cur-1)); };
+$("next").onclick = ()=>{ playing=false; $("play").textContent="▶ play"; loadStep(Math.min(manifest.steps.length-1,cur+1)); };
+$("play").onclick = ()=>{ playing=!playing; $("play").textContent = playing?"⏸ pause":"▶ play"; schedule(); };
+
+(async function start(){
+  requestAnimationFrame(render);
+  try {
+    setStatus("loading manifest…");
+    manifest = await (await fetch(`${BASE}/manifest.json`)).json();
+    if(!manifest.steps || !manifest.steps.length) throw new Error("manifest has no steps");
+    const s0 = Math.max(0, Math.min(manifest.steps.length-1, START_STEP));
+    await loadStep(s0);
+    $("play").textContent = playing ? "⏸ pause" : "▶ play";
+    schedule();
+  } catch(err){ setStatus("manifest.json not found — "+err.message, true); console.error(err); }
+})();
+</script>
+</body>
+</html>

From ee5b02c3c2a30df148ae5565bfa12e85a45d90d4 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 18:33:37 +0100
Subject: [PATCH 17/33] pose: carry gaussian scale + rotation through
 accumulation (render as splats)

The accumulated cloud kept only position+color, so the demo rendered as an
isotropic point cloud. A similarity x->s*(R@x)+t scales a gaussian's covariance by
s^2 and rotates it by R, so the shape transforms cleanly: carry it.

- AccumPoint / free_splatter_point gain scale[3] + rotation quaternion (w,x,y,z).
- Accumulator::add_pair de-interleaves the engine's scale (ch16:19) and rotation
  (ch19:23); add_view sets scale_world = T.s * scale_local and q_world =
  quat(T.R) * q_local (new mat3_to_quat / quat_mul / quat_normalize helpers, with
  a zero/NaN-quaternion -> identity guard).
- consensus_fuse averages the scale and keeps a representative orientation.
- write_cloud_splat emits the real anisotropic scale + rotation (OpenCV->OpenGL
  remap, same as the single-run write_splat); --splat-scale is now a radius
  multiplier (default 1.0), not an isotropic fraction.

Verified: anisotropic scale spans the engine's [1e-4,0.02] (mean axis-ratio ~29)
with per-gaussian orientations; golden test_pose green; fuzz_pose clean (the
identity-quaternion guard keeps the new quat math UB-free on garbage input).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 include/free_splatter.h     |  8 +++--
 src/free_splatter.cpp       |  2 ++
 src/pose.cpp                | 59 +++++++++++++++++++++++++++++++++----
 src/pose.h                  |  9 ++++--
 tests/test_pose.cpp         |  8 +++--
 tools/free_splatter-cli.cpp | 36 ++++++++++------------
 6 files changed, 91 insertions(+), 31 deletions(-)

diff --git a/include/free_splatter.h b/include/free_splatter.h
index 2913c94..f486d56 100644
--- a/include/free_splatter.h
+++ b/include/free_splatter.h
@@ -93,11 +93,15 @@ FREE_SPLATTER_API int free_splatter_estimate_poses(
     int32_t gaussian_channels, float opacity_threshold,
     float * cam2world_out, float * focal_out);
 
-// One accumulated point in the global frame: position, color in [0,1], and the
-// source frame it came from (for consensus fusion).
+// One accumulated 3D gaussian in the global frame: position, color in [0,1], the
+// anisotropic scale and rotation quaternion (w,x,y,z) — transformed through the
+// cross-run Sim(3) so the cloud renders as oriented splats — and the source frame
+// it came from (for consensus fusion).
 typedef struct {
     float x, y, z;
     float r, g, b;
+    float sx, sy, sz;
+    float qw, qx, qy, qz;
     int32_t frame;
 } free_splatter_point;
 
diff --git a/src/free_splatter.cpp b/src/free_splatter.cpp
index 53fd94f..9dade9d 100644
--- a/src/free_splatter.cpp
+++ b/src/free_splatter.cpp
@@ -191,6 +191,8 @@ int emit_points(const std::vector<free_splatter::pose::AccumPoint> & src,
     for (size_t i = 0; i < src.size(); i++) {
         (*out)[i].x = src[i].x; (*out)[i].y = src[i].y; (*out)[i].z = src[i].z;
         (*out)[i].r = src[i].r; (*out)[i].g = src[i].g; (*out)[i].b = src[i].b;
+        (*out)[i].sx = src[i].sx; (*out)[i].sy = src[i].sy; (*out)[i].sz = src[i].sz;
+        (*out)[i].qw = src[i].qw; (*out)[i].qx = src[i].qx; (*out)[i].qy = src[i].qy; (*out)[i].qz = src[i].qz;
         (*out)[i].frame = src[i].frame;
     }
     *n_out = src.size();
diff --git a/src/pose.cpp b/src/pose.cpp
index f71a8ac..6d94f71 100644
--- a/src/pose.cpp
+++ b/src/pose.cpp
@@ -854,6 +854,38 @@ PoseResult estimate_poses(const std::vector<const float *> & points,
 namespace {
 inline float clamp01(float v) { return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v); }
 const double SH_C0 = 0.28209479177387814;   // SH degree-0 basis (DC -> rgb)
+
+// quaternion (w,x,y,z) from a rotation matrix (Shepperd's method)
+std::array<double,4> mat3_to_quat(const Mat3 & R) {
+    const double tr = R(0,0) + R(1,1) + R(2,2);
+    double w,x,y,z;
+    if (tr > 0) {
+        double s = std::sqrt(tr + 1.0) * 2;
+        w = 0.25*s; x = (R(2,1)-R(1,2))/s; y = (R(0,2)-R(2,0))/s; z = (R(1,0)-R(0,1))/s;
+    } else if (R(0,0) > R(1,1) && R(0,0) > R(2,2)) {
+        double s = std::sqrt(1.0 + R(0,0) - R(1,1) - R(2,2)) * 2;
+        w = (R(2,1)-R(1,2))/s; x = 0.25*s; y = (R(0,1)+R(1,0))/s; z = (R(0,2)+R(2,0))/s;
+    } else if (R(1,1) > R(2,2)) {
+        double s = std::sqrt(1.0 + R(1,1) - R(0,0) - R(2,2)) * 2;
+        w = (R(0,2)-R(2,0))/s; x = (R(0,1)+R(1,0))/s; y = 0.25*s; z = (R(1,2)+R(2,1))/s;
+    } else {
+        double s = std::sqrt(1.0 + R(2,2) - R(0,0) - R(1,1)) * 2;
+        w = (R(1,0)-R(0,1))/s; x = (R(0,2)+R(2,0))/s; y = (R(1,2)+R(2,1))/s; z = 0.25*s;
+    }
+    return {w,x,y,z};
+}
+// Hamilton product (w,x,y,z): compose rotation a then... q = a ⊗ b applies b first.
+std::array<double,4> quat_mul(const std::array<double,4> & a, const std::array<double,4> & b) {
+    return { a[0]*b[0] - a[1]*b[1] - a[2]*b[2] - a[3]*b[3],
+             a[0]*b[1] + a[1]*b[0] + a[2]*b[3] - a[3]*b[2],
+             a[0]*b[2] - a[1]*b[3] + a[2]*b[0] + a[3]*b[1],
+             a[0]*b[3] + a[1]*b[2] - a[2]*b[1] + a[3]*b[0] };
+}
+std::array<double,4> quat_normalize(std::array<double,4> q) {
+    const double n = std::sqrt(q[0]*q[0]+q[1]*q[1]+q[2]*q[2]+q[3]*q[3]);
+    if (n > 1e-12) for (auto & c : q) c /= n; else q = {1,0,0,0};
+    return q;
+}
 } // namespace
 
 Accumulator::Accumulator(int H, int W, double opacity_threshold,
@@ -862,8 +894,12 @@ Accumulator::Accumulator(int H, int W, double opacity_threshold,
       riters_(ransac_iters), seed_(seed), T_(sim_identity()), final_cam_(mat4_identity()) {}
 
 void Accumulator::add_view(const float * pts, const float * op, const float * rgb,
-                           const Sim3 & T, int frame) {
+                           const float * scl, const float * rot, const Sim3 & T, int frame) {
     const int P = H_ * W_;
+    // A similarity x -> s*(R@x)+t scales each gaussian's covariance by s^2 and
+    // rotates it by R: so the world-frame axis lengths are s*scale and the world
+    // rotation is qT ⊗ q_local. Compute qT (from T.R) once for the whole view.
+    const std::array<double,4> qT = mat3_to_quat(T.R);
     for (int i = 0; i < P; i++) {
         if (op[i] <= thr_) continue;
         Vec3 x = { pts[3*i], pts[3*i+1], pts[3*i+2] };
@@ -871,6 +907,10 @@ void Accumulator::add_view(const float * pts, const float * op, const float * rg
         AccumPoint p;
         p.x = (float) w[0]; p.y = (float) w[1]; p.z = (float) w[2];
         p.r = rgb[3*i]; p.g = rgb[3*i+1]; p.b = rgb[3*i+2];
+        p.sx = (float) (T.s * scl[3*i]); p.sy = (float) (T.s * scl[3*i+1]); p.sz = (float) (T.s * scl[3*i+2]);
+        std::array<double,4> ql = quat_normalize({ rot[4*i], rot[4*i+1], rot[4*i+2], rot[4*i+3] });
+        std::array<double,4> qw = quat_normalize(quat_mul(qT, ql));
+        p.qw = (float) qw[0]; p.qx = (float) qw[1]; p.qy = (float) qw[2]; p.qz = (float) qw[3];
         p.frame = frame;
         cloud_.push_back(p);
     }
@@ -881,6 +921,7 @@ ChainLink Accumulator::add_pair(const float * g, int gc, double focal) {
     // de-interleave the two views: points (ch 0:3), opacity (ch 15, already
     // sigmoid-activated), rgb = clip(SH_DC * C0 + 0.5, 0, 1) (ch 3:6).
     std::vector<float> pts0(3*P), pts1(3*P), op0(P), op1(P), rgb0(3*P), rgb1(3*P);
+    std::vector<float> scl0(3*P), scl1(3*P), rot0(4*P), rot1(4*P);
     for (int i = 0; i < P; i++) {
         const float * a0 = g + (size_t) i * gc;
         const float * a1 = g + (size_t) (P + i) * gc;
@@ -890,6 +931,9 @@ ChainLink Accumulator::add_pair(const float * g, int gc, double focal) {
             rgb0[3*i+c] = clamp01((float) (a0[3+c] * SH_C0 + 0.5));
             rgb1[3*i+c] = clamp01((float) (a1[3+c] * SH_C0 + 0.5));
         }
+        // gaussian shape: scale ch16:19, rotation quaternion (w,x,y,z) ch19:23
+        for (int c = 0; c < 3; c++) { scl0[3*i+c] = a0[16+c]; scl1[3*i+c] = a1[16+c]; }
+        for (int c = 0; c < 4; c++) { rot0[4*i+c] = a0[19+c]; rot1[4*i+c] = a1[19+c]; }
     }
 
     // recover this pair's cameras (view-0 frame, use_first_focal)
@@ -904,8 +948,8 @@ ChainLink Accumulator::add_pair(const float * g, int gc, double focal) {
         link.sim = sim_identity();
         link.global = T_;
         link.scale = 1.0; link.inlier_frac = 1.0; link.valid_frac = 1.0; link.resid_frac = 0.0;
-        add_view(pts0.data(), op0.data(), rgb0.data(), T_, next_frame_++);   // f_0
-        add_view(pts1.data(), op1.data(), rgb1.data(), T_, next_frame_++);   // f_1
+        add_view(pts0.data(), op0.data(), rgb0.data(), scl0.data(), rot0.data(), T_, next_frame_++);   // f_0
+        add_view(pts1.data(), op1.data(), rgb1.data(), scl1.data(), rot1.data(), T_, next_frame_++);   // f_1
     } else {
         // fit Sim(3) from this run's view 0 (the shared frame, run-k coords) to the
         // previous run's view 1 (same frame, run-(k-1) coords). Mask = valid in both.
@@ -940,7 +984,7 @@ ChainLink Accumulator::add_pair(const float * g, int gc, double focal) {
         link.valid_frac  = (double) n / P;
         link.resid_frac  = (scene > 0 && n_inl > 0) ? rms3(rA.data(), n_inl) / scene : 0.0;
 
-        add_view(pts1.data(), op1.data(), rgb1.data(), T_, next_frame_++);   // f_{k+1}
+        add_view(pts1.data(), op1.data(), rgb1.data(), scl1.data(), rot1.data(), T_, next_frame_++);   // f_{k+1}
     }
 
     cams_.push_back(mat4_mul(sim_matrix(T_), c2w0));     // frame f_k camera (view 0)
@@ -997,7 +1041,8 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
     const double v = voxel_frac * ext;
     if (!(v > 0)) return st;
 
-    struct Vox { double sx=0,sy=0,sz=0,sr=0,sg=0,sb=0; int64_t cnt=0; std::vector<int32_t> frames; };
+    struct Vox { double sx=0,sy=0,sz=0,sr=0,sg=0,sb=0; double ssx=0,ssy=0,ssz=0; int64_t cnt=0;
+                 float q[4]={1,0,0,0}; bool has_q=false; std::vector<int32_t> frames; };
     struct Key { int32_t i,j,k; bool operator==(const Key&o) const { return i==o.i&&j==o.j&&k==o.k; } };
     struct KeyHash { size_t operator()(const Key&q) const {
         uint64_t h = (uint64_t)(uint32_t)q.i * 0x9E3779B97F4A7C15ULL;
@@ -1019,6 +1064,8 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
         Key key{ vcoord(p.x, lo[0]), vcoord(p.y, lo[1]), vcoord(p.z, lo[2]) };
         Vox & vx = grid[key];
         vx.sx+=p.x; vx.sy+=p.y; vx.sz+=p.z; vx.sr+=p.r; vx.sg+=p.g; vx.sb+=p.b; vx.cnt++;
+        vx.ssx+=p.sx; vx.ssy+=p.sy; vx.ssz+=p.sz;
+        if (!vx.has_q) { vx.q[0]=p.qw; vx.q[1]=p.qx; vx.q[2]=p.qy; vx.q[3]=p.qz; vx.has_q=true; } // representative orientation
         bool seen = false;
         for (int32_t f : vx.frames) if (f == p.frame) { seen = true; break; }
         if (!seen) vx.frames.push_back(p.frame);
@@ -1033,6 +1080,8 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
         AccumPoint p;
         p.x = (float) (vx.sx / vx.cnt); p.y = (float) (vx.sy / vx.cnt); p.z = (float) (vx.sz / vx.cnt);
         p.r = (float) (vx.sr / vx.cnt); p.g = (float) (vx.sg / vx.cnt); p.b = (float) (vx.sb / vx.cnt);
+        p.sx = (float) (vx.ssx / vx.cnt); p.sy = (float) (vx.ssy / vx.cnt); p.sz = (float) (vx.ssz / vx.cnt);
+        p.qw = vx.q[0]; p.qx = vx.q[1]; p.qy = vx.q[2]; p.qz = vx.q[3];
         p.frame = (int32_t) vx.frames.size();   // support count (informational)
         fused.push_back(p);
     }
diff --git a/src/pose.h b/src/pose.h
index 7641d1a..d4d96fc 100644
--- a/src/pose.h
+++ b/src/pose.h
@@ -111,11 +111,15 @@ PoseResult estimate_poses(const std::vector<const float *> & points,
                           bool normalize = false, uint64_t seed = 0);
 
 // ---- accumulation: chain successive runs into one world (mirrors accumulate.py)
-// One accumulated point in the global frame, tagged with the source frame it came
-// from (consensus fusion needs the frame id). rgb is in [0,1].
+// One accumulated 3D gaussian in the global frame, tagged with the source frame
+// it came from (consensus fusion needs the frame id). Carries the full anisotropic
+// shape (scale + rotation quaternion w,x,y,z), transformed through the cross-run
+// Sim(3), so the cloud renders as oriented splats, not isotropic points. rgb [0,1].
 struct AccumPoint {
     float x, y, z;
     float r, g, b;
+    float sx, sy, sz;        // anisotropic gaussian scale (global frame)
+    float qw, qx, qy, qz;    // gaussian rotation quaternion (w,x,y,z), global frame
     int32_t frame;
 };
 
@@ -156,6 +160,7 @@ class Accumulator {
 
 private:
     void add_view(const float * pts, const float * op, const float * rgb,
+                  const float * scl, const float * rot,    // P*3 scale, P*4 quat (w,x,y,z)
                   const Sim3 & T, int frame);
 
     int H_, W_;
diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp
index 53e5ec9..405409e 100644
--- a/tests/test_pose.cpp
+++ b/tests/test_pose.cpp
@@ -527,11 +527,15 @@ static void test_consensus_fuse() {
     std::vector<AccumPoint> cloud;
     // 5 surface points, each corroborated by frames {0,1,2} (tight cluster -> one
     // voxel, support 3); 20 isolated single-frame floaters far apart (support 1).
+    // AccumPoint: {xyz, rgb, scale[3], quat(w,x,y,z), frame}; scale/quat are
+    // unused by fusion's counts here, set to a unit gaussian / identity rotation.
     for (int loc = 0; loc < 5; loc++)
         for (int fr = 0; fr < 3; fr++)
-            cloud.push_back({ (float)(2.0*loc) + 0.001f*fr, 0.0f, 0.0f, 0.5f, 0.5f, 0.5f, fr });
+            cloud.push_back({ (float)(2.0*loc) + 0.001f*fr, 0.0f, 0.0f, 0.5f, 0.5f, 0.5f,
+                              1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, fr });
     for (int j = 0; j < 20; j++)
-        cloud.push_back({ 40.0f + 2.0f*j, 0.0f, 0.0f, 0.2f, 0.2f, 0.2f, j % 4 });
+        cloud.push_back({ 40.0f + 2.0f*j, 0.0f, 0.0f, 0.2f, 0.2f, 0.2f,
+                          1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, j % 4 });
 
     std::vector<AccumPoint> fused;
     FuseStats st = consensus_fuse(cloud, /*voxel_frac=*/0.02, /*k=*/2, fused);
diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp
index 0ad110e..2d64f38 100644
--- a/tools/free_splatter-cli.cpp
+++ b/tools/free_splatter-cli.cpp
@@ -93,32 +93,28 @@ static bool write_splat(const float * g, size_t n, int gc, float opac_thr,
     return true;
 }
 
-// Write an accumulated point cloud (xyz + rgb, no per-point covariance) as an
-// antimatter15 .splat, each point a small isotropic gaussian. Caps to max_splats
-// by stride-subsampling. scale_frac sets the splat radius as a fraction of the
-// cloud's rms extent.
+// Write an accumulated gaussian cloud as an antimatter15 .splat, emitting each
+// point's true anisotropic scale + rotation (carried through the Sim(3) by the
+// accumulator) so it renders as oriented splats, not isotropic dots. scale_mult
+// scales the radii (1.0 = as-predicted). Caps to max_splats by stride-subsampling.
 static bool write_cloud_splat(const free_splatter_point * pts, size_t n, size_t max_splats,
-                              float scale_frac, const char * path) {
-    // rms extent about the centroid -> per-point isotropic scale
-    double m[3] = {0,0,0};
-    for (size_t i = 0; i < n; i++) { m[0]+=pts[i].x; m[1]+=pts[i].y; m[2]+=pts[i].z; }
-    if (n) { m[0]/=n; m[1]/=n; m[2]/=n; }
-    double ss = 0;
-    for (size_t i = 0; i < n; i++) { const double dx=pts[i].x-m[0], dy=pts[i].y-m[1], dz=pts[i].z-m[2]; ss += dx*dx+dy*dy+dz*dz; }
-    const float ext = (float) std::sqrt(ss / (n ? n : 1));
-    const float scale = std::max(scale_frac * ext, 1e-6f);
-
+                              float scale_mult, const char * path) {
     const size_t stride = (max_splats > 0 && n > max_splats) ? (n + max_splats - 1) / max_splats : 1;
     std::ofstream f(path, std::ios::binary);
     if (!f) { std::fprintf(stderr, "cannot write %s\n", path); return false; }
     auto u8 = [](float v) -> unsigned char { float t = v < 0 ? 0 : v > 255 ? 255 : v; return (unsigned char) t; };
     size_t written = 0;
     for (size_t i = 0; i < n; i += stride) {
-        // OpenCV (y down, z forward) -> viewer OpenGL (y up): negate y,z.
-        float pos[3]   = { pts[i].x, -pts[i].y, -pts[i].z };
-        float sc[3]    = { scale, scale, scale };
-        unsigned char rgba[4] = { u8(pts[i].r*255.0f), u8(pts[i].g*255.0f), u8(pts[i].b*255.0f), 255 };
-        unsigned char rot[4]  = { 255, 128, 128, 128 };   // identity quaternion (w,x,y,z)
+        const free_splatter_point & p = pts[i];
+        // OpenCV (y down, z forward) -> viewer OpenGL (y up): negate y,z; quaternion
+        // (w,x,y,z) -> (-x, w, -z, y) (the same remap as the single-run write_splat).
+        float pos[3]   = { p.x, -p.y, -p.z };
+        float sc[3]    = { scale_mult*p.sx, scale_mult*p.sy, scale_mult*p.sz };
+        unsigned char rgba[4] = { u8(p.r*255.0f), u8(p.g*255.0f), u8(p.b*255.0f), 255 };
+        float q[4] = { -p.qx, p.qw, -p.qz, p.qy };
+        float nrm = std::sqrt(q[0]*q[0]+q[1]*q[1]+q[2]*q[2]+q[3]*q[3]) + 1e-12f;
+        unsigned char rot[4];
+        for (int c = 0; c < 4; c++) rot[c] = u8(q[c]/nrm * 128.0f + 128.0f);
         f.write((const char *) pos, 12);
         f.write((const char *) sc, 12);
         f.write((const char *) rgba, 4);
@@ -151,7 +147,7 @@ int main(int argc, char ** argv) {
     float opac_thr = 5e-3f;
     long  max_splats = 0;
     bool  accumulate = false, fuse = false;
-    float voxel = 0.02f, splat_scale = 0.006f;   // splat radius as a fraction of cloud extent
+    float voxel = 0.02f, splat_scale = 1.0f;   // multiplier on the predicted gaussian radii
     int   fuse_k = 2;
     std::vector<std::string> inputs;
 

From 546046ce79be0ff6eba7bc50b763827309b76990 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 20:58:03 +0100
Subject: [PATCH 18/33] demo: add consensus-fused step + accumulate from cached
 .f32 pair dumps

- free_splatter-cli --accumulate now also accepts pre-computed [2,H,W,gc] .f32
  pair dumps (each one pair) instead of images, skipping the engine -- for fast
  re-bakes and fusion sweeps off cached runs. Verified byte-identical acc_8 to the
  image path on the same frames, and it runs in ~4s vs ~2min.
- make_accumulate_demo.sh passes --fuse and appends a final consensus-fused step
  to the manifest; the viewer shows a step's optional "label" (so the demo ends on
  "consensus-fused -- single-view floaters removed", 413k -> 149k splats, the
  edge-haze floaters gone).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 scripts/make_accumulate_demo.sh | 15 ++++---
 tools/free_splatter-cli.cpp     | 69 ++++++++++++++++++++++-----------
 web/accumulate.html             |  3 +-
 3 files changed, 58 insertions(+), 29 deletions(-)

diff --git a/scripts/make_accumulate_demo.sh b/scripts/make_accumulate_demo.sh
index 4e6ce94..4ff775f 100755
--- a/scripts/make_accumulate_demo.sh
+++ b/scripts/make_accumulate_demo.sh
@@ -29,21 +29,24 @@ for i in "${!FRAMES[@]}"; do
 done
 
 # 2) accumulate: one engine pass over the stream writes acc_2.splat, acc_3.splat,
-#    ... (the cloud after each photo is folded in).
-"$CLI" --device "$DEVICE" --accumulate --splat-prefix "$OUT/acc" \
+#    ... (the cloud after each photo is folded in) + acc_fused.splat (the
+#    consensus surface: voxels seen by >= K frames, single-view floaters removed).
+"$CLI" --device "$DEVICE" --accumulate --fuse --splat-prefix "$OUT/acc" \
   --max-splats "$MAXSPLATS" "$MODEL" "${FRAMES[@]}"
 
-# 3) manifest.json: one step per added photo (acc_n.splat + the n thumbnails,
-#    the last being the one just added).
+# 3) manifest.json: one step per added photo (acc_n.splat + the n thumbnails, the
+#    last being the one just added), then a final consensus-fused step.
 n=${#FRAMES[@]}
+allimgs=""
+for ((j=0;j<n;j++)); do allimgs+="\"view_$j.jpg\""; [ $j -lt $((n-1)) ] && allimgs+=", "; done
 {
   echo '{ "steps": ['
   for ((k=2;k<=n;k++)); do
     imgs=""
     for ((j=0;j<k;j++)); do imgs+="\"view_$j.jpg\""; [ $j -lt $((k-1)) ] && imgs+=", "; done
-    printf '    {"splat":"acc_%d.splat","images":[%s],"n":%d}' "$k" "$imgs" "$k"
-    [ $k -lt $n ] && echo "," || echo ""
+    printf '    {"splat":"acc_%d.splat","images":[%s],"n":%d},\n' "$k" "$imgs" "$k"
   done
+  printf '    {"splat":"acc_fused.splat","images":[%s],"n":%d,"label":"consensus-fused — single-view floaters removed"}\n' "$allimgs" "$n"
   echo '  ] }'
 } > "$OUT/manifest.json"
 
diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp
index 2d64f38..045d537 100644
--- a/tools/free_splatter-cli.cpp
+++ b/tools/free_splatter-cli.cpp
@@ -187,35 +187,60 @@ int main(int argc, char ** argv) {
                 geo.image_width, geo.image_height, geo.in_channels, geo.gaussian_channels);
     if (inputs.empty()) { free_splatter_free(ctx); return 0; }
 
-    // ---- accumulate mode: chain consecutive image pairs into one world --------
+    // ---- accumulate mode: chain a photo stream into one world -----------------
+    // Two input forms: a stream of IMAGES (the engine runs on each consecutive
+    // pair), or pre-computed `.f32` pair dumps (each a [2,H,W,gc] engine output —
+    // the engine is skipped, for fast re-bakes / fusion sweeps off cached runs).
     if (accumulate) {
-        if (inputs.size() < 2) { std::fprintf(stderr, "--accumulate needs >=2 images\n"); free_splatter_free(ctx); return 2; }
-        const int sz = geo.image_width;
-        // decode every frame once (CHW per view)
+        const int sz = geo.image_width, gc = geo.gaussian_channels;
+        const bool dump_mode = std::all_of(inputs.begin(), inputs.end(),
+                                           [](const std::string & s){ return ends_with(s, ".f32"); });
+        if (!dump_mode && inputs.size() < 2) {
+            std::fprintf(stderr, "--accumulate needs >=2 images (or N .f32 pair dumps)\n"); free_splatter_free(ctx); return 2; }
+        const size_t npairs = dump_mode ? inputs.size() : inputs.size() - 1;
+        const int64_t per_view = (int64_t) geo.in_channels * sz * sz;
+        const size_t  pair_floats = (size_t) 2 * sz * sz * gc;
+
+        // image mode: decode every frame once (CHW per view)
         std::vector<std::vector<float>> frames;
-        for (const std::string & p : inputs) {
-            std::vector<float> img;
-            if (!load_image_chw(p.c_str(), sz, img)) { free_splatter_free(ctx); return 1; }
-            frames.push_back(std::move(img));
-        }
+        if (!dump_mode)
+            for (const std::string & p : inputs) {
+                std::vector<float> img;
+                if (!load_image_chw(p.c_str(), sz, img)) { free_splatter_free(ctx); return 1; }
+                frames.push_back(std::move(img));
+            }
+
         free_splatter_accumulator * acc = free_splatter_accumulator_new(sz, sz, opac_thr);
         if (!acc) { std::fprintf(stderr, "accumulator alloc failed\n"); free_splatter_free(ctx); return 1; }
 
-        const int64_t per_view = (int64_t) geo.in_channels * sz * sz;
-        for (size_t k = 0; k + 1 < frames.size(); k++) {
-            std::vector<float> pair(2 * per_view);
-            std::memcpy(&pair[0],        frames[k].data(),   per_view * sizeof(float));
-            std::memcpy(&pair[per_view], frames[k+1].data(), per_view * sizeof(float));
-            float * g = nullptr; size_t ng = 0;
-            if (free_splatter_run(ctx, pair.data(), 2, sz, sz, &g, &ng) != 0) {
-                std::fprintf(stderr, "run pair %zu failed: %s\n", k, free_splatter_last_error(ctx));
-                free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1;
+        for (size_t k = 0; k < npairs; k++) {
+            const float * g = nullptr;
+            std::vector<float> dumpbuf;
+            float * runout = nullptr;
+            if (dump_mode) {
+                std::ifstream f(inputs[k], std::ios::binary | std::ios::ate);
+                if (!f) { std::fprintf(stderr, "cannot open %s\n", inputs[k].c_str()); free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; }
+                const std::streamsize bytes = f.tellg(); f.seekg(0);
+                if ((size_t)(bytes / sizeof(float)) != pair_floats) {
+                    std::fprintf(stderr, "dump %s: %zu floats, expected 2*%d*%d*%d\n", inputs[k].c_str(),
+                                 (size_t)(bytes/sizeof(float)), sz, sz, gc);
+                    free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; }
+                dumpbuf.resize(pair_floats); f.read((char *) dumpbuf.data(), bytes);
+                g = dumpbuf.data();
+            } else {
+                std::vector<float> pair(2 * per_view);
+                std::memcpy(&pair[0],        frames[k].data(),   per_view * sizeof(float));
+                std::memcpy(&pair[per_view], frames[k+1].data(), per_view * sizeof(float));
+                size_t ng = 0;
+                if (free_splatter_run(ctx, pair.data(), 2, sz, sz, &runout, &ng) != 0) {
+                    std::fprintf(stderr, "run pair %zu failed: %s\n", k, free_splatter_last_error(ctx));
+                    free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; }
+                g = runout;
             }
-            free_splatter_accumulator_add_pair(acc, g, geo.gaussian_channels);
-            free_splatter_buf_free(g);
+            free_splatter_accumulator_add_pair(acc, g, gc);
+            if (runout) free_splatter_buf_free(runout);
             const int nframes = free_splatter_accumulator_frame_count(acc);
-            std::printf("pair %zu (%s, %s) -> %d frames\n", k,
-                        inputs[k].c_str(), inputs[k+1].c_str(), nframes);
+            std::printf("pair %zu -> %d frames\n", k, nframes);
             if (splat_prefix) {
                 free_splatter_point * cloud = nullptr; size_t nc = 0;
                 free_splatter_accumulator_cloud(acc, &cloud, &nc);
diff --git a/web/accumulate.html b/web/accumulate.html
index 2fe6b39..3a74918 100644
--- a/web/accumulate.html
+++ b/web/accumulate.html
@@ -325,7 +325,8 @@ <h1>accumulating reconstruction</h1>
     const bytes = new Uint8Array(await resp.arrayBuffer());
     setSplat(bytes);
     if(!framed) frameToData(1.25);                   // frame once; camera stays as it grows
-    setStatus(`${step.n} photos · ${count.toLocaleString()} splats`);
+    setStatus(step.label ? `${step.label} · ${count.toLocaleString()} splats`
+                         : `${step.n} photos · ${count.toLocaleString()} splats`);
   } catch(err){ setStatus(err.message, true); console.error(err); }
 }
 function schedule(){ clearTimeout(timer); if(!playing) return;

From 9fef9c907c700f008ad8e9115841b250a95d477f Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 21:08:22 +0100
Subject: [PATCH 19/33] demo: make the consensus-fused step unmistakable +
 defeat stale caching

The fused step (8th) showed the same "8 photos" panel as acc_8, so it read as a
duplicate. Now a labelled step shows a prominent top-center banner ("consensus-
fused -- single-view floaters removed -- N splats"), the stat reads "fused (8)",
and manifest/splat fetches use cache:"no-store" so an updated demo dir is never
served stale.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 web/accumulate.html | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/web/accumulate.html b/web/accumulate.html
index 3a74918..a3c4003 100644
--- a/web/accumulate.html
+++ b/web/accumulate.html
@@ -55,6 +55,14 @@
   #strip .thumb.new .tag { background: rgba(91,140,255,.92); color: #fff; }
   #hint { position: fixed; bottom: 14px; left: 50%; transform: translateX(-50%); z-index: 10;
     color: #6b7280; font-size: 11.5px; background: rgba(16,19,24,.7); padding: 6px 12px; border-radius: 8px; }
+  /* prominent banner shown only on a labelled (consensus-fused) step */
+  #banner { position: fixed; top: 18px; left: 50%; transform: translateX(-50%) translateY(-12px); z-index: 11;
+    display: none; opacity: 0; transition: opacity .4s ease, transform .4s ease;
+    padding: 9px 18px; border-radius: 999px; font-size: 13.5px; font-weight: 600; letter-spacing: .2px;
+    color: #eafff0; background: linear-gradient(90deg, rgba(34,160,90,.95), rgba(46,120,210,.95));
+    border: 1px solid rgba(255,255,255,.25); box-shadow: 0 8px 26px rgba(0,0,0,.5); }
+  #banner.show { display: block; opacity: 1; transform: translateX(-50%) translateY(0); }
+  #banner b { font-weight: 750; }
 </style>
 </head>
 <body>
@@ -70,6 +78,7 @@ <h1>accumulating reconstruction</h1>
   <div id="ctl"><button id="prev">‹ prev</button><button id="play">⏸ pause</button><button id="next">next ›</button></div>
 </div>
 <div id="strip"></div>
+<div id="banner"></div>
 <div id="hint">drag to orbit · scroll to zoom · the cloud grows as each photo is added</div>
 
 <script type="module">
@@ -316,17 +325,23 @@ <h1>accumulating reconstruction</h1>
   if (!manifest || i<0 || i>=manifest.steps.length) return;
   cur=i; const step=manifest.steps[i];
   $("bar").firstElementChild.style.width = Math.round((i+1)/manifest.steps.length*100)+"%";
-  $("s-frames").textContent = step.n;
+  $("s-frames").textContent = step.label ? "fused (" + step.n + ")" : step.n;
+  const banner = $("banner");
+  banner.classList.toggle("show", !!step.label);     // prominent only on the fused step
   setStatus(`folding view ${step.n} — ${step.splat}`);
   renderStrip(step);
   try {
-    const resp = await fetch(`${BASE}/${step.splat}`);
+    const resp = await fetch(`${BASE}/${step.splat}`, { cache: "no-store" });
     if(!resp.ok) throw new Error("HTTP "+resp.status+" for "+step.splat);
     const bytes = new Uint8Array(await resp.arrayBuffer());
     setSplat(bytes);
     if(!framed) frameToData(1.25);                   // frame once; camera stays as it grows
-    setStatus(step.label ? `${step.label} · ${count.toLocaleString()} splats`
-                         : `${step.n} photos · ${count.toLocaleString()} splats`);
+    if (step.label) {
+      banner.innerHTML = `<b>✓ ${step.label}</b> · ${count.toLocaleString()} splats`;
+      setStatus(`${step.label} · ${count.toLocaleString()} splats`);
+    } else {
+      setStatus(`${step.n} photos · ${count.toLocaleString()} splats`);
+    }
   } catch(err){ setStatus(err.message, true); console.error(err); }
 }
 function schedule(){ clearTimeout(timer); if(!playing) return;
@@ -339,7 +354,7 @@ <h1>accumulating reconstruction</h1>
   requestAnimationFrame(render);
   try {
     setStatus("loading manifest…");
-    manifest = await (await fetch(`${BASE}/manifest.json`)).json();
+    manifest = await (await fetch(`${BASE}/manifest.json`, { cache: "no-store" })).json();
     if(!manifest.steps || !manifest.steps.length) throw new Error("manifest has no steps");
     const s0 = Math.max(0, Math.min(manifest.steps.length-1, START_STEP));
     await loadStep(s0);

From da9d76d93414d0bca378617a18934f1f3f7bb14a Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 21:31:35 +0100
Subject: [PATCH 20/33] demo: tighter framing + baseline guidance
 (forward-dolly clips reconstruct blurry)

The default demo clip was a near-pure forward dolly (~0.15% lateral baseline of
scene depth), so two-view depth was unconstrained and the gaussians came out
blurry. Re-baking from an orbiting clip with real sideways motion (stride chosen
for ~9% baseline) reconstructs legibly; tighten the viewer framing (1.25x->0.9x of
the cloud diagonal) so it fills the view, and document the lateral-baseline
requirement in web/README.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 web/README.md       | 11 +++++++++--
 web/accumulate.html |  2 +-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/web/README.md b/web/README.md
index 094e258..47b11c8 100644
--- a/web/README.md
+++ b/web/README.md
@@ -43,8 +43,15 @@ python3 -m http.server -d .cache/demo/accumulate 8080   # open http://localhost:
 
 URL params: `?ms=2600` step interval, `?start=N` begin at step N, `?auto=0` no
 auto-advance, `?spin=0` no auto-orbit. The cloud shown is the raw accumulated
-cloud; `--fuse` also writes a consensus-fused `acc_fused.splat` (the edge-haze
-floaters removed).
+cloud; the bake also writes a consensus-fused `acc_fused.splat` (the edge-haze
+floaters removed), shown as a final step.
+
+**Pick frames with lateral baseline.** Two-view depth needs the camera to
+*translate sideways* — a pure forward dolly (the camera moving along the direction
+it faces) gives near-zero parallax and reconstructs blurry. Frames a few apart
+from an orbiting / strafing clip work far better than tightly-spaced frames of a
+walk-forward clip. Rule of thumb: the per-pair lateral baseline should be a few %
+of scene depth or more (`scripts/` had a `baseline` probe in git history).
 
 ## Sanity-checking the data separately
 
diff --git a/web/accumulate.html b/web/accumulate.html
index a3c4003..230b083 100644
--- a/web/accumulate.html
+++ b/web/accumulate.html
@@ -335,7 +335,7 @@ <h1>accumulating reconstruction</h1>
     if(!resp.ok) throw new Error("HTTP "+resp.status+" for "+step.splat);
     const bytes = new Uint8Array(await resp.arrayBuffer());
     setSplat(bytes);
-    if(!framed) frameToData(1.25);                   // frame once; camera stays as it grows
+    if(!framed) frameToData(0.9);                    // frame once; camera stays as it grows
     if (step.label) {
       banner.innerHTML = `<b>✓ ${step.label}</b> · ${count.toLocaleString()} splats`;
       setStatus(`${step.label} · ${count.toLocaleString()} splats`);

From e9fd2e52f5eccb2ce67dbddd1bdfe6594baf4b07 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 21:45:08 +0100
Subject: [PATCH 21/33] pose: carry opacity into the splat alpha (fix the
 opaque, swirling, blurry cloud)

The accumulator dropped per-gaussian opacity, so write_cloud_splat emitted every
splat fully opaque (alpha 255). With "over" alpha compositing that means the
front-most splat at each pixel fully occludes the rest, so as the camera orbits
the depth order flips and the hard anisotropic ellipsoids visibly swirl/rotate --
and the over-contribution reads as blur. The single-run write_splat instead uses
the gaussian's activated opacity as the alpha (mean ~41/255), blending many low-
alpha splats into a stable smooth surface.

Fix: AccumPoint / free_splatter_point carry `opacity`; add_view stores it,
consensus_fuse averages it, and write_cloud_splat emits it as the splat alpha and
caps by importance (opacity*volume) like write_splat. On the same pair the cloud
splat is now byte-identical in alpha (1/254/mean 41) to write_splat and renders as
a clean alpha-blended surface instead of an opaque swirling soup.

test_pose green; the C-API struct gains one float (emit_points copies it).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 include/free_splatter.h     |  2 +-
 src/free_splatter.cpp       |  2 +-
 src/pose.cpp                |  7 ++++---
 src/pose.h                  |  2 +-
 tests/test_pose.cpp         |  8 ++++----
 tools/free_splatter-cli.cpp | 31 ++++++++++++++++++++++---------
 6 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/include/free_splatter.h b/include/free_splatter.h
index f486d56..4dc4f6d 100644
--- a/include/free_splatter.h
+++ b/include/free_splatter.h
@@ -99,7 +99,7 @@ FREE_SPLATTER_API int free_splatter_estimate_poses(
 // it came from (for consensus fusion).
 typedef struct {
     float x, y, z;
-    float r, g, b;
+    float r, g, b, opacity;
     float sx, sy, sz;
     float qw, qx, qy, qz;
     int32_t frame;
diff --git a/src/free_splatter.cpp b/src/free_splatter.cpp
index 9dade9d..8e6f0cc 100644
--- a/src/free_splatter.cpp
+++ b/src/free_splatter.cpp
@@ -190,7 +190,7 @@ int emit_points(const std::vector<free_splatter::pose::AccumPoint> & src,
     if (!*out && !src.empty()) return -1;
     for (size_t i = 0; i < src.size(); i++) {
         (*out)[i].x = src[i].x; (*out)[i].y = src[i].y; (*out)[i].z = src[i].z;
-        (*out)[i].r = src[i].r; (*out)[i].g = src[i].g; (*out)[i].b = src[i].b;
+        (*out)[i].r = src[i].r; (*out)[i].g = src[i].g; (*out)[i].b = src[i].b; (*out)[i].opacity = src[i].opacity;
         (*out)[i].sx = src[i].sx; (*out)[i].sy = src[i].sy; (*out)[i].sz = src[i].sz;
         (*out)[i].qw = src[i].qw; (*out)[i].qx = src[i].qx; (*out)[i].qy = src[i].qy; (*out)[i].qz = src[i].qz;
         (*out)[i].frame = src[i].frame;
diff --git a/src/pose.cpp b/src/pose.cpp
index 6d94f71..a3537c5 100644
--- a/src/pose.cpp
+++ b/src/pose.cpp
@@ -906,7 +906,7 @@ void Accumulator::add_view(const float * pts, const float * op, const float * rg
         Vec3 w = sim_apply(T, x);
         AccumPoint p;
         p.x = (float) w[0]; p.y = (float) w[1]; p.z = (float) w[2];
-        p.r = rgb[3*i]; p.g = rgb[3*i+1]; p.b = rgb[3*i+2];
+        p.r = rgb[3*i]; p.g = rgb[3*i+1]; p.b = rgb[3*i+2]; p.opacity = op[i];
         p.sx = (float) (T.s * scl[3*i]); p.sy = (float) (T.s * scl[3*i+1]); p.sz = (float) (T.s * scl[3*i+2]);
         std::array<double,4> ql = quat_normalize({ rot[4*i], rot[4*i+1], rot[4*i+2], rot[4*i+3] });
         std::array<double,4> qw = quat_normalize(quat_mul(qT, ql));
@@ -1041,7 +1041,7 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
     const double v = voxel_frac * ext;
     if (!(v > 0)) return st;
 
-    struct Vox { double sx=0,sy=0,sz=0,sr=0,sg=0,sb=0; double ssx=0,ssy=0,ssz=0; int64_t cnt=0;
+    struct Vox { double sx=0,sy=0,sz=0,sr=0,sg=0,sb=0,sop=0; double ssx=0,ssy=0,ssz=0; int64_t cnt=0;
                  float q[4]={1,0,0,0}; bool has_q=false; std::vector<int32_t> frames; };
     struct Key { int32_t i,j,k; bool operator==(const Key&o) const { return i==o.i&&j==o.j&&k==o.k; } };
     struct KeyHash { size_t operator()(const Key&q) const {
@@ -1063,7 +1063,7 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
         if (!finite(p)) continue;
         Key key{ vcoord(p.x, lo[0]), vcoord(p.y, lo[1]), vcoord(p.z, lo[2]) };
         Vox & vx = grid[key];
-        vx.sx+=p.x; vx.sy+=p.y; vx.sz+=p.z; vx.sr+=p.r; vx.sg+=p.g; vx.sb+=p.b; vx.cnt++;
+        vx.sx+=p.x; vx.sy+=p.y; vx.sz+=p.z; vx.sr+=p.r; vx.sg+=p.g; vx.sb+=p.b; vx.sop+=p.opacity; vx.cnt++;
         vx.ssx+=p.sx; vx.ssy+=p.sy; vx.ssz+=p.sz;
         if (!vx.has_q) { vx.q[0]=p.qw; vx.q[1]=p.qx; vx.q[2]=p.qy; vx.q[3]=p.qz; vx.has_q=true; } // representative orientation
         bool seen = false;
@@ -1080,6 +1080,7 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
         AccumPoint p;
         p.x = (float) (vx.sx / vx.cnt); p.y = (float) (vx.sy / vx.cnt); p.z = (float) (vx.sz / vx.cnt);
         p.r = (float) (vx.sr / vx.cnt); p.g = (float) (vx.sg / vx.cnt); p.b = (float) (vx.sb / vx.cnt);
+        p.opacity = (float) (vx.sop / vx.cnt);
         p.sx = (float) (vx.ssx / vx.cnt); p.sy = (float) (vx.ssy / vx.cnt); p.sz = (float) (vx.ssz / vx.cnt);
         p.qw = vx.q[0]; p.qx = vx.q[1]; p.qy = vx.q[2]; p.qz = vx.q[3];
         p.frame = (int32_t) vx.frames.size();   // support count (informational)
diff --git a/src/pose.h b/src/pose.h
index d4d96fc..bda4666 100644
--- a/src/pose.h
+++ b/src/pose.h
@@ -117,7 +117,7 @@ PoseResult estimate_poses(const std::vector<const float *> & points,
 // Sim(3), so the cloud renders as oriented splats, not isotropic points. rgb [0,1].
 struct AccumPoint {
     float x, y, z;
-    float r, g, b;
+    float r, g, b, opacity;  // color [0,1] and activated opacity [0,1] (the splat alpha)
     float sx, sy, sz;        // anisotropic gaussian scale (global frame)
     float qw, qx, qy, qz;    // gaussian rotation quaternion (w,x,y,z), global frame
     int32_t frame;
diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp
index 405409e..e00abcb 100644
--- a/tests/test_pose.cpp
+++ b/tests/test_pose.cpp
@@ -527,15 +527,15 @@ static void test_consensus_fuse() {
     std::vector<AccumPoint> cloud;
     // 5 surface points, each corroborated by frames {0,1,2} (tight cluster -> one
     // voxel, support 3); 20 isolated single-frame floaters far apart (support 1).
-    // AccumPoint: {xyz, rgb, scale[3], quat(w,x,y,z), frame}; scale/quat are
-    // unused by fusion's counts here, set to a unit gaussian / identity rotation.
+    // AccumPoint: {xyz, rgb, opacity, scale[3], quat(w,x,y,z), frame}; opacity/
+    // scale/quat are unused by fusion's counts here, set to a unit opaque gaussian.
     for (int loc = 0; loc < 5; loc++)
         for (int fr = 0; fr < 3; fr++)
             cloud.push_back({ (float)(2.0*loc) + 0.001f*fr, 0.0f, 0.0f, 0.5f, 0.5f, 0.5f,
-                              1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, fr });
+                              1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, fr });
     for (int j = 0; j < 20; j++)
         cloud.push_back({ 40.0f + 2.0f*j, 0.0f, 0.0f, 0.2f, 0.2f, 0.2f,
-                          1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, j % 4 });
+                          1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, j % 4 });
 
     std::vector<AccumPoint> fused;
     FuseStats st = consensus_fuse(cloud, /*voxel_frac=*/0.02, /*k=*/2, fused);
diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp
index 045d537..10d966c 100644
--- a/tools/free_splatter-cli.cpp
+++ b/tools/free_splatter-cli.cpp
@@ -94,23 +94,37 @@ static bool write_splat(const float * g, size_t n, int gc, float opac_thr,
 }
 
 // Write an accumulated gaussian cloud as an antimatter15 .splat, emitting each
-// point's true anisotropic scale + rotation (carried through the Sim(3) by the
-// accumulator) so it renders as oriented splats, not isotropic dots. scale_mult
-// scales the radii (1.0 = as-predicted). Caps to max_splats by stride-subsampling.
+// point's true anisotropic scale + rotation (carried through the Sim(3)) AND its
+// activated opacity as the splat alpha — so it renders exactly like the single-run
+// write_splat (proper alpha blending), not a fully-opaque, swirling, blurry soup.
+// scale_mult scales the radii (1.0 = as-predicted). When capping, keep the most
+// important splats (opacity * volume), matching write_splat (not a uniform stride).
 static bool write_cloud_splat(const free_splatter_point * pts, size_t n, size_t max_splats,
                               float scale_mult, const char * path) {
-    const size_t stride = (max_splats > 0 && n > max_splats) ? (n + max_splats - 1) / max_splats : 1;
+    auto importance = [&](size_t i) -> double {
+        const free_splatter_point & p = pts[i];
+        const double vol = (double) std::max(p.sx,1e-9f) * std::max(p.sy,1e-9f) * std::max(p.sz,1e-9f);
+        return (double) std::max(p.opacity, 0.0f) * vol;
+    };
+    std::vector<size_t> idx(n);
+    for (size_t i = 0; i < n; i++) idx[i] = i;
+    if (max_splats > 0 && n > max_splats) {
+        std::partial_sort(idx.begin(), idx.begin() + max_splats, idx.end(),
+                          [&](size_t a, size_t b){ return importance(a) > importance(b); });
+        idx.resize(max_splats);
+    }
+
     std::ofstream f(path, std::ios::binary);
     if (!f) { std::fprintf(stderr, "cannot write %s\n", path); return false; }
     auto u8 = [](float v) -> unsigned char { float t = v < 0 ? 0 : v > 255 ? 255 : v; return (unsigned char) t; };
-    size_t written = 0;
-    for (size_t i = 0; i < n; i += stride) {
+    for (size_t i : idx) {
         const free_splatter_point & p = pts[i];
         // OpenCV (y down, z forward) -> viewer OpenGL (y up): negate y,z; quaternion
         // (w,x,y,z) -> (-x, w, -z, y) (the same remap as the single-run write_splat).
         float pos[3]   = { p.x, -p.y, -p.z };
         float sc[3]    = { scale_mult*p.sx, scale_mult*p.sy, scale_mult*p.sz };
-        unsigned char rgba[4] = { u8(p.r*255.0f), u8(p.g*255.0f), u8(p.b*255.0f), 255 };
+        unsigned char rgba[4] = { u8(p.r*255.0f), u8(p.g*255.0f), u8(p.b*255.0f),
+                                  u8(std::min(std::max(p.opacity, 0.0f), 1.0f) * 255.0f) };
         float q[4] = { -p.qx, p.qw, -p.qz, p.qy };
         float nrm = std::sqrt(q[0]*q[0]+q[1]*q[1]+q[2]*q[2]+q[3]*q[3]) + 1e-12f;
         unsigned char rot[4];
@@ -119,9 +133,8 @@ static bool write_cloud_splat(const free_splatter_point * pts, size_t n, size_t
         f.write((const char *) sc, 12);
         f.write((const char *) rgba, 4);
         f.write((const char *) rot, 4);
-        written++;
     }
-    std::printf("wrote %s: %zu splats (of %zu cloud points)\n", path, written, n);
+    std::printf("wrote %s: %zu splats (of %zu cloud points)\n", path, idx.size(), n);
     return true;
 }
 

From 1341214b9d7378f7b0ca4a3a8f94f8691772402a Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 21:58:37 +0100
Subject: [PATCH 22/33] splat: unify the .splat encoder + pin it with
 regression tests (#1, #2)

Prevent the class of regression that dropped first rotation/scale, then opacity,
from the accumulated-cloud splat writer (a second copy of the encoder that drifted
from the proven single-run one).

#1 Unify: new src/splat.h::encode_splat_record is the ONE definition of the
   OpenCV->OpenGL convention, quaternion remap, opacity->alpha and byte packing.
   Both write_splat (single-run) and write_cloud_splat (cloud) now build a
   (pos,scale,quat,rgb,opacity) tuple and call it, so they cannot diverge again.
   Verified byte-identical to the previous write_splat output (pure refactor).

#2 Pin: two asset-free tests in test_pose.cpp --
   - test_splat_record: pins the encoder bytes, incl. the exact regressed field
     (opacity 0.5 -> alpha 127, NOT a forced 255) and the rotation remap.
   - test_accumulate_channels: a one-pair (T=identity) accumulation must preserve
     every gaussian channel (xyz, SH->rgb, opacity, scale, rotation, frame); a
     dropped channel fails immediately with zero fixtures.

Both guards fail on either historical bug. ctest -LE model green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/splat.h                 | 44 ++++++++++++++++++++
 tests/test_pose.cpp         | 81 +++++++++++++++++++++++++++++++++++++
 tools/free_splatter-cli.cpp | 46 +++++++--------------
 3 files changed, 140 insertions(+), 31 deletions(-)
 create mode 100644 src/splat.h

diff --git a/src/splat.h b/src/splat.h
new file mode 100644
index 0000000..f5cc2e3
--- /dev/null
+++ b/src/splat.h
@@ -0,0 +1,44 @@
+// splat.h — the single, shared encoder for an antimatter15 .splat record.
+//
+// There are two producers of .splat output: the single-run path (write_splat,
+// straight from the engine's gaussians) and the accumulated-cloud path
+// (write_cloud_splat, from pose::AccumPoint). They MUST encode a gaussian
+// identically — same OpenCV->OpenGL convention, same quaternion remap, same byte
+// packing, same opacity->alpha. They used to be two copies and drifted: the cloud
+// path silently dropped first the rotation/scale, then the opacity (rendering a
+// fully-opaque, swirling, blurry soup instead of a proper alpha-blended surface).
+//
+// This is the ONE definition both call, so they cannot diverge again. It is pinned
+// byte-for-byte by tests/test_pose.cpp::test_splat_record.
+#ifndef FREE_SPLATTER_SPLAT_H
+#define FREE_SPLATTER_SPLAT_H
+
+#include <cmath>
+#include <cstring>
+
+namespace free_splatter {
+
+// One 32-byte .splat record: pos[3]f32, scale[3]f32, rgba[4]u8, rot[4]u8 (w,x,y,z).
+// Inputs are in the engine's OpenCV frame (y down, z forward); `rgb` and `opacity`
+// in [0,1]; `quat_wxyz` as (w,x,y,z). Writes exactly 32 bytes to `out`.
+inline void encode_splat_record(unsigned char out[32], const float pos[3], const float scale[3],
+                                const float quat_wxyz[4], const float rgb[3], float opacity) {
+    auto u8  = [](float v) -> unsigned char { float t = v < 0 ? 0 : (v > 255 ? 255 : v); return (unsigned char) t; };
+    auto c01 = [](float v) -> float { return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v); };
+    // OpenCV (y down, z forward) -> viewer OpenGL (y up): 180deg about X = diag(1,-1,-1):
+    // position.yz negate, quaternion (w,x,y,z) -> (-x, w, -z, y).
+    const float p[3] = { pos[0], -pos[1], -pos[2] };
+    float q[4] = { -quat_wxyz[1], quat_wxyz[0], -quat_wxyz[3], quat_wxyz[2] };
+    const float nrm = std::sqrt(q[0]*q[0] + q[1]*q[1] + q[2]*q[2] + q[3]*q[3]) + 1e-12f;
+    std::memcpy(out + 0,  p,     12);
+    std::memcpy(out + 12, scale, 12);
+    out[24] = u8(c01(rgb[0]) * 255.0f);
+    out[25] = u8(c01(rgb[1]) * 255.0f);
+    out[26] = u8(c01(rgb[2]) * 255.0f);
+    out[27] = u8(c01(opacity) * 255.0f);                  // opacity -> alpha (NOT forced opaque)
+    for (int c = 0; c < 4; c++) out[28 + c] = u8(q[c] / nrm * 128.0f + 128.0f);
+}
+
+} // namespace free_splatter
+
+#endif // FREE_SPLATTER_SPLAT_H
diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp
index e00abcb..08c6d82 100644
--- a/tests/test_pose.cpp
+++ b/tests/test_pose.cpp
@@ -8,10 +8,12 @@
 // quantities, not bitstream-matched), so this validates the C++ port to the same
 // correctness bar the Python prototype meets, independently of numpy.
 #include "pose.h"
+#include "splat.h"
 
 #include <algorithm>
 #include <cmath>
 #include <cstdio>
+#include <cstring>
 #include <random>
 #include <vector>
 
@@ -582,6 +584,83 @@ static void test_loop_distribute() {
     check("distribute_drift recovers clean loop", std::sqrt(post/(n+1)) < 1e-9, buf);
 }
 
+// ===========================================================================
+// Regression guards for the gaussian -> .splat seam (the class of bug that
+// dropped rotation, then opacity, from the accumulated cloud writer)
+// ===========================================================================
+
+// #1 guard: pin the shared encoder's byte output. Both writers (single-run and
+// accumulated cloud) call this; if it ever drops a channel or changes the
+// convention the bytes change here. The opacity->alpha byte is the exact field
+// the regression got wrong (it had been forced to 255).
+static void test_splat_record() {
+    std::printf("test_splat_record\n");
+    const float pos[3]   = { 1.0f, 2.0f, 3.0f };
+    const float scale[3] = { 0.01f, 0.02f, 0.03f };
+    const float quat[4]  = { 0.70710678f, 0.70710678f, 0.0f, 0.0f };  // (w,x,y,z), unit
+    const float rgb[3]   = { 0.2f, 0.4f, 0.6f };
+    unsigned char rec[32];
+    free_splatter::encode_splat_record(rec, pos, scale, quat, rgb, 0.5f);
+
+    float p[3], sc[3]; std::memcpy(p, rec, 12); std::memcpy(sc, rec+12, 12);
+    check("pos: OpenCV->GL flips y,z", p[0]==1.0f && p[1]==-2.0f && p[2]==-3.0f);
+    check("scale: passthrough", sc[0]==0.01f && sc[1]==0.02f && sc[2]==0.03f);
+    check("rgb bytes", rec[24]==(unsigned char)(0.2f*255.0f) &&
+                       rec[25]==(unsigned char)(0.4f*255.0f) &&
+                       rec[26]==(unsigned char)(0.6f*255.0f));
+    // THE regression field: opacity must become the alpha, not a forced 255.
+    char buf[48]; std::snprintf(buf, sizeof buf, "alpha=%d (must be 127, not 255)", rec[27]);
+    check("opacity -> alpha (not forced opaque)", rec[27] == 127, buf);
+    // rotation must be encoded with the (w,x,y,z)->(-x,w,-z,y) remap, not dropped.
+    // q=(0.7071,0.7071,0,0) -> (-0.7071,0.7071,0,0) -> bytes (37,218,128,128).
+    check("rotation: remap encoded (not identity)",
+          rec[28]==37 && rec[29]==218 && rec[30]==128 && rec[31]==128);
+    // boundary opacities
+    unsigned char r1[32], r0[32];
+    free_splatter::encode_splat_record(r1, pos, scale, quat, rgb, 1.0f);
+    free_splatter::encode_splat_record(r0, pos, scale, quat, rgb, 0.0f);
+    check("alpha boundaries (1->255, 0->0)", r1[27]==255 && r0[27]==0);
+}
+
+// #2 guard: run-0 accumulation must preserve EVERY gaussian channel. With one pair
+// the global transform is identity, so each AccumPoint should equal the engine's
+// gaussian for that pixel (xyz, SH->rgb, opacity, scale, rotation, frame). A
+// dropped channel — the exact bug — fails here, asset-free.
+static void test_accumulate_channels() {
+    std::printf("test_accumulate_channels\n");
+    const int H = 4, W = 4, gc = 23, P = H*W;
+    std::vector<float> g((size_t) 2 * P * gc, 0.0f);
+    for (int v = 0; v < 2; v++) for (int i = 0; i < P; i++) {
+        float * a = &g[(size_t)(v*P + i) * gc];
+        const float b = (float)(v*100 + i);
+        a[0]=b+0.11f; a[1]=b+0.22f; a[2]=b+0.33f;               // xyz (distinct)
+        a[3]=0.5f; a[4]=-0.5f; a[5]=1.0f;                       // SH-DC -> in-range rgb
+        a[15]=0.2f + 0.5f*((i%3)/2.0f);                         // opacity in {0.2,0.45,0.7} > thr
+        a[16]=0.01f+0.001f*i; a[17]=0.02f; a[18]=0.03f;         // scale (distinct)
+        a[19]=0.8f; a[20]=0.6f; a[21]=0.0f; a[22]=0.0f;         // unit quat (w,x,y,z)
+    }
+    Accumulator acc(H, W, /*opacity_threshold=*/0.05);
+    acc.add_pair(g.data(), gc);
+    const auto & cloud = acc.cloud();
+    check("run-0 cloud = all pixels (both views kept)", cloud.size() == (size_t) 2 * P);
+
+    const double C0 = 0.28209479177387814;
+    bool ok = cloud.size() == (size_t) 2 * P;
+    auto cl = [](float x, float y) { return std::fabs(x - y) < 1e-5f; };
+    for (int v = 0; v < 2 && ok; v++) for (int i = 0; i < P && ok; i++) {
+        const float * a = &g[(size_t)(v*P + i) * gc];
+        const AccumPoint & p = cloud[(size_t) v*P + i];
+        ok &= cl(p.x,a[0]) && cl(p.y,a[1]) && cl(p.z,a[2]);                 // xyz (identity sim)
+        float er=(float)(a[3]*C0+0.5), eg=(float)(a[4]*C0+0.5), eb=(float)(a[5]*C0+0.5);
+        ok &= cl(p.r,er) && cl(p.g,eg) && cl(p.b,eb);                       // SH -> rgb
+        ok &= cl(p.opacity,a[15]);                                          // opacity (the dropped one)
+        ok &= cl(p.sx,a[16]) && cl(p.sy,a[17]) && cl(p.sz,a[18]);           // scale (T.s=1)
+        ok &= cl(p.qw,a[19]) && cl(p.qx,a[20]) && cl(p.qy,a[21]) && cl(p.qz,a[22]); // rotation
+        ok &= (p.frame == v);                                              // source frame
+    }
+    check("every gaussian channel survives run-0 accumulation", ok);
+}
+
 int main() {
     test_similarity_roundtrip();
     test_scale_detection();
@@ -598,6 +677,8 @@ int main() {
     test_accumulate_chain();
     test_consensus_fuse();
     test_loop_distribute();
+    test_splat_record();
+    test_accumulate_channels();
     std::printf(failures ? "\ntest_pose: %d FAILURES\n" : "\ntest_pose: ok\n", failures);
     return failures ? 1 : 0;
 }
diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp
index 10d966c..7231643 100644
--- a/tools/free_splatter-cli.cpp
+++ b/tools/free_splatter-cli.cpp
@@ -8,6 +8,7 @@
 //                     [--opacity-threshold T] [--max-splats N] [--dump-taps DIR]
 //                     MODEL.gguf  (IMAGES... | INPUT.f32)
 #include "free_splatter.h"
+#include "splat.h"              // the single shared .splat record encoder
 
 #include "stb_image.h"          // implementation in tools/stb_impl.cpp
 #include "stb_image_resize2.h"
@@ -70,24 +71,15 @@ static bool write_splat(const float * g, size_t n, int gc, float opac_thr,
 
     std::ofstream f(path, std::ios::binary);
     if (!f) { std::fprintf(stderr, "cannot write %s\n", path); return false; }
-    auto u8 = [](float v) -> unsigned char { float t = v < 0 ? 0 : v > 255 ? 255 : v; return (unsigned char) t; };
     for (size_t k = 0; k < m; k++) {
         const float * x = &g[keep[k].second * gc];
-        // FreeSplatter's reference frame is OpenCV (y down, z forward); convert
-        // to the viewer's OpenGL convention (y up) via a 180deg rotation about X
-        // = diag(1,-1,-1): position.yz negate, quaternion (w,x,y,z)->(-x,w,-z,y).
-        float pos[3]   = { x[0], -x[1], -x[2] };
-        float scale[3] = { x[16], x[17], x[18] };
-        unsigned char rgba[4], rot[4];
-        for (int c = 0; c < 3; c++) { float v = 0.5f + (float) C0 * x[3+c]; rgba[c] = u8((v<0?0:v>1?1:v) * 255.0f); }
-        rgba[3] = u8(std::min(std::max(x[15], 0.0f), 1.0f) * 255.0f);
-        float q[4] = { -x[20], x[19], -x[22], x[21] };
-        float nrm = std::sqrt(q[0]*q[0]+q[1]*q[1]+q[2]*q[2]+q[3]*q[3]) + 1e-12f;
-        for (int c = 0; c < 4; c++) rot[c] = u8(q[c]/nrm * 128.0f + 128.0f);
-        f.write((const char *) pos, 12);
-        f.write((const char *) scale, 12);
-        f.write((const char *) rgba, 4);
-        f.write((const char *) rot, 4);
+        const float pos[3]   = { x[0], x[1], x[2] };           // OpenCV; the encoder flips y,z
+        const float scale[3] = { x[16], x[17], x[18] };
+        const float quat[4]  = { x[19], x[20], x[21], x[22] };  // (w,x,y,z)
+        const float rgb[3]   = { 0.5f + (float) C0 * x[3], 0.5f + (float) C0 * x[4], 0.5f + (float) C0 * x[5] };
+        unsigned char rec[32];
+        free_splatter::encode_splat_record(rec, pos, scale, quat, rgb, x[15]);
+        f.write((const char *) rec, 32);
     }
     std::printf("wrote %s: %zu splats (pruned/cap of %zu kept)\n", path, m, keep.size());
     return true;
@@ -116,23 +108,15 @@ static bool write_cloud_splat(const free_splatter_point * pts, size_t n, size_t
 
     std::ofstream f(path, std::ios::binary);
     if (!f) { std::fprintf(stderr, "cannot write %s\n", path); return false; }
-    auto u8 = [](float v) -> unsigned char { float t = v < 0 ? 0 : v > 255 ? 255 : v; return (unsigned char) t; };
     for (size_t i : idx) {
         const free_splatter_point & p = pts[i];
-        // OpenCV (y down, z forward) -> viewer OpenGL (y up): negate y,z; quaternion
-        // (w,x,y,z) -> (-x, w, -z, y) (the same remap as the single-run write_splat).
-        float pos[3]   = { p.x, -p.y, -p.z };
-        float sc[3]    = { scale_mult*p.sx, scale_mult*p.sy, scale_mult*p.sz };
-        unsigned char rgba[4] = { u8(p.r*255.0f), u8(p.g*255.0f), u8(p.b*255.0f),
-                                  u8(std::min(std::max(p.opacity, 0.0f), 1.0f) * 255.0f) };
-        float q[4] = { -p.qx, p.qw, -p.qz, p.qy };
-        float nrm = std::sqrt(q[0]*q[0]+q[1]*q[1]+q[2]*q[2]+q[3]*q[3]) + 1e-12f;
-        unsigned char rot[4];
-        for (int c = 0; c < 4; c++) rot[c] = u8(q[c]/nrm * 128.0f + 128.0f);
-        f.write((const char *) pos, 12);
-        f.write((const char *) sc, 12);
-        f.write((const char *) rgba, 4);
-        f.write((const char *) rot, 4);
+        const float pos[3]   = { p.x, p.y, p.z };               // OpenCV; the encoder flips y,z
+        const float scale[3] = { scale_mult*p.sx, scale_mult*p.sy, scale_mult*p.sz };
+        const float quat[4]  = { p.qw, p.qx, p.qy, p.qz };      // (w,x,y,z)
+        const float rgb[3]   = { p.r, p.g, p.b };
+        unsigned char rec[32];
+        free_splatter::encode_splat_record(rec, pos, scale, quat, rgb, p.opacity);
+        f.write((const char *) rec, 32);
     }
     std::printf("wrote %s: %zu splats (of %zu cloud points)\n", path, idx.size(), n);
     return true;

From 21ffcee824575ffffcc4dd1b80ed0d6509049685 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 22:34:22 +0100
Subject: [PATCH 23/33] fuse: add dense "kept" mode (fix the sparse fused
 scene)

Consensus fusion only keeps voxels seen by >= K frames; with few frames (a short
arc) only a small fraction of the scene is multiply-observed, and the existing
"averaged" output (one point per consensus voxel) then decimated that further --
so the final fused scene was very sparse (~38k of 1M points on the 4-frame demo).

Add a "kept" mode (the prototype's fuse.py --ply-kept): keep every raw gaussian
whose voxel is corroborated by >= K frames -- floaters still removed, but nothing
averaged away. On the demo that's 446k vs 38k points (12x denser), a solid surface
instead of a sparse scatter.

- consensus_fuse gains `keep_raw` (default false = averaged, unchanged);
  free_splatter_accumulator_fuse gains a `keep_raw` arg; CLI `--fuse-mode
  kept|averaged` (kept is the default for the cloud demo).
- Golden test pins both modes (averaged -> 5 voxel points, kept -> 15 raw points);
  fuzz_pose drives both modes; ctest -LE model green, fuzz clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 fuzz/fuzz_pose.cpp          |  3 ++-
 include/free_splatter.h     |  8 +++++---
 src/free_splatter.cpp       |  4 ++--
 src/pose.cpp                | 41 ++++++++++++++++++++++++++-----------
 src/pose.h                  |  9 +++++---
 tests/test_pose.cpp         |  6 +++++-
 tools/free_splatter-cli.cpp |  5 +++--
 web/README.md               |  7 +++++--
 8 files changed, 57 insertions(+), 26 deletions(-)

diff --git a/fuzz/fuzz_pose.cpp b/fuzz/fuzz_pose.cpp
index 4d737c5..ccdf615 100644
--- a/fuzz/fuzz_pose.cpp
+++ b/fuzz/fuzz_pose.cpp
@@ -50,8 +50,9 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) {
 
         const float voxel = 0.005f + (data[3] % 16) * 0.01f;   // 0.005..0.155
         const int k = 1 + (data[4] % 4);                       // 1..4
+        const int keep_raw = data[5] & 1;                      // both fuse modes
         free_splatter_point * fused = nullptr; size_t nf = 0;
-        if (free_splatter_accumulator_fuse(acc, voxel, k, &fused, &nf) == 0) free_splatter_buf_free(fused);
+        if (free_splatter_accumulator_fuse(acc, voxel, k, keep_raw, &fused, &nf) == 0) free_splatter_buf_free(fused);
 
         float * path = nullptr; int32_t nfr = 0;
         if (free_splatter_accumulator_camera_path(acc, &path, &nfr) == 0) free_splatter_buf_free(path);
diff --git a/include/free_splatter.h b/include/free_splatter.h
index 4dc4f6d..b783340 100644
--- a/include/free_splatter.h
+++ b/include/free_splatter.h
@@ -126,10 +126,12 @@ FREE_SPLATTER_API int free_splatter_accumulator_cloud(
     const free_splatter_accumulator * acc, free_splatter_point ** out, size_t * n_out);
 
 // Consensus-fuse the current cloud: keep only voxels (size voxel_frac * extent)
-// corroborated by >= k distinct source frames, averaging the agreeing predictions.
-// *out malloc'd as above. Returns 0 on success, -1 on failure.
+// corroborated by >= k distinct source frames (single-frame floaters dropped).
+// keep_raw=0 averages each consensus voxel to one point (denoised but decimated);
+// keep_raw=1 keeps every raw gaussian in a consensus voxel (dense, nothing averaged
+// away — better when there are few frames). *out malloc'd as above. Returns 0 ok.
 FREE_SPLATTER_API int free_splatter_accumulator_fuse(
-    const free_splatter_accumulator * acc, float voxel_frac, int32_t k,
+    const free_splatter_accumulator * acc, float voxel_frac, int32_t k, int32_t keep_raw,
     free_splatter_point ** out, size_t * n_out);
 
 // Copy the global camera trajectory: *out malloc'd (*n_frames)*16 float32, a
diff --git a/src/free_splatter.cpp b/src/free_splatter.cpp
index 8e6f0cc..00a7105 100644
--- a/src/free_splatter.cpp
+++ b/src/free_splatter.cpp
@@ -254,12 +254,12 @@ int free_splatter_accumulator_cloud(const free_splatter_accumulator * acc,
 }
 
 int free_splatter_accumulator_fuse(const free_splatter_accumulator * acc, float voxel_frac,
-                                   int32_t k, free_splatter_point ** out, size_t * n_out) {
+                                   int32_t k, int32_t keep_raw, free_splatter_point ** out, size_t * n_out) {
     if (out) *out = nullptr;
     if (n_out) *n_out = 0;
     if (!acc || !out || !n_out || k < 1 || !(voxel_frac > 0)) return -1;
     std::vector<free_splatter::pose::AccumPoint> fused;
-    free_splatter::pose::consensus_fuse(acc->acc.cloud(), voxel_frac, k, fused);
+    free_splatter::pose::consensus_fuse(acc->acc.cloud(), voxel_frac, k, fused, keep_raw != 0);
     return emit_points(fused, out, n_out);
 }
 
diff --git a/src/pose.cpp b/src/pose.cpp
index a3537c5..ce84ce3 100644
--- a/src/pose.cpp
+++ b/src/pose.cpp
@@ -1008,7 +1008,7 @@ std::vector<Mat4> Accumulator::camera_path() const {
 // ---- consensus fusion -----------------------------------------------------
 
 FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_frac, int k,
-                         std::vector<AccumPoint> & fused) {
+                         std::vector<AccumPoint> & fused, bool keep_raw) {
     fused.clear();
     FuseStats st{};
     st.raw_points = (int64_t) cloud.size();
@@ -1073,20 +1073,37 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
 
     st.voxels = (int64_t) grid.size();
     for (const auto & kv : grid) {
-        const Vox & vx = kv.second;
-        if ((int) vx.frames.size() < k) continue;
+        if ((int) kv.second.frames.size() < k) continue;
         st.kept_voxels++;
-        st.points_kept += vx.cnt;
-        AccumPoint p;
-        p.x = (float) (vx.sx / vx.cnt); p.y = (float) (vx.sy / vx.cnt); p.z = (float) (vx.sz / vx.cnt);
-        p.r = (float) (vx.sr / vx.cnt); p.g = (float) (vx.sg / vx.cnt); p.b = (float) (vx.sb / vx.cnt);
-        p.opacity = (float) (vx.sop / vx.cnt);
-        p.sx = (float) (vx.ssx / vx.cnt); p.sy = (float) (vx.ssy / vx.cnt); p.sz = (float) (vx.ssz / vx.cnt);
-        p.qw = vx.q[0]; p.qx = vx.q[1]; p.qy = vx.q[2]; p.qz = vx.q[3];
-        p.frame = (int32_t) vx.frames.size();   // support count (informational)
-        fused.push_back(p);
+        st.points_kept += kv.second.cnt;
     }
     st.points_dropped = st.raw_points - st.points_kept;
+
+    if (keep_raw) {
+        // DENSE: keep every raw gaussian whose voxel is corroborated by >= k frames
+        // (floaters dropped, nothing averaged/decimated away).
+        fused.reserve((size_t) st.points_kept);
+        for (const auto & p : cloud) {
+            if (!finite(p)) continue;
+            Key key{ vcoord(p.x, lo[0]), vcoord(p.y, lo[1]), vcoord(p.z, lo[2]) };
+            auto it = grid.find(key);
+            if (it != grid.end() && (int) it->second.frames.size() >= k) fused.push_back(p);
+        }
+    } else {
+        // DENOISED: one averaged gaussian per consensus voxel.
+        for (const auto & kv : grid) {
+            const Vox & vx = kv.second;
+            if ((int) vx.frames.size() < k) continue;
+            AccumPoint p;
+            p.x = (float) (vx.sx / vx.cnt); p.y = (float) (vx.sy / vx.cnt); p.z = (float) (vx.sz / vx.cnt);
+            p.r = (float) (vx.sr / vx.cnt); p.g = (float) (vx.sg / vx.cnt); p.b = (float) (vx.sb / vx.cnt);
+            p.opacity = (float) (vx.sop / vx.cnt);
+            p.sx = (float) (vx.ssx / vx.cnt); p.sy = (float) (vx.ssy / vx.cnt); p.sz = (float) (vx.ssz / vx.cnt);
+            p.qw = vx.q[0]; p.qx = vx.q[1]; p.qy = vx.q[2]; p.qz = vx.q[3];
+            p.frame = (int32_t) vx.frames.size();   // support count (informational)
+            fused.push_back(p);
+        }
+    }
     return st;
 }
 
diff --git a/src/pose.h b/src/pose.h
index bda4666..05366c5 100644
--- a/src/pose.h
+++ b/src/pose.h
@@ -183,10 +183,13 @@ struct FuseStats {
     int64_t raw_points, voxels, kept_voxels, points_kept, points_dropped;
 };
 // Voxelize the cloud at voxel_frac * cloud-extent and keep only voxels corroborated
-// by >= k DISTINCT source frames, averaging the agreeing predictions (which also
-// denoises the surface). `fused` receives one averaged point per consensus voxel.
+// by >= k DISTINCT source frames (single-frame floaters dropped). With keep_raw =
+// false this DENOISES: `fused` gets one averaged gaussian per consensus voxel (clean
+// but decimated — sparse when few voxels are multiply-observed). With keep_raw =
+// true it keeps DENSITY: `fused` gets every raw gaussian whose voxel is corroborated
+// (floaters removed, nothing averaged away) — the better choice for few frames.
 FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_frac, int k,
-                         std::vector<AccumPoint> & fused);
+                         std::vector<AccumPoint> & fused, bool keep_raw = false);
 
 // ---- loop closure (mirrors loop_closure.py) -------------------------------
 // Inverse of a similarity 4x4 [[sR,t],[0,1]] (sR = s*rotation): [[(1/s)Rᵀ,...],[0,1]].
diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp
index 08c6d82..bf31ac0 100644
--- a/tests/test_pose.cpp
+++ b/tests/test_pose.cpp
@@ -543,9 +543,13 @@ static void test_consensus_fuse() {
     FuseStats st = consensus_fuse(cloud, /*voxel_frac=*/0.02, /*k=*/2, fused);
     check("raw points", st.raw_points == 35);
     check("kept voxels (the surface)", st.kept_voxels == 5);
-    check("fused size", fused.size() == 5);
+    check("fused size (averaged: one per voxel)", fused.size() == 5);
     check("points kept", st.points_kept == 15);
     check("floaters dropped", st.points_dropped == 20);
+    // keep_raw=true: dense — every raw point in a consensus voxel, floaters dropped
+    std::vector<AccumPoint> kept;
+    consensus_fuse(cloud, /*voxel_frac=*/0.02, /*k=*/2, kept, /*keep_raw=*/true);
+    check("kept mode: dense (all 15 consensus raw points)", kept.size() == 15);
     // a fused point sits at its cluster centroid (~2*loc), color preserved
     bool centroid_ok = true;
     for (const auto & p : fused) {
diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp
index 7231643..c544623 100644
--- a/tools/free_splatter-cli.cpp
+++ b/tools/free_splatter-cli.cpp
@@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
     const char * model = nullptr, * splat_prefix = nullptr;
     float opac_thr = 5e-3f;
     long  max_splats = 0;
-    bool  accumulate = false, fuse = false;
+    bool  accumulate = false, fuse = false, fuse_keep = true;  // dense (kept) by default
     float voxel = 0.02f, splat_scale = 1.0f;   // multiplier on the predicted gaussian radii
     int   fuse_k = 2;
     std::vector<std::string> inputs;
@@ -161,6 +161,7 @@ int main(int argc, char ** argv) {
         else if (a == "--fuse")                         fuse = true;
         else if (a == "--voxel" && i+1 < argc)          voxel = (float) atof(argv[++i]);
         else if (a == "--fuse-k" && i+1 < argc)         fuse_k = atoi(argv[++i]);
+        else if (a == "--fuse-mode" && i+1 < argc)      fuse_keep = std::string(argv[++i]) != "averaged";
         else if (a == "--splat-scale" && i+1 < argc)    splat_scale = (float) atof(argv[++i]);
         else if (a == "-h" || a == "--help")            return usage(argv[0]);
         else if (!model)                                model = argv[i];
@@ -249,7 +250,7 @@ int main(int argc, char ** argv) {
         }
         if (fuse && splat_prefix) {
             free_splatter_point * fc = nullptr; size_t nf = 0;
-            free_splatter_accumulator_fuse(acc, voxel, fuse_k, &fc, &nf);
+            free_splatter_accumulator_fuse(acc, voxel, fuse_k, fuse_keep ? 1 : 0, &fc, &nf);
             char path[1024];
             std::snprintf(path, sizeof path, "%s_fused.splat", splat_prefix);
             write_cloud_splat(fc, nf, (size_t) max_splats, splat_scale, path);
diff --git a/web/README.md b/web/README.md
index 47b11c8..21df4eb 100644
--- a/web/README.md
+++ b/web/README.md
@@ -43,8 +43,11 @@ python3 -m http.server -d .cache/demo/accumulate 8080   # open http://localhost:
 
 URL params: `?ms=2600` step interval, `?start=N` begin at step N, `?auto=0` no
 auto-advance, `?spin=0` no auto-orbit. The cloud shown is the raw accumulated
-cloud; the bake also writes a consensus-fused `acc_fused.splat` (the edge-haze
-floaters removed), shown as a final step.
+cloud; the bake also writes a consensus-fused `acc_fused.splat` (single-view
+floaters removed — only voxels seen by ≥ K frames survive), shown as a final step.
+`--fuse-mode kept` (default) keeps every raw gaussian in a consensus voxel (dense);
+`--fuse-mode averaged` collapses each voxel to one denoised point (cleaner but
+sparse — only worthwhile with many overlapping frames).
 
 **Pick frames with lateral baseline.** Two-view depth needs the camera to
 *translate sideways* — a pure forward dolly (the camera moving along the direction

From d7d0cb611b5654185052a8d56f9f4ef73ec184d5 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 23:55:11 +0100
Subject: [PATCH 24/33] pose: de-ghost the accumulated cloud -- gaussian-level
 consensus_refine + best-frame fusion

Pairwise Sim(3) chaining leaves per-object (non-rigid) misregistration: overlapping
frames show doubled objects (two lamps, a bed end offset from its top). A rigid
per-frame pose BA can't fix this (tried, measured: 0.6%->0.6%, no-op -- reverted) --
the residual isn't a camera-pose error, it's that each frame predicts an object's
position slightly differently. Two fixes that DO work:

- consensus_refine (gaussian-level, non-rigid): each iteration moves every point a
  fraction toward the opacity-weighted consensus of the OTHER frames' points in its
  coarse-to-fine voxel neighbourhood. Local + non-rigid, so spatially-varying
  ghosting collapses. On the real strong-arc cloud: ghosting 1.2% -> 0.26% (~5x);
  golden 3.7% -> 0.0%. Exposed as Accumulator::refine, free_splatter_refine_cloud /
  _accumulator_refine, CLI --refine.
- consensus_fuse gains FUSE_BEST: per consensus voxel keep only the single most-
  confident frame's gaussians (dense AND de-ghosted, no stacked copies). keep_raw
  bool -> mode int {averaged,kept,best}; CLI --fuse-mode best.

Golden tests pin both (consensus_refine de-ghost, best=one-frame-per-voxel);
fuzz_pose drives the refine functions + all three fuse modes on garbage. ctest
-LE model green, fuzz clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 fuzz/fuzz_pose.cpp          | 10 ++--
 include/free_splatter.h     | 22 +++++++--
 src/free_splatter.cpp       | 23 ++++++++-
 src/pose.cpp                | 97 +++++++++++++++++++++++++++++++++++--
 src/pose.h                  | 29 ++++++++---
 tests/test_pose.cpp         | 61 +++++++++++++++++++++--
 tools/free_splatter-cli.cpp | 16 ++++--
 7 files changed, 231 insertions(+), 27 deletions(-)

diff --git a/fuzz/fuzz_pose.cpp b/fuzz/fuzz_pose.cpp
index ccdf615..de8a016 100644
--- a/fuzz/fuzz_pose.cpp
+++ b/fuzz/fuzz_pose.cpp
@@ -46,13 +46,17 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) {
             free_splatter_accumulator_add_pair(acc, g.data(), gc);
         }
         free_splatter_point * cloud = nullptr; size_t nc = 0;
-        if (free_splatter_accumulator_cloud(acc, &cloud, &nc) == 0) free_splatter_buf_free(cloud);
+        if (free_splatter_accumulator_cloud(acc, &cloud, &nc) == 0) {
+            free_splatter_refine_cloud(cloud, nc, 0.01f + (data[6] % 8) * 0.02f, 1 + (data[7] % 4), 0.5f);
+            free_splatter_buf_free(cloud);
+        }
+        free_splatter_accumulator_refine(acc, 0.03f, 2, 0.5f);   // de-ghost on garbage
 
         const float voxel = 0.005f + (data[3] % 16) * 0.01f;   // 0.005..0.155
         const int k = 1 + (data[4] % 4);                       // 1..4
-        const int keep_raw = data[5] & 1;                      // both fuse modes
+        const int mode = data[5] % 3;                          // averaged / kept / best
         free_splatter_point * fused = nullptr; size_t nf = 0;
-        if (free_splatter_accumulator_fuse(acc, voxel, k, keep_raw, &fused, &nf) == 0) free_splatter_buf_free(fused);
+        if (free_splatter_accumulator_fuse(acc, voxel, k, mode, &fused, &nf) == 0) free_splatter_buf_free(fused);
 
         float * path = nullptr; int32_t nfr = 0;
         if (free_splatter_accumulator_camera_path(acc, &path, &nfr) == 0) free_splatter_buf_free(path);
diff --git a/include/free_splatter.h b/include/free_splatter.h
index b783340..be2cb15 100644
--- a/include/free_splatter.h
+++ b/include/free_splatter.h
@@ -126,12 +126,12 @@ FREE_SPLATTER_API int free_splatter_accumulator_cloud(
     const free_splatter_accumulator * acc, free_splatter_point ** out, size_t * n_out);
 
 // Consensus-fuse the current cloud: keep only voxels (size voxel_frac * extent)
-// corroborated by >= k distinct source frames (single-frame floaters dropped).
-// keep_raw=0 averages each consensus voxel to one point (denoised but decimated);
-// keep_raw=1 keeps every raw gaussian in a consensus voxel (dense, nothing averaged
-// away — better when there are few frames). *out malloc'd as above. Returns 0 ok.
+// corroborated by >= k distinct source frames (single-frame floaters dropped). mode:
+// 0 = averaged (one denoised point per voxel; sparse), 1 = kept (every raw gaussian
+// in a consensus voxel; dense), 2 = best (only the most-confident frame's gaussians
+// per voxel; dense AND de-ghosted). *out malloc'd as above. Returns 0 on success.
 FREE_SPLATTER_API int free_splatter_accumulator_fuse(
-    const free_splatter_accumulator * acc, float voxel_frac, int32_t k, int32_t keep_raw,
+    const free_splatter_accumulator * acc, float voxel_frac, int32_t k, int32_t mode,
     free_splatter_point ** out, size_t * n_out);
 
 // Copy the global camera trajectory: *out malloc'd (*n_frames)*16 float32, a
@@ -139,6 +139,18 @@ FREE_SPLATTER_API int free_splatter_accumulator_fuse(
 FREE_SPLATTER_API int free_splatter_accumulator_camera_path(
     const free_splatter_accumulator * acc, float ** out, int32_t * n_frames);
 
+// Geometric de-ghost a cloud in place (gaussian-level consensus refinement): each
+// iteration non-rigidly pulls every gaussian a fraction `alpha` toward the
+// multi-frame consensus of its (coarse-to-fine voxel_frac) neighbourhood — removing
+// the doubled objects the pairwise pose chain leaves. Returns 0 on success.
+FREE_SPLATTER_API int free_splatter_refine_cloud(
+    free_splatter_point * points, size_t n, float voxel_frac, int32_t iters, float alpha);
+
+// Same, applied to the accumulator's internal cloud (so a subsequent _fuse also
+// operates on the de-ghosted cloud). Returns 0 on success, -1 on bad args.
+FREE_SPLATTER_API int free_splatter_accumulator_refine(
+    free_splatter_accumulator * acc, float voxel_frac, int32_t iters, float alpha);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/free_splatter.cpp b/src/free_splatter.cpp
index 00a7105..cd4155d 100644
--- a/src/free_splatter.cpp
+++ b/src/free_splatter.cpp
@@ -254,15 +254,34 @@ int free_splatter_accumulator_cloud(const free_splatter_accumulator * acc,
 }
 
 int free_splatter_accumulator_fuse(const free_splatter_accumulator * acc, float voxel_frac,
-                                   int32_t k, int32_t keep_raw, free_splatter_point ** out, size_t * n_out) {
+                                   int32_t k, int32_t mode, free_splatter_point ** out, size_t * n_out) {
     if (out) *out = nullptr;
     if (n_out) *n_out = 0;
     if (!acc || !out || !n_out || k < 1 || !(voxel_frac > 0)) return -1;
     std::vector<free_splatter::pose::AccumPoint> fused;
-    free_splatter::pose::consensus_fuse(acc->acc.cloud(), voxel_frac, k, fused, keep_raw != 0);
+    free_splatter::pose::consensus_fuse(acc->acc.cloud(), voxel_frac, k, fused, mode);
     return emit_points(fused, out, n_out);
 }
 
+int free_splatter_refine_cloud(free_splatter_point * points, size_t n,
+                               float voxel_frac, int32_t iters, float alpha) {
+    if (!points || n == 0 || !(voxel_frac > 0) || iters < 1) return -1;
+    // free_splatter_point and pose::AccumPoint are the same trivially-copyable layout
+    // (xyz, rgba/opacity, scale, quat, frame); copy into the typed vector and back.
+    std::vector<free_splatter::pose::AccumPoint> cloud(n);
+    std::memcpy(cloud.data(), points, n * sizeof(free_splatter_point));
+    free_splatter::pose::consensus_refine(cloud, voxel_frac, iters, alpha);
+    std::memcpy(points, cloud.data(), n * sizeof(free_splatter_point));
+    return 0;
+}
+
+int free_splatter_accumulator_refine(free_splatter_accumulator * acc, float voxel_frac,
+                                     int32_t iters, float alpha) {
+    if (!acc || !(voxel_frac > 0) || iters < 1) return -1;
+    acc->acc.refine(voxel_frac, iters, alpha);
+    return 0;
+}
+
 int free_splatter_accumulator_camera_path(const free_splatter_accumulator * acc,
                                           float ** out, int32_t * n_frames) {
     if (out) *out = nullptr;
diff --git a/src/pose.cpp b/src/pose.cpp
index ce84ce3..e304000 100644
--- a/src/pose.cpp
+++ b/src/pose.cpp
@@ -999,6 +999,10 @@ ChainLink Accumulator::add_pair(const float * g, int gc, double focal) {
     return link;
 }
 
+double Accumulator::refine(double voxel_frac, int iters, double alpha) {
+    return consensus_refine(cloud_, voxel_frac, iters, alpha);
+}
+
 std::vector<Mat4> Accumulator::camera_path() const {
     std::vector<Mat4> path = cams_;
     if (!cams_.empty()) path.push_back(final_cam_);
@@ -1008,7 +1012,7 @@ std::vector<Mat4> Accumulator::camera_path() const {
 // ---- consensus fusion -----------------------------------------------------
 
 FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_frac, int k,
-                         std::vector<AccumPoint> & fused, bool keep_raw) {
+                         std::vector<AccumPoint> & fused, int mode) {
     fused.clear();
     FuseStats st{};
     st.raw_points = (int64_t) cloud.size();
@@ -1042,7 +1046,12 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
     if (!(v > 0)) return st;
 
     struct Vox { double sx=0,sy=0,sz=0,sr=0,sg=0,sb=0,sop=0; double ssx=0,ssy=0,ssz=0; int64_t cnt=0;
-                 float q[4]={1,0,0,0}; bool has_q=false; std::vector<int32_t> frames; };
+                 float q[4]={1,0,0,0}; bool has_q=false;
+                 std::vector<std::pair<int32_t,double>> frames;   // (frame, opacity weight)
+                 int best_frame() const {
+                     int32_t b=-1; double bw=-1;
+                     for (auto & fr : frames) if (fr.second > bw) { bw=fr.second; b=fr.first; }
+                     return b; } };
     struct Key { int32_t i,j,k; bool operator==(const Key&o) const { return i==o.i&&j==o.j&&k==o.k; } };
     struct KeyHash { size_t operator()(const Key&q) const {
         uint64_t h = (uint64_t)(uint32_t)q.i * 0x9E3779B97F4A7C15ULL;
@@ -1066,9 +1075,10 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
         vx.sx+=p.x; vx.sy+=p.y; vx.sz+=p.z; vx.sr+=p.r; vx.sg+=p.g; vx.sb+=p.b; vx.sop+=p.opacity; vx.cnt++;
         vx.ssx+=p.sx; vx.ssy+=p.sy; vx.ssz+=p.sz;
         if (!vx.has_q) { vx.q[0]=p.qw; vx.q[1]=p.qx; vx.q[2]=p.qy; vx.q[3]=p.qz; vx.has_q=true; } // representative orientation
+        const double w = std::max((double) p.opacity, 1e-3);
         bool seen = false;
-        for (int32_t f : vx.frames) if (f == p.frame) { seen = true; break; }
-        if (!seen) vx.frames.push_back(p.frame);
+        for (auto & fr : vx.frames) if (fr.first == p.frame) { fr.second += w; seen = true; break; }
+        if (!seen) vx.frames.push_back({ p.frame, w });
     }
 
     st.voxels = (int64_t) grid.size();
@@ -1079,7 +1089,7 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
     }
     st.points_dropped = st.raw_points - st.points_kept;
 
-    if (keep_raw) {
+    if (mode == FUSE_KEPT) {
         // DENSE: keep every raw gaussian whose voxel is corroborated by >= k frames
         // (floaters dropped, nothing averaged/decimated away).
         fused.reserve((size_t) st.points_kept);
@@ -1089,6 +1099,17 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
             auto it = grid.find(key);
             if (it != grid.end() && (int) it->second.frames.size() >= k) fused.push_back(p);
         }
+    } else if (mode == FUSE_BEST) {
+        // DENSE + DE-GHOSTED: in each consensus voxel keep only the single most-
+        // confident frame's gaussians, so overlapping (disagreeing) copies aren't
+        // stacked — one frame represents each region.
+        for (const auto & p : cloud) {
+            if (!finite(p)) continue;
+            Key key{ vcoord(p.x, lo[0]), vcoord(p.y, lo[1]), vcoord(p.z, lo[2]) };
+            auto it = grid.find(key);
+            if (it != grid.end() && (int) it->second.frames.size() >= k && p.frame == it->second.best_frame())
+                fused.push_back(p);
+        }
     } else {
         // DENOISED: one averaged gaussian per consensus voxel.
         for (const auto & kv : grid) {
@@ -1107,6 +1128,72 @@ FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_fra
     return st;
 }
 
+// ---- gaussian-level geometric de-ghost -------------------------------------
+
+double consensus_refine(std::vector<AccumPoint> & cloud, double voxel_frac, int iters, double alpha) {
+    if (cloud.size() < 2 || iters < 1) return 0.0;
+    auto finite = [](const AccumPoint & p) {
+        return std::isfinite(p.x) && std::isfinite(p.y) && std::isfinite(p.z);
+    };
+    struct Key { int32_t i,j,k; bool operator==(const Key&o) const { return i==o.i&&j==o.j&&k==o.k; } };
+    struct KeyHash { size_t operator()(const Key&q) const {
+        uint64_t h = (uint64_t)(uint32_t)q.i * 0x9E3779B97F4A7C15ULL;
+        h ^= (uint64_t)(uint32_t)q.j + 0x9E3779B9U + (h<<6) + (h>>2);
+        h ^= (uint64_t)(uint32_t)q.k + 0x85EBCA6BU + (h<<6) + (h>>2);
+        return (size_t) h; } };
+    struct FC { int32_t f; double sx,sy,sz,w; };   // per-frame opacity-weighted centroid
+
+    double disagree = 0.0;
+    for (int it = 0; it < iters; it++) {
+        double mean[3]={0,0,0}, lo[3]={1e300,1e300,1e300}; int64_t nf=0;
+        for (const auto & p : cloud) { if (!finite(p)) continue;
+            mean[0]+=p.x; mean[1]+=p.y; mean[2]+=p.z;
+            lo[0]=std::min(lo[0],(double)p.x); lo[1]=std::min(lo[1],(double)p.y); lo[2]=std::min(lo[2],(double)p.z); nf++; }
+        if (nf == 0) return disagree;
+        for (int c=0;c<3;c++) mean[c]/=nf;
+        double ss=0; for (const auto & p : cloud) { if (!finite(p)) continue;
+            const double dx=p.x-mean[0], dy=p.y-mean[1], dz=p.z-mean[2]; ss+=dx*dx+dy*dy+dz*dz; }
+        const double ext = std::sqrt(ss / nf);
+        const double frac = (iters > 1)
+            ? voxel_frac * std::pow(2.0, 1.0 - (double) it / (iters - 1))   // 2*vf -> vf (coarse->fine)
+            : voxel_frac;
+        const double v = frac * ext;
+        if (!(v > 0)) return disagree;
+        auto vc = [&](double x, double l) -> int32_t {
+            const double c = std::floor((x - l) / v);
+            if (c < -2.1e9) return -2100000000;
+            if (c >  2.1e9) return  2100000000;
+            return (int32_t) c; };
+
+        // pass 1: per-voxel, per-frame opacity-weighted centroid (a snapshot)
+        std::unordered_map<Key, std::vector<FC>, KeyHash> grid;
+        grid.reserve(cloud.size()/2 + 1);
+        for (const auto & p : cloud) { if (!finite(p)) continue;
+            Key key{ vc(p.x,lo[0]), vc(p.y,lo[1]), vc(p.z,lo[2]) };
+            const double w = std::max((double)p.opacity, 1e-3);
+            auto & vec = grid[key]; bool hit=false;
+            for (auto & fc : vec) if (fc.f==p.frame) { fc.sx+=w*p.x; fc.sy+=w*p.y; fc.sz+=w*p.z; fc.w+=w; hit=true; break; }
+            if (!hit) vec.push_back({ p.frame, w*(double)p.x, w*(double)p.y, w*(double)p.z, w });
+        }
+        // pass 2: move each point toward the OTHER frames' consensus in its voxel
+        double dsum=0; int64_t dcnt=0;
+        for (auto & p : cloud) { if (!finite(p)) continue;
+            Key key{ vc(p.x,lo[0]), vc(p.y,lo[1]), vc(p.z,lo[2]) };
+            auto git = grid.find(key);
+            if (git == grid.end() || git->second.size() < 2) continue;
+            double tx=0,ty=0,tz=0,tw=0;
+            for (const auto & fc : git->second) { if (fc.f==p.frame) continue; tx+=fc.sx; ty+=fc.sy; tz+=fc.sz; tw+=fc.w; }
+            if (tw <= 0) continue;
+            tx/=tw; ty/=tw; tz/=tw;
+            const double dx=tx-p.x, dy=ty-p.y, dz=tz-p.z;
+            dsum += dx*dx+dy*dy+dz*dz; dcnt++;
+            p.x += (float)(alpha*dx); p.y += (float)(alpha*dy); p.z += (float)(alpha*dz);
+        }
+        disagree = (dcnt && ext > 0) ? std::sqrt(dsum/dcnt)/ext : 0.0;
+    }
+    return disagree;
+}
+
 // ---- loop closure ---------------------------------------------------------
 
 Mat4 sim4_invert(const Mat4 & M) {
diff --git a/src/pose.h b/src/pose.h
index 05366c5..3e50390 100644
--- a/src/pose.h
+++ b/src/pose.h
@@ -157,6 +157,8 @@ class Accumulator {
     int  frame_count() const { return next_frame_; }
     int  pair_count()  const { return (int) links_.size(); }
     const Sim3 & global_transform() const { return T_; }
+    // Geometric de-ghost the accumulated cloud in place (consensus_refine).
+    double refine(double voxel_frac = 0.03, int iters = 8, double alpha = 0.5);
 
 private:
     void add_view(const float * pts, const float * op, const float * rgb,
@@ -183,13 +185,28 @@ struct FuseStats {
     int64_t raw_points, voxels, kept_voxels, points_kept, points_dropped;
 };
 // Voxelize the cloud at voxel_frac * cloud-extent and keep only voxels corroborated
-// by >= k DISTINCT source frames (single-frame floaters dropped). With keep_raw =
-// false this DENOISES: `fused` gets one averaged gaussian per consensus voxel (clean
-// but decimated — sparse when few voxels are multiply-observed). With keep_raw =
-// true it keeps DENSITY: `fused` gets every raw gaussian whose voxel is corroborated
-// (floaters removed, nothing averaged away) — the better choice for few frames.
+// by >= k DISTINCT source frames (single-frame floaters dropped). `mode` selects how
+// a consensus voxel is emitted:
+//   FUSE_AVERAGED (0): one averaged gaussian per voxel — denoised but decimated
+//                      (sparse unless many frames overlap).
+//   FUSE_KEPT (1):     every raw gaussian in the voxel — dense, nothing averaged.
+//   FUSE_BEST (2):     only the single most-confident frame's gaussians per voxel —
+//                      dense AND de-ghosted (no overlapping copies stacked).
+enum FuseMode { FUSE_AVERAGED = 0, FUSE_KEPT = 1, FUSE_BEST = 2 };
 FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_frac, int k,
-                         std::vector<AccumPoint> & fused, bool keep_raw = false);
+                         std::vector<AccumPoint> & fused, int mode = FUSE_AVERAGED);
+
+// ---- gaussian-level geometric de-ghost (multi-view consensus refinement) ----
+// Pairwise Sim(3) chaining leaves per-OBJECT (non-rigid) misregistration that a
+// per-frame pose fit can't remove, so overlapping frames show doubled objects.
+// This refines the cloud IN PLACE at the gaussian level: each iteration, for every
+// point, moves it a fraction `alpha` toward the opacity-weighted consensus of the
+// OTHER frames' points in its (coarse-to-fine) voxel neighbourhood. Non-rigid and
+// local, so spatially-varying ghosting collapses to one surface; single-frame
+// regions (no other frame nearby) are left untouched. Returns the RMS point-to-
+// consensus distance after refinement, as a fraction of cloud extent.
+double consensus_refine(std::vector<AccumPoint> & cloud, double voxel_frac = 0.03,
+                        int iters = 8, double alpha = 0.5);
 
 // ---- loop closure (mirrors loop_closure.py) -------------------------------
 // Inverse of a similarity 4x4 [[sR,t],[0,1]] (sR = s*rotation): [[(1/s)Rᵀ,...],[0,1]].
diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp
index bf31ac0..6b2b021 100644
--- a/tests/test_pose.cpp
+++ b/tests/test_pose.cpp
@@ -540,16 +540,21 @@ static void test_consensus_fuse() {
                           1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, j % 4 });
 
     std::vector<AccumPoint> fused;
-    FuseStats st = consensus_fuse(cloud, /*voxel_frac=*/0.02, /*k=*/2, fused);
+    FuseStats st = consensus_fuse(cloud, /*voxel_frac=*/0.02, /*k=*/2, fused, FUSE_AVERAGED);
     check("raw points", st.raw_points == 35);
     check("kept voxels (the surface)", st.kept_voxels == 5);
     check("fused size (averaged: one per voxel)", fused.size() == 5);
     check("points kept", st.points_kept == 15);
     check("floaters dropped", st.points_dropped == 20);
-    // keep_raw=true: dense — every raw point in a consensus voxel, floaters dropped
+    // FUSE_KEPT: dense — every raw point in a consensus voxel, floaters dropped
     std::vector<AccumPoint> kept;
-    consensus_fuse(cloud, /*voxel_frac=*/0.02, /*k=*/2, kept, /*keep_raw=*/true);
+    consensus_fuse(cloud, /*voxel_frac=*/0.02, /*k=*/2, kept, FUSE_KEPT);
     check("kept mode: dense (all 15 consensus raw points)", kept.size() == 15);
+    // FUSE_BEST: per voxel only the most-confident frame. Each surface voxel has 3
+    // frames with equal opacity (0.5) so frame 0 (first seen) wins the tie -> 5 points.
+    std::vector<AccumPoint> best;
+    consensus_fuse(cloud, /*voxel_frac=*/0.02, /*k=*/2, best, FUSE_BEST);
+    check("best mode: one frame per voxel (5 points)", best.size() == 5);
     // a fused point sits at its cluster centroid (~2*loc), color preserved
     bool centroid_ok = true;
     for (const auto & p : fused) {
@@ -559,6 +564,55 @@ static void test_consensus_fuse() {
     check("fused centroid + color", centroid_ok);
 }
 
+// Gaussian-level de-ghost: a base scene seen by 3 frames through small per-object
+// misalignments ghosts; consensus_refine (non-rigid, per-point) must pull the
+// overlapping copies together.
+static void test_consensus_refine() {
+    std::printf("test_consensus_refine\n");
+    // well-separated base points (a grid, spacing 1) so each object's 3 copies share
+    // a voxel but distinct objects don't — the real (2D surface) regime.
+    const int side = 4, M = side*side*side, Fr = 3;
+    std::normal_distribution<double> g(0, 1);
+    std::vector<double> base(3*M);
+    { int idx=0; for (int a=0;a<side;a++) for (int b=0;b<side;b++) for (int c=0;c<side;c++) {
+        base[3*idx]=a; base[3*idx+1]=b; base[3*idx+2]=c; idx++; } }
+    auto small_rot = [&](double deg) {
+        Vec3 ax = { g(RNG), g(RNG), g(RNG) };
+        double n = std::sqrt(ax[0]*ax[0]+ax[1]*ax[1]+ax[2]*ax[2]); ax={ax[0]/n,ax[1]/n,ax[2]/n};
+        const double a=deg*M_PI/180, c=std::cos(a), s=std::sin(a), v=1-c, x=ax[0], y=ax[1], z=ax[2];
+        return Mat3{ { c+x*x*v, x*y*v-z*s, x*z*v+y*s,  y*x*v+z*s, c+y*y*v, y*z*v-x*s,  z*x*v-y*s, z*y*v+x*s, c+z*z*v } };
+    };
+    Sim3 P[3] = { sim_identity(),
+                  { 1.0, small_rot(1.0), { 0.04, -0.03, 0.035 } },
+                  { 1.0, small_rot(1.0), { -0.035, 0.04, -0.03 } } };
+    std::vector<AccumPoint> cloud(Fr*M);
+    for (int f = 0; f < Fr; f++) for (int i = 0; i < M; i++) {
+        Vec3 b = { base[3*i], base[3*i+1], base[3*i+2] };
+        Vec3 w = sim_apply(P[f], b);
+        AccumPoint p{}; p.x=(float)w[0]; p.y=(float)w[1]; p.z=(float)w[2];
+        p.opacity=1.0f; p.sx=p.sy=p.sz=0.01f; p.qw=1.0f; p.frame=f;
+        cloud[(size_t)f*M + i] = p;
+    }
+    double mean[3]={0,0,0}; for (auto&p:cloud){ mean[0]+=p.x;mean[1]+=p.y;mean[2]+=p.z; }
+    for (int c=0;c<3;c++) mean[c]/=cloud.size();
+    double ss=0; for (auto&p:cloud){ double dx=p.x-mean[0],dy=p.y-mean[1],dz=p.z-mean[2]; ss+=dx*dx+dy*dy+dz*dz; }
+    const double ext = std::sqrt(ss/cloud.size());
+    auto spread = [&](const std::vector<AccumPoint>& c) {
+        double s=0;
+        for (int i=0;i<M;i++) { double cx=0,cy=0,cz=0;
+            for (int f=0;f<Fr;f++){ cx+=c[(size_t)f*M+i].x; cy+=c[(size_t)f*M+i].y; cz+=c[(size_t)f*M+i].z; }
+            cx/=Fr; cy/=Fr; cz/=Fr;
+            for (int f=0;f<Fr;f++){ const auto&p=c[(size_t)f*M+i]; s += (p.x-cx)*(p.x-cx)+(p.y-cy)*(p.y-cy)+(p.z-cz)*(p.z-cz); } }
+        return std::sqrt(s/(M*Fr))/ext;
+    };
+    const double pre = spread(cloud);
+    consensus_refine(cloud, /*voxel_frac=*/0.1, /*iters=*/12, /*alpha=*/0.6);
+    const double post = spread(cloud);
+    char buf[80]; std::snprintf(buf, sizeof buf, "ghosting %.3f%% -> %.3f%% of extent", 100*pre, 100*post);
+    check("ghosting present before", pre > 0.01);
+    check("consensus_refine de-ghosts", post < 0.4 * pre, buf);
+}
+
 static void test_loop_distribute() {
     std::printf("test_loop_distribute\n");
     Mat3 R = rand_rotation();
@@ -680,6 +734,7 @@ int main() {
     test_pnp_robust_outliers();
     test_accumulate_chain();
     test_consensus_fuse();
+    test_consensus_refine();
     test_loop_distribute();
     test_splat_record();
     test_accumulate_channels();
diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp
index c544623..b6a69c4 100644
--- a/tools/free_splatter-cli.cpp
+++ b/tools/free_splatter-cli.cpp
@@ -143,9 +143,11 @@ int main(int argc, char ** argv) {
     const char * model = nullptr, * splat_prefix = nullptr;
     float opac_thr = 5e-3f;
     long  max_splats = 0;
-    bool  accumulate = false, fuse = false, fuse_keep = true;  // dense (kept) by default
+    bool  accumulate = false, fuse = false;
+    int   fuse_mode = 1;                         // 0 averaged, 1 kept, 2 best-frame
     float voxel = 0.02f, splat_scale = 1.0f;   // multiplier on the predicted gaussian radii
     int   fuse_k = 2;
+    bool  refine = false; int refine_iters = 8; float refine_voxel = 0.03f, refine_alpha = 0.5f;  // geometric de-ghost
     std::vector<std::string> inputs;
 
     for (int i = 1; i < argc; i++) {
@@ -161,7 +163,12 @@ int main(int argc, char ** argv) {
         else if (a == "--fuse")                         fuse = true;
         else if (a == "--voxel" && i+1 < argc)          voxel = (float) atof(argv[++i]);
         else if (a == "--fuse-k" && i+1 < argc)         fuse_k = atoi(argv[++i]);
-        else if (a == "--fuse-mode" && i+1 < argc)      fuse_keep = std::string(argv[++i]) != "averaged";
+        else if (a == "--fuse-mode" && i+1 < argc) { std::string m = argv[++i];
+            fuse_mode = (m=="averaged") ? 0 : (m=="best") ? 2 : 1; }
+        else if (a == "--refine")                       refine = true;
+        else if (a == "--refine-iters" && i+1 < argc)   refine_iters = atoi(argv[++i]);
+        else if (a == "--refine-voxel" && i+1 < argc)   refine_voxel = (float) atof(argv[++i]);
+        else if (a == "--refine-alpha" && i+1 < argc)   refine_alpha = (float) atof(argv[++i]);
         else if (a == "--splat-scale" && i+1 < argc)    splat_scale = (float) atof(argv[++i]);
         else if (a == "-h" || a == "--help")            return usage(argv[0]);
         else if (!model)                                model = argv[i];
@@ -242,6 +249,8 @@ int main(int argc, char ** argv) {
             if (splat_prefix) {
                 free_splatter_point * cloud = nullptr; size_t nc = 0;
                 free_splatter_accumulator_cloud(acc, &cloud, &nc);
+                if (refine && nframes >= 2)            // geometric de-ghost a copy (internal untouched)
+                    free_splatter_refine_cloud(cloud, nc, refine_voxel, refine_iters, refine_alpha);
                 char path[1024];
                 std::snprintf(path, sizeof path, "%s_%d.splat", splat_prefix, nframes);
                 write_cloud_splat(cloud, nc, (size_t) max_splats, splat_scale, path);
@@ -249,8 +258,9 @@ int main(int argc, char ** argv) {
             }
         }
         if (fuse && splat_prefix) {
+            if (refine) free_splatter_accumulator_refine(acc, refine_voxel, refine_iters, refine_alpha);
             free_splatter_point * fc = nullptr; size_t nf = 0;
-            free_splatter_accumulator_fuse(acc, voxel, fuse_k, fuse_keep ? 1 : 0, &fc, &nf);
+            free_splatter_accumulator_fuse(acc, voxel, fuse_k, fuse_mode, &fc, &nf);
             char path[1024];
             std::snprintf(path, sizeof path, "%s_fused.splat", splat_prefix);
             write_cloud_splat(fc, nf, (size_t) max_splats, splat_scale, path);

From 3ab0b956655293b65cf706fc92bc24a5dae80824 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sat, 27 Jun 2026 23:56:31 +0100
Subject: [PATCH 25/33] demo: de-ghost the bake by default (--refine) +
 document the fuse/refine knobs

make_accumulate_demo.sh now passes --refine (gaussian-level consensus de-ghost) by
default (REFINE=0 to disable) and the live demo ends on a best-frame fused surface.
web/README documents the three fuse modes (kept/best/averaged) and the de-ghosting
(why gaussian-level consensus works where rigid pose-BA doesn't).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 scripts/make_accumulate_demo.sh |  5 ++++-
 web/README.md                   | 27 +++++++++++++++++++++------
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/scripts/make_accumulate_demo.sh b/scripts/make_accumulate_demo.sh
index 4ff775f..456a43d 100755
--- a/scripts/make_accumulate_demo.sh
+++ b/scripts/make_accumulate_demo.sh
@@ -31,7 +31,10 @@ done
 # 2) accumulate: one engine pass over the stream writes acc_2.splat, acc_3.splat,
 #    ... (the cloud after each photo is folded in) + acc_fused.splat (the
 #    consensus surface: voxels seen by >= K frames, single-view floaters removed).
-"$CLI" --device "$DEVICE" --accumulate --fuse --splat-prefix "$OUT/acc" \
+#    --refine geometrically de-ghosts each step (pulls overlapping frames together);
+#    set REFINE=0 to disable.
+RFLAG=; [ "${REFINE:-1}" = 1 ] && RFLAG=--refine
+"$CLI" --device "$DEVICE" --accumulate $RFLAG --fuse --splat-prefix "$OUT/acc" \
   --max-splats "$MAXSPLATS" "$MODEL" "${FRAMES[@]}"
 
 # 3) manifest.json: one step per added photo (acc_n.splat + the n thumbnails, the
diff --git a/web/README.md b/web/README.md
index 21df4eb..f974b95 100644
--- a/web/README.md
+++ b/web/README.md
@@ -42,12 +42,27 @@ python3 -m http.server -d .cache/demo/accumulate 8080   # open http://localhost:
 ```
 
 URL params: `?ms=2600` step interval, `?start=N` begin at step N, `?auto=0` no
-auto-advance, `?spin=0` no auto-orbit. The cloud shown is the raw accumulated
-cloud; the bake also writes a consensus-fused `acc_fused.splat` (single-view
-floaters removed — only voxels seen by ≥ K frames survive), shown as a final step.
-`--fuse-mode kept` (default) keeps every raw gaussian in a consensus voxel (dense);
-`--fuse-mode averaged` collapses each voxel to one denoised point (cleaner but
-sparse — only worthwhile with many overlapping frames).
+auto-advance, `?spin=0` no auto-orbit. The bake also writes a consensus-fused
+`acc_fused.splat` (single-view floaters removed — only voxels seen by ≥ K frames
+survive), shown as a final step. `--fuse-mode kept` keeps every raw gaussian in a
+consensus voxel (dense); `best` keeps only the most-confident frame per voxel
+(dense AND de-ghosted); `averaged` collapses each voxel to one denoised point
+(cleaner but sparse — only worthwhile with many overlapping frames).
+
+**De-ghosting.** Pairwise chaining leaves per-object misregistration, so
+overlapping frames show doubled objects. `--refine` (on by default in the bake;
+`REFINE=0` to disable) runs a gaussian-level consensus refinement that
+non-rigidly pulls each point toward the agreement of the other frames — ~5× less
+ghosting on real clips. (A rigid per-frame pose bundle-adjust does **not** help
+here: the residual is per-object reconstruction disagreement, not camera-pose
+error.)
+
+**Pick frames with lateral baseline.** Two-view depth needs the camera to
+*translate sideways* — a pure forward dolly (the camera moving along the direction
+it faces) gives near-zero parallax and reconstructs blurry. Frames a few apart
+from an orbiting / strafing clip work far better than tightly-spaced frames of a
+walk-forward clip. Rule of thumb: the per-pair lateral baseline should be a few %
+of scene depth or more (`scripts/` had a `baseline` probe in git history).
 
 **Pick frames with lateral baseline.** Two-view depth needs the camera to
 *translate sideways* — a pure forward dolly (the camera moving along the direction

From 4238b33c63db488d641f4acd8c310bf1cdc73b65 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sun, 28 Jun 2026 08:47:24 +0100
Subject: [PATCH 26/33] pose: parallax estimation (after-inference C++ metric +
 independent cv2 reference)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Quantifies how well a 2-view pair constrains depth, two independent ways, to flag
pairs whose reconstruction can't be trusted (and to detect the model inventing
depth the images don't support).

- pose::parallax_stats / pair_parallax (src/pose.{h,cpp}): from the model's OWN
  recovered geometry (estimate_poses, normalize=false so cameras+points share the
  view-0 frame) compute the median triangulation angle, the baseline angle off the
  optical axis (0=dolly/no-parallax, 90=strafe/ideal), and baseline/median-depth.
  All angles scale-invariant.
- C-API free_splatter_pair_parallax + free_splatter_parallax (include/free_splatter.h,
  src/free_splatter.cpp); CLI `--parallax MODEL {img0 img1 | pair.f32}`.
- tests/test_pose.cpp test_parallax_geometry: golden on synthetic cameras — a
  strafe baseline gives lateral=90 / tri=atan(B/Z); a dolly of the SAME length
  gives ~0. Pins the metric measures depth-resolving motion, not raw displacement.
- scripts/parallax_ref.py: dev-time INDEPENDENT reference (cv2, nix devShell only,
  never shipped). Feature matches -> ORB-SLAM homography-vs-fundamental R_H
  (calibration-free degeneracy) + essential-matrix median triangulation angle.

Validation (re10k ladder, focal matched to the model): on the well-conditioned
pair the two agree to 0.3 deg (after 15.6 vs before 15.9); on near-degenerate
pairs the model over-reports parallax 2-4x (after 7.0/4.5 vs independent 1.7/1.7),
i.e. it hallucinates depth the images can't support -- exactly what the cross-check
is for. Wide-baseline orbits (Truck) starve sparse matching, so the geometric
reference is only reliable on well-textured moderate-baseline pairs.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 include/free_splatter.h     |  20 ++++++
 scripts/parallax_ref.py     | 137 ++++++++++++++++++++++++++++++++++++
 src/free_splatter.cpp       |  21 ++++++
 src/pose.cpp                |  69 ++++++++++++++++++
 src/pose.h                  |  26 +++++++
 tests/test_pose.cpp         |  31 ++++++++
 tools/free_splatter-cli.cpp |  41 ++++++++++-
 7 files changed, 344 insertions(+), 1 deletion(-)
 create mode 100644 scripts/parallax_ref.py

diff --git a/include/free_splatter.h b/include/free_splatter.h
index be2cb15..d27c8f4 100644
--- a/include/free_splatter.h
+++ b/include/free_splatter.h
@@ -93,6 +93,26 @@ FREE_SPLATTER_API int free_splatter_estimate_poses(
     int32_t gaussian_channels, float opacity_threshold,
     float * cam2world_out, float * focal_out);
 
+// After-inference parallax: how well a 2-view pair constrains depth, measured
+// from the model's OWN recovered geometry. Angles are scale-invariant.
+typedef struct {
+    float tri_angle_deg;       // median triangulation angle over confident points
+    float lateral_angle_deg;   // baseline angle off view-0 optical axis (0=dolly, 90=strafe)
+    float baseline_over_depth; // ||C1-C0|| / median point depth from camera 0
+    float baseline;
+    float median_depth;
+    float focal;               // estimated (or supplied) focal, px
+    int32_t n_points;          // confident points used for the medians
+} free_splatter_parallax;
+
+// Estimate parallax for the first two views of an engine output buffer.
+// gaussians: n_views(>=2)*height*width*gaussian_channels f32. Returns 0 on
+// success, -1 on bad args. A low tri_angle_deg (< ~1-2 deg) or near-zero
+// lateral_angle_deg means depth is ill-conditioned (reconstruction unreliable).
+FREE_SPLATTER_API int free_splatter_pair_parallax(
+    const float * gaussians, int32_t n_views, int32_t height, int32_t width,
+    int32_t gaussian_channels, float opacity_threshold, free_splatter_parallax * out);
+
 // One accumulated 3D gaussian in the global frame: position, color in [0,1], the
 // anisotropic scale and rotation quaternion (w,x,y,z) — transformed through the
 // cross-run Sim(3) so the cloud renders as oriented splats — and the source frame
diff --git a/scripts/parallax_ref.py b/scripts/parallax_ref.py
new file mode 100644
index 0000000..eab6bfe
--- /dev/null
+++ b/scripts/parallax_ref.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""Independent (model-free) parallax reference for a frame pair.
+
+Dev-time validation ONLY (never shipped; runs in the nix devShell, which provides
+cv2+numpy per flake.nix). The C++ engine's after-inference `--parallax` measures
+depth conditioning from the model's OWN recovered geometry; this measures it from
+raw 2D image evidence (feature matches -> epipolar geometry), so the two can be
+cross-checked. Agreement on a known-good-depth scene validates the engine metric;
+a large gap (geometric angle tiny, model angle confident) flags the model
+hallucinating depth.
+
+Two outputs:
+  * R_H  -- ORB-SLAM homography-vs-fundamental score ratio. ~>0.45 => a homography
+            explains the motion => pure-rotation / planar => no usable parallax.
+            Calibration-free.
+  * median triangulation angle (deg) -- COLMAP's depth-conditioning number; below
+            ~1-2 deg depth is ill-posed. Uses an assumed/supplied focal.
+
+Images are center-cropped square and resized to 512 to mirror the engine's
+preprocessing, so a focal in 512-px units matches the model's estimated focal.
+
+  nix develop -c python3 scripts/parallax_ref.py IMG0 IMG1 [--focal-px F]
+"""
+import argparse, sys
+import numpy as np
+import cv2
+
+
+def preprocess(path, size=512):
+    img = cv2.imread(path, cv2.IMREAD_COLOR)
+    if img is None:
+        sys.exit(f"cannot read {path}")
+    h, w = img.shape[:2]
+    s = min(h, w)
+    img = img[(h - s) // 2:(h - s) // 2 + s, (w - s) // 2:(w - s) // 2 + s]
+    return cv2.resize(img, (size, size), interpolation=cv2.INTER_AREA)
+
+
+def match(img0, img1):
+    g0, g1 = (cv2.cvtColor(i, cv2.COLOR_BGR2GRAY) for i in (img0, img1))
+    try:
+        det = cv2.SIFT_create(nfeatures=4000)
+        norm = cv2.NORM_L2
+    except AttributeError:
+        det = cv2.ORB_create(nfeatures=4000)
+        norm = cv2.NORM_HAMMING
+    k0, d0 = det.detectAndCompute(g0, None)
+    k1, d1 = det.detectAndCompute(g1, None)
+    if d0 is None or d1 is None or len(k0) < 8 or len(k1) < 8:
+        sys.exit("too few features")
+    bf = cv2.BFMatcher(norm)
+    good = [m for m, n in bf.knnMatch(d0, d1, k=2) if m.distance < 0.75 * n.distance]
+    p0 = np.float32([k0[m.queryIdx].pt for m in good])
+    p1 = np.float32([k1[m.trainIdx].pt for m in good])
+    return p0, p1
+
+
+def score_homography(H, p0, p1, sigma=1.0):
+    if H is None:
+        return 0.0
+    th, inv = 5.991, 1.0 / sigma ** 2
+    Hi = np.linalg.inv(H)
+    def transfer(M, a, b):
+        ah = np.c_[a, np.ones(len(a))]
+        q = (M @ ah.T).T
+        q = q[:, :2] / q[:, 2:3]
+        chi = np.sum((q - b) ** 2, axis=1) * inv
+        return np.where(chi < th, th - chi, 0.0)
+    return float(np.sum(transfer(H, p0, p1) + transfer(Hi, p1, p0)))
+
+
+def score_fundamental(F, p0, p1, sigma=1.0):
+    if F is None:
+        return 0.0
+    thF, thScore, inv = 3.841, 5.991, 1.0 / sigma ** 2
+    a0 = np.c_[p0, np.ones(len(p0))]
+    a1 = np.c_[p1, np.ones(len(p1))]
+    def epi(F, src, dst):                             # point-line chi in dst image
+        l = (F @ src.T).T                             # lines in dst
+        num = np.sum(l[:, :2] * dst, axis=1) + l[:, 2]
+        d2 = num ** 2 / (l[:, 0] ** 2 + l[:, 1] ** 2 + 1e-12)
+        chi = d2 * inv
+        return np.where(chi < thF, thScore - chi, 0.0)
+    s01 = epi(F, a0, p1)
+    s10 = epi(F.T, a1, p0)
+    return float(np.sum(s01 + s10))
+
+
+def triangulation_angle(p0, p1, f, cx=256.0, cy=256.0):
+    K = np.array([[f, 0, cx], [0, f, cy], [0, 0, 1]], np.float64)
+    E, _ = cv2.findEssentialMat(p0, p1, K, cv2.RANSAC, 0.999, 1.0)
+    if E is None or E.shape != (3, 3):
+        return None, 0
+    _, R, t, mP = cv2.recoverPose(E, p0, p1, K)
+    inl = (mP.ravel() > 0)
+    if inl.sum() < 8:
+        return None, int(inl.sum())
+    P0 = K @ np.eye(3, 4)
+    P1 = K @ np.hstack([R, t])
+    X4 = cv2.triangulatePoints(P0, P1, p0[inl].T, p1[inl].T)
+    X = (X4[:3] / X4[3]).T
+    C1 = (-R.T @ t).ravel()
+    r0 = X - 0.0
+    r1 = X - C1
+    n0 = np.linalg.norm(r0, axis=1)
+    n1 = np.linalg.norm(r1, axis=1)
+    ok = (n0 > 1e-9) & (n1 > 1e-9) & (X[:, 2] > 0)
+    cosang = np.sum(r0[ok] * r1[ok], axis=1) / (n0[ok] * n1[ok])
+    ang = np.degrees(np.arccos(np.clip(cosang, -1, 1)))
+    if len(ang) == 0:
+        return None, int(inl.sum())
+    return float(np.median(ang)), int(ok.sum())
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("img0"); ap.add_argument("img1")
+    ap.add_argument("--focal-px", type=float, default=None,
+                    help="focal in 512-px units; default from --fov-deg")
+    ap.add_argument("--fov-deg", type=float, default=60.0)
+    a = ap.parse_args()
+    f = a.focal_px if a.focal_px else 0.5 * 512 / np.tan(np.radians(a.fov_deg) / 2)
+
+    i0, i1 = preprocess(a.img0), preprocess(a.img1)
+    p0, p1 = match(i0, i1)
+    H, _ = cv2.findHomography(p0, p1, cv2.RANSAC, 3.0)
+    F, _ = cv2.findFundamentalMat(p0, p1, cv2.FM_RANSAC, 3.0, 0.99)
+    sH, sF = score_homography(H, p0, p1), score_fundamental(F, p0, p1)
+    rH = sH / (sH + sF) if (sH + sF) > 0 else float("nan")
+    tri, ntri = triangulation_angle(p0, p1, f)
+    tri_s = f"{tri:.3f}" if tri is not None else "n/a"
+    print(f"parallax_ref: matches={len(p0)}  R_H={rH:.3f} (>0.45 => degenerate)  "
+          f"tri_angle={tri_s} deg (npts={ntri})  focal={f:.1f}px")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/free_splatter.cpp b/src/free_splatter.cpp
index cd4155d..7305d21 100644
--- a/src/free_splatter.cpp
+++ b/src/free_splatter.cpp
@@ -225,6 +225,27 @@ int free_splatter_estimate_poses(const float * gaussians, int32_t n_views, int32
     return 0;
 }
 
+int free_splatter_pair_parallax(const float * gaussians, int32_t n_views, int32_t height,
+                                int32_t width, int32_t gaussian_channels,
+                                float opacity_threshold, free_splatter_parallax * out) {
+    if (!gaussians || !out || n_views < 2 || height < 1 || width < 1 || gaussian_channels < 16)
+        return -1;
+    const int P = height * width;
+    std::vector<std::vector<float>> pts, ops;
+    std::vector<const float *> pptr, optr;
+    deinterleave(gaussians, n_views, P, gaussian_channels, pts, ops, pptr, optr);
+    free_splatter::pose::Parallax px =
+        free_splatter::pose::pair_parallax(pptr, optr, height, width, opacity_threshold);
+    out->tri_angle_deg       = (float) px.tri_angle_deg;
+    out->lateral_angle_deg   = (float) px.lateral_angle_deg;
+    out->baseline_over_depth = (float) px.baseline_over_depth;
+    out->baseline            = (float) px.baseline;
+    out->median_depth        = (float) px.median_depth;
+    out->focal               = (float) px.focal;
+    out->n_points            = px.n_points;
+    return 0;
+}
+
 free_splatter_accumulator * free_splatter_accumulator_new(int32_t height, int32_t width,
                                                           float opacity_threshold) {
     if (height < 1 || width < 1) return nullptr;
diff --git a/src/pose.cpp b/src/pose.cpp
index e304000..15a15d7 100644
--- a/src/pose.cpp
+++ b/src/pose.cpp
@@ -849,6 +849,75 @@ PoseResult estimate_poses(const std::vector<const float *> & points,
     return res;
 }
 
+// ---- after-inference parallax ---------------------------------------------
+
+Parallax parallax_stats(const Vec3 & C0, const Vec3 & C1, const Vec3 & axis0,
+                        const float * pts, const float * mask, int N, double mask_thr) {
+    constexpr double PI = 3.14159265358979323846;
+    auto sub = [](const Vec3 & a, const Vec3 & b) -> Vec3 { return { a[0]-b[0], a[1]-b[1], a[2]-b[2] }; };
+    auto dot = [](const Vec3 & a, const Vec3 & b) { return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; };
+    auto nrm = [&](const Vec3 & a) { return std::sqrt(dot(a, a)); };
+
+    Parallax px{};
+    // unit optical axis of view 0
+    Vec3 ax = axis0; const double an = nrm(ax);
+    if (an > 1e-12) { ax[0]/=an; ax[1]/=an; ax[2]/=an; }
+
+    // baseline split: forward (dolly, no parallax) vs lateral (strafe, parallax)
+    const Vec3 b = sub(C1, C0);
+    px.baseline = nrm(b);
+    const double fwd = dot(b, ax);
+    const Vec3 blat = { b[0]-fwd*ax[0], b[1]-fwd*ax[1], b[2]-fwd*ax[2] };
+    px.lateral_angle_deg = std::atan2(nrm(blat), std::fabs(fwd)) * 180.0 / PI;
+
+    // per confident point: depth from cam0 + triangulation angle (rays from C0,C1)
+    std::vector<double> depths, angles;
+    depths.reserve(N); angles.reserve(N);
+    for (int i = 0; i < N; i++) {
+        if (mask && !(mask[i] > mask_thr)) continue;
+        const Vec3 X = { (double)pts[3*i], (double)pts[3*i+1], (double)pts[3*i+2] };
+        if (!std::isfinite(X[0]) || !std::isfinite(X[1]) || !std::isfinite(X[2])) continue;
+        const Vec3 r0 = sub(X, C0), r1 = sub(X, C1);
+        const double n0 = nrm(r0), n1 = nrm(r1);
+        if (n0 < 1e-9 || n1 < 1e-9) continue;
+        const double depth = dot(r0, ax);
+        if (depth > 0) depths.push_back(depth);
+        double c = dot(r0, r1) / (n0 * n1);
+        c = c < -1.0 ? -1.0 : (c > 1.0 ? 1.0 : c);
+        angles.push_back(std::acos(c) * 180.0 / PI);
+    }
+    auto median = [](std::vector<double> & v) -> double {
+        if (v.empty()) return 0.0;
+        const size_t m = v.size() / 2;
+        std::nth_element(v.begin(), v.begin() + m, v.end());
+        return v[m];
+    };
+    px.median_depth = median(depths);
+    px.tri_angle_deg = median(angles);
+    px.baseline_over_depth = (px.median_depth > 1e-9) ? px.baseline / px.median_depth : 0.0;
+    px.n_points = (int) angles.size();
+    return px;
+}
+
+Parallax pair_parallax(const std::vector<const float *> & points,
+                       const std::vector<const float *> & opacities,
+                       int H, int W, double opacity_threshold, double focal) {
+    Parallax px{};
+    if (points.size() < 2) return px;
+    PoseResult pr = estimate_poses(points, opacities, H, W, opacity_threshold,
+                                   focal, 100, /*normalize=*/false, /*seed=*/0);
+    if (pr.cam2world.size() < 2) return px;
+    const Mat4 & c0 = pr.cam2world[0];
+    const Mat4 & c1 = pr.cam2world[1];
+    const Vec3 C0 = { c0(0,3), c0(1,3), c0(2,3) };
+    const Vec3 C1 = { c1(0,3), c1(1,3), c1(2,3) };
+    const Vec3 axis0 = { c0(0,2), c0(1,2), c0(2,2) };   // camera +z (optical axis) in world
+    px = parallax_stats(C0, C1, axis0, points[0],
+                        opacities.empty() ? nullptr : opacities[0], H * W, opacity_threshold);
+    px.focal = pr.focal;
+    return px;
+}
+
 // ---- accumulation ---------------------------------------------------------
 
 namespace {
diff --git a/src/pose.h b/src/pose.h
index 3e50390..ba82a07 100644
--- a/src/pose.h
+++ b/src/pose.h
@@ -110,6 +110,32 @@ PoseResult estimate_poses(const std::vector<const float *> & points,
                           double focal = -1.0, int pnp_iter = 100,
                           bool normalize = false, uint64_t seed = 0);
 
+// ---- after-inference parallax (depth conditioning of a recovered pair) -------
+// Quantifies how well a 2-view pair constrains depth, from the model's OWN
+// recovered geometry. All angle fields are scale-invariant; B/Z is consistent
+// because cameras and points share the (un-normalized) view-0 frame.
+struct Parallax {
+    double tri_angle_deg;       // median triangulation angle over confident points
+    double lateral_angle_deg;   // baseline angle off view-0 optical axis (0=dolly, 90=strafe)
+    double baseline_over_depth; // ||C1-C0|| / median point depth from camera 0
+    double baseline;
+    double median_depth;
+    double focal;               // estimated (or supplied) focal, px
+    int    n_points;            // confident points used for the medians
+};
+
+// Pure geometry core (no PnP) — directly testable against hand-computed angles.
+// pts: 3*N view-0-frame points; mask: per-point weight (>thr kept), or nullptr.
+// axis0 need not be unit; it is normalized internally.
+Parallax parallax_stats(const Vec3 & C0, const Vec3 & C1, const Vec3 & axis0,
+                        const float * pts, const float * mask, int N, double mask_thr);
+
+// Full path: recover the pair's cameras (normalize=false), then parallax_stats.
+// points/opacities: 2 views (view-0-frame xyz + activated opacity), as estimate_poses.
+Parallax pair_parallax(const std::vector<const float *> & points,
+                       const std::vector<const float *> & opacities,
+                       int H, int W, double opacity_threshold = 0.05, double focal = -1.0);
+
 // ---- accumulation: chain successive runs into one world (mirrors accumulate.py)
 // One accumulated 3D gaussian in the global frame, tagged with the source frame
 // it came from (consensus fusion needs the frame id). Carries the full anisotropic
diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp
index 6b2b021..a4816f5 100644
--- a/tests/test_pose.cpp
+++ b/tests/test_pose.cpp
@@ -719,8 +719,39 @@ static void test_accumulate_channels() {
     check("every gaussian channel survives run-0 accumulation", ok);
 }
 
+// Golden: parallax_stats on synthetic cameras with hand-computed angles. A
+// lateral (strafe) baseline yields a large triangulation/lateral angle; a pure
+// forward (dolly) baseline of the SAME length yields ~0 — the whole point of the
+// metric (it measures depth-resolving motion, not raw camera displacement).
+static void test_parallax_geometry() {
+    std::printf("test_parallax_geometry\n");
+    const double DEG = 180.0 / 3.14159265358979323846;
+    const Vec3 axis0 = { 0, 0, 1 };                 // view-0 looks down +z
+    // points at depth 1 (a few, all same depth -> clean medians)
+    const int N = 5;
+    std::vector<float> pts(3 * N);
+    for (int i = 0; i < N; i++) { pts[3*i]=0.0f; pts[3*i+1]=0.0f; pts[3*i+2]=1.0f; }
+
+    // (1) pure strafe: baseline 0.1 along x, perpendicular to the optical axis
+    Parallax s = parallax_stats({0,0,0}, {0.1,0,0}, axis0, pts.data(), nullptr, N, 0.0);
+    const double tri_expect = std::atan(0.1) * DEG;            // 5.7106 deg
+    char b1[96]; std::snprintf(b1, sizeof b1, "strafe lateral=%.3f tri=%.3f (expect 90, %.3f)", s.lateral_angle_deg, s.tri_angle_deg, tri_expect);
+    check("strafe -> lateral angle ~90 deg", std::fabs(s.lateral_angle_deg - 90.0) < 1e-3, b1);
+    check("strafe -> tri angle = atan(B/Z)", std::fabs(s.tri_angle_deg - tri_expect) < 1e-3, b1);
+    check("strafe -> B/Z = 0.1", std::fabs(s.baseline_over_depth - 0.1) < 1e-4);
+    check("strafe -> median depth = 1", std::fabs(s.median_depth - 1.0) < 1e-4);
+
+    // (2) pure dolly: SAME baseline length 0.1 but along the optical axis
+    Parallax d = parallax_stats({0,0,0}, {0,0,0.1}, axis0, pts.data(), nullptr, N, 0.0);
+    char b2[96]; std::snprintf(b2, sizeof b2, "dolly lateral=%.3f tri=%.3f (expect ~0)", d.lateral_angle_deg, d.tri_angle_deg);
+    check("dolly -> lateral angle ~0 deg", std::fabs(d.lateral_angle_deg) < 1e-3, b2);
+    check("dolly -> tri angle ~0 deg (no parallax)", std::fabs(d.tri_angle_deg) < 1e-3, b2);
+    check("dolly baseline equals strafe baseline", std::fabs(d.baseline - s.baseline) < 1e-6);
+}
+
 int main() {
     test_similarity_roundtrip();
+    test_parallax_geometry();
     test_scale_detection();
     test_nonlinear_detection();
     test_outlier_robustness();
diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp
index b6a69c4..db8bbf8 100644
--- a/tools/free_splatter-cli.cpp
+++ b/tools/free_splatter-cli.cpp
@@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
     const char * model = nullptr, * splat_prefix = nullptr;
     float opac_thr = 5e-3f;
     long  max_splats = 0;
-    bool  accumulate = false, fuse = false;
+    bool  accumulate = false, fuse = false, parallax = false;
     int   fuse_mode = 1;                         // 0 averaged, 1 kept, 2 best-frame
     float voxel = 0.02f, splat_scale = 1.0f;   // multiplier on the predicted gaussian radii
     int   fuse_k = 2;
@@ -159,6 +159,7 @@ int main(int argc, char ** argv) {
         else if (a == "--opacity-threshold" && i+1<argc) opac_thr = (float) atof(argv[++i]);
         else if (a == "--max-splats" && i+1 < argc)     max_splats = atol(argv[++i]);
         else if (a == "--accumulate")                   accumulate = true;
+        else if (a == "--parallax")                     parallax = true;
         else if (a == "--splat-prefix" && i+1 < argc)   splat_prefix = argv[++i];
         else if (a == "--fuse")                         fuse = true;
         else if (a == "--voxel" && i+1 < argc)          voxel = (float) atof(argv[++i]);
@@ -192,6 +193,44 @@ int main(int argc, char ** argv) {
                 geo.image_width, geo.image_height, geo.in_channels, geo.gaussian_channels);
     if (inputs.empty()) { free_splatter_free(ctx); return 0; }
 
+    // ---- parallax mode: depth-conditioning of a pair (after-inference) --------
+    // Input: exactly two images (engine runs the pair) or one [2,H,W,gc] .f32
+    // gaussian-output dump. Prints the recovered-geometry parallax stats.
+    if (parallax) {
+        const int gc = geo.gaussian_channels;
+        const size_t pair_floats = (size_t) 2 * geo.image_height * geo.image_width * gc;
+        std::vector<float> gbuf;
+        if (inputs.size() == 1 && ends_with(inputs[0], ".f32")) {
+            std::ifstream f(inputs[0], std::ios::binary | std::ios::ate);
+            if (!f) { std::fprintf(stderr, "cannot open %s\n", inputs[0].c_str()); free_splatter_free(ctx); return 1; }
+            const std::streamsize bytes = f.tellg(); f.seekg(0);
+            if ((size_t)(bytes / sizeof(float)) != pair_floats) {
+                std::fprintf(stderr, "dump %s: %zu floats, expected 2*%d*%d*%d\n", inputs[0].c_str(),
+                             (size_t)(bytes/sizeof(float)), geo.image_height, geo.image_width, gc);
+                free_splatter_free(ctx); return 1; }
+            gbuf.resize(pair_floats); f.read((char *) gbuf.data(), bytes);
+        } else if (inputs.size() == 2) {
+            std::vector<float> img;
+            for (const std::string & p : inputs)
+                if (!load_image_chw(p.c_str(), geo.image_width, img)) { free_splatter_free(ctx); return 1; }
+            float * out = nullptr; size_t n_out = 0;
+            if (free_splatter_run(ctx, img.data(), 2, geo.image_height, geo.image_width, &out, &n_out) != 0) {
+                std::fprintf(stderr, "run failed: %s\n", free_splatter_last_error(ctx)); free_splatter_free(ctx); return 1; }
+            gbuf.assign(out, out + n_out); free_splatter_buf_free(out);
+        } else {
+            std::fprintf(stderr, "--parallax needs exactly 2 images or one .f32 pair dump\n");
+            free_splatter_free(ctx); return 2;
+        }
+        free_splatter_parallax px;
+        if (free_splatter_pair_parallax(gbuf.data(), 2, geo.image_height, geo.image_width, gc, opac_thr, &px) != 0) {
+            std::fprintf(stderr, "parallax failed\n"); free_splatter_free(ctx); return 1; }
+        std::printf("parallax: tri_angle=%.3f deg  lateral_angle=%.2f deg  B/Z=%.4f  baseline=%.4f  median_depth=%.4f  focal=%.1f  npts=%d\n",
+                    px.tri_angle_deg, px.lateral_angle_deg, px.baseline_over_depth,
+                    px.baseline, px.median_depth, px.focal, px.n_points);
+        free_splatter_free(ctx);
+        return 0;
+    }
+
     // ---- accumulate mode: chain a photo stream into one world -----------------
     // Two input forms: a stream of IMAGES (the engine runs on each consecutive
     // pair), or pre-computed `.f32` pair dumps (each a [2,H,W,gc] engine output —

From 44a6f67171916f8ddcc9cfea343377e18927e73b Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sun, 28 Jun 2026 09:02:45 +0100
Subject: [PATCH 27/33] cli: --min-parallax keyframe gate for accumulate

Fold a candidate frame into the world only if its triangulation angle vs the last
KEPT frame clears DEG degrees (free_splatter_pair_parallax) -- else its depth is
ill-conditioned and the model would invent it. Implemented as keyframe selection:
the next candidate is re-paired against the last kept frame (not the immediate
predecessor), so skipping a frame re-anchors instead of breaking the Sim(3) chain.
Threshold 0 degenerates to consecutive pairs (no behavior change). Image mode only
(re-pairing needs the engine); .f32 dump mode warns and accumulates all, since
fixed pairs can't be re-anchored.

Verified on the re10k demo frames: (f0000,f0020)=15.6 deg kept, (f0020,f0040)=7.0
skipped, then f0060 re-paired against f0020 = 15.7 deg kept -- the degenerate
middle frame dropped, the world kept to well-conditioned views only.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tools/free_splatter-cli.cpp | 95 +++++++++++++++++++++++++------------
 1 file changed, 65 insertions(+), 30 deletions(-)

diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp
index db8bbf8..f6d3c14 100644
--- a/tools/free_splatter-cli.cpp
+++ b/tools/free_splatter-cli.cpp
@@ -130,11 +130,17 @@ static int usage(const char * a0) {
         "\n"
         "accumulate mode (>=2 images -> one world from the photo stream):\n"
         "  %s --accumulate [--splat-prefix P] [--fuse] [--voxel V] [--fuse-k K]\n"
-        "     [--splat-scale S] [--max-splats N] MODEL.gguf IMG0 IMG1 IMG2 ...\n"
+        "     [--min-parallax DEG] [--splat-scale S] [--max-splats N] MODEL.gguf IMG0 IMG1 ...\n"
         "  runs the engine on each consecutive pair, chains the runs (Sim(3)) into\n"
         "  one accumulating cloud; writes P_<nframes>.splat after each pair (the\n"
-        "  evolving reconstruction) and, with --fuse, a consensus-fused P_fused.splat.\n",
-        a0, a0);
+        "  evolving reconstruction) and, with --fuse, a consensus-fused P_fused.splat.\n"
+        "  --min-parallax gates by keyframe: a candidate frame is folded in only if its\n"
+        "  triangulation angle vs the last kept frame is >= DEG (else its depth is\n"
+        "  ill-conditioned); image mode only.\n"
+        "\n"
+        "parallax (depth conditioning of one pair, after-inference):\n"
+        "  %s --parallax MODEL.gguf (IMG0 IMG1 | PAIR.f32)\n",
+        a0, a0, a0);
     return 2;
 }
 
@@ -148,6 +154,7 @@ int main(int argc, char ** argv) {
     float voxel = 0.02f, splat_scale = 1.0f;   // multiplier on the predicted gaussian radii
     int   fuse_k = 2;
     bool  refine = false; int refine_iters = 8; float refine_voxel = 0.03f, refine_alpha = 0.5f;  // geometric de-ghost
+    float min_parallax = 0.0f;                   // deg; >0 gates accumulate by keyframe parallax
     std::vector<std::string> inputs;
 
     for (int i = 1; i < argc; i++) {
@@ -170,6 +177,7 @@ int main(int argc, char ** argv) {
         else if (a == "--refine-iters" && i+1 < argc)   refine_iters = atoi(argv[++i]);
         else if (a == "--refine-voxel" && i+1 < argc)   refine_voxel = (float) atof(argv[++i]);
         else if (a == "--refine-alpha" && i+1 < argc)   refine_alpha = (float) atof(argv[++i]);
+        else if (a == "--min-parallax" && i+1 < argc)   min_parallax = (float) atof(argv[++i]);
         else if (a == "--splat-scale" && i+1 < argc)    splat_scale = (float) atof(argv[++i]);
         else if (a == "-h" || a == "--help")            return usage(argv[0]);
         else if (!model)                                model = argv[i];
@@ -257,11 +265,26 @@ int main(int argc, char ** argv) {
         free_splatter_accumulator * acc = free_splatter_accumulator_new(sz, sz, opac_thr);
         if (!acc) { std::fprintf(stderr, "accumulator alloc failed\n"); free_splatter_free(ctx); return 1; }
 
-        for (size_t k = 0; k < npairs; k++) {
-            const float * g = nullptr;
-            std::vector<float> dumpbuf;
-            float * runout = nullptr;
-            if (dump_mode) {
+        // emit the accumulated cloud after a pair is folded in (optional de-ghost)
+        auto emit_step = [&](int nframes) {
+            if (!splat_prefix) return;
+            free_splatter_point * cloud = nullptr; size_t nc = 0;
+            free_splatter_accumulator_cloud(acc, &cloud, &nc);
+            if (refine && nframes >= 2)               // geometric de-ghost a copy (internal untouched)
+                free_splatter_refine_cloud(cloud, nc, refine_voxel, refine_iters, refine_alpha);
+            char path[1024];
+            std::snprintf(path, sizeof path, "%s_%d.splat", splat_prefix, nframes);
+            write_cloud_splat(cloud, nc, (size_t) max_splats, splat_scale, path);
+            free_splatter_buf_free(cloud);
+        };
+
+        if (dump_mode) {
+            // fixed pre-computed pairs: cannot re-anchor, so the parallax gate
+            // (which works by re-pairing keyframes) does not apply here.
+            if (min_parallax > 0.0f)
+                std::fprintf(stderr, "note: --min-parallax re-pairs keyframes; ignored for "
+                                     ".f32 dumps (fixed pairs) -- accumulating all.\n");
+            for (size_t k = 0; k < npairs; k++) {
                 std::ifstream f(inputs[k], std::ios::binary | std::ios::ate);
                 if (!f) { std::fprintf(stderr, "cannot open %s\n", inputs[k].c_str()); free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; }
                 const std::streamsize bytes = f.tellg(); f.seekg(0);
@@ -269,32 +292,44 @@ int main(int argc, char ** argv) {
                     std::fprintf(stderr, "dump %s: %zu floats, expected 2*%d*%d*%d\n", inputs[k].c_str(),
                                  (size_t)(bytes/sizeof(float)), sz, sz, gc);
                     free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; }
-                dumpbuf.resize(pair_floats); f.read((char *) dumpbuf.data(), bytes);
-                g = dumpbuf.data();
-            } else {
+                std::vector<float> dumpbuf(pair_floats); f.read((char *) dumpbuf.data(), bytes);
+                free_splatter_accumulator_add_pair(acc, dumpbuf.data(), gc);
+                const int nframes = free_splatter_accumulator_frame_count(acc);
+                std::printf("pair %zu -> %d frames\n", k, nframes);
+                emit_step(nframes);
+            }
+        } else {
+            // image mode: keyframe gating. Pair the next candidate against the last
+            // KEPT frame; fold it in only if its parallax clears --min-parallax
+            // (else its depth is ill-conditioned). Threshold 0 -> every consecutive
+            // pair, identical to the ungated path.
+            size_t last_kept = 0; int skipped = 0;
+            for (size_t j = 1; j < frames.size(); j++) {
                 std::vector<float> pair(2 * per_view);
-                std::memcpy(&pair[0],        frames[k].data(),   per_view * sizeof(float));
-                std::memcpy(&pair[per_view], frames[k+1].data(), per_view * sizeof(float));
-                size_t ng = 0;
+                std::memcpy(&pair[0],        frames[last_kept].data(), per_view * sizeof(float));
+                std::memcpy(&pair[per_view], frames[j].data(),         per_view * sizeof(float));
+                float * runout = nullptr; size_t ng = 0;
                 if (free_splatter_run(ctx, pair.data(), 2, sz, sz, &runout, &ng) != 0) {
-                    std::fprintf(stderr, "run pair %zu failed: %s\n", k, free_splatter_last_error(ctx));
+                    std::fprintf(stderr, "run pair (%zu,%zu) failed: %s\n", last_kept, j, free_splatter_last_error(ctx));
                     free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; }
-                g = runout;
-            }
-            free_splatter_accumulator_add_pair(acc, g, gc);
-            if (runout) free_splatter_buf_free(runout);
-            const int nframes = free_splatter_accumulator_frame_count(acc);
-            std::printf("pair %zu -> %d frames\n", k, nframes);
-            if (splat_prefix) {
-                free_splatter_point * cloud = nullptr; size_t nc = 0;
-                free_splatter_accumulator_cloud(acc, &cloud, &nc);
-                if (refine && nframes >= 2)            // geometric de-ghost a copy (internal untouched)
-                    free_splatter_refine_cloud(cloud, nc, refine_voxel, refine_iters, refine_alpha);
-                char path[1024];
-                std::snprintf(path, sizeof path, "%s_%d.splat", splat_prefix, nframes);
-                write_cloud_splat(cloud, nc, (size_t) max_splats, splat_scale, path);
-                free_splatter_buf_free(cloud);
+                if (min_parallax > 0.0f) {
+                    free_splatter_parallax px;
+                    free_splatter_pair_parallax(runout, 2, sz, sz, gc, opac_thr, &px);
+                    if (px.tri_angle_deg < min_parallax) {
+                        std::printf("skip frame %zu: parallax %.1f deg < %.1f (vs kept frame %zu)\n",
+                                    j, px.tri_angle_deg, min_parallax, last_kept);
+                        free_splatter_buf_free(runout); skipped++; continue;
+                    }
+                    std::printf("keep frame %zu: parallax %.1f deg (vs kept frame %zu)\n", j, px.tri_angle_deg, last_kept);
+                }
+                free_splatter_accumulator_add_pair(acc, runout, gc);
+                free_splatter_buf_free(runout);
+                const int nframes = free_splatter_accumulator_frame_count(acc);
+                std::printf("pair (%zu,%zu) -> %d frames\n", last_kept, j, nframes);
+                emit_step(nframes);
+                last_kept = j;
             }
+            if (skipped) std::printf("gated: skipped %d low-parallax frame(s)\n", skipped);
         }
         if (fuse && splat_prefix) {
             if (refine) free_splatter_accumulator_refine(acc, refine_voxel, refine_iters, refine_alpha);

From 6b5942fc6dbc2bf356d477cbcb666d5bbcfb82aa Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sun, 28 Jun 2026 09:47:35 +0100
Subject: [PATCH 28/33] demo: wire --min-parallax gate into the bake (+ honest
 de-ghost docs)

make_accumulate_demo.sh now passes --min-parallax (MIN_PARALLAX, default 8 deg;
0 disables) and --fuse-mode best, and builds the manifest from the frames the gate
actually KEPT (parsing the keep/skip log): frame 0 is the anchor, each "keep frame
J" adds input frame J, and step acc_n shows that step's n kept thumbnails. So you
can feed a long dense frame stream and the gate curates the well-conditioned subset
instead of folding in frames whose depth the model would invent.

Verified end-to-end on the 7-frame re10k loop: gate kept f0000/f0020/f0060 and
dropped f0040 (7 deg), f0080 (2.4), f0100 (4.8), f0120 (6.6) -- the walkthrough
stops translating laterally after f0060, so only three views carry depth.

web/README documents MIN_PARALLAX (after-inference angle, over-reports -> keep well
above COLMAP's 1-2 deg; parallax_ref.py is the independent cross-check) and folds
in the earlier honest de-ghost rewrite (best-frame selection; --refine off, why).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 scripts/make_accumulate_demo.sh | 47 ++++++++++++++++++++++-----------
 web/README.md                   | 43 +++++++++++++++++++-----------
 2 files changed, 60 insertions(+), 30 deletions(-)

diff --git a/scripts/make_accumulate_demo.sh b/scripts/make_accumulate_demo.sh
index 456a43d..adb7fc6 100755
--- a/scripts/make_accumulate_demo.sh
+++ b/scripts/make_accumulate_demo.sh
@@ -29,29 +29,46 @@ for i in "${!FRAMES[@]}"; do
 done
 
 # 2) accumulate: one engine pass over the stream writes acc_2.splat, acc_3.splat,
-#    ... (the cloud after each photo is folded in) + acc_fused.splat (the
-#    consensus surface: voxels seen by >= K frames, single-view floaters removed).
-#    --refine geometrically de-ghosts each step (pulls overlapping frames together);
-#    set REFINE=0 to disable.
-RFLAG=; [ "${REFINE:-1}" = 1 ] && RFLAG=--refine
-"$CLI" --device "$DEVICE" --accumulate $RFLAG --fuse --splat-prefix "$OUT/acc" \
-  --max-splats "$MAXSPLATS" "$MODEL" "${FRAMES[@]}"
+#    ... (the cloud after each KEPT photo) + a best-frame consensus acc_fused.splat.
+#    --min-parallax gates by keyframe: a candidate frame is folded in only if its
+#    triangulation angle vs the last KEPT frame clears MIN_PARALLAX deg (else its
+#    depth is ill-conditioned and the model would invent it); skipped frames re-
+#    anchor against the last kept one. This is the after-inference angle, which the
+#    model over-reports, so keep it well above COLMAP's 1-2 deg. MIN_PARALLAX=0
+#    folds in every frame. --refine (OFF) is the gaussian averaging pass (blurs).
+RFLAG=; [ "${REFINE:-0}" = 1 ] && RFLAG=--refine
+MINPAR=${MIN_PARALLAX:-8}
+PFLAG=; [ "$MINPAR" != "0" ] && PFLAG="--min-parallax $MINPAR"
+LOG="$OUT/accumulate.log"
+"$CLI" --device "$DEVICE" --accumulate $RFLAG $PFLAG --fuse --fuse-mode best \
+  --splat-prefix "$OUT/acc" --max-splats "$MAXSPLATS" "$MODEL" "${FRAMES[@]}" | tee "$LOG"
 
-# 3) manifest.json: one step per added photo (acc_n.splat + the n thumbnails, the
-#    last being the one just added), then a final consensus-fused step.
-n=${#FRAMES[@]}
-allimgs=""
-for ((j=0;j<n;j++)); do allimgs+="\"view_$j.jpg\""; [ $j -lt $((n-1)) ] && allimgs+=", "; done
+# 3) manifest.json: one step per KEPT photo (acc_n.splat + its n kept thumbnails),
+#    then the consensus-fused step. With gating only the frames that carried enough
+#    parallax are kept: frame 0 is the anchor and each "keep frame J" line adds
+#    input frame J. Ungated (MIN_PARALLAX=0), every frame is kept.
+if [ -z "$PFLAG" ]; then
+  kept=(); for i in "${!FRAMES[@]}"; do kept+=("$i"); done
+else
+  kept=(0)
+  while read -r j; do kept+=("$j"); done < <(grep -oP '^keep frame \K[0-9]+' "$LOG")
+fi
+nkept=${#kept[@]}
+[ "$nkept" -ge 2 ] || { echo "only $nkept frame(s) kept (MIN_PARALLAX=$MINPAR too high?)" >&2; exit 1; }
+thumb() { printf '"view_%s.jpg"' "$1"; }       # thumbnail referenced by original index
 {
   echo '{ "steps": ['
-  for ((k=2;k<=n;k++)); do
+  for ((k=2;k<=nkept;k++)); do
     imgs=""
-    for ((j=0;j<k;j++)); do imgs+="\"view_$j.jpg\""; [ $j -lt $((k-1)) ] && imgs+=", "; done
+    for ((j=0;j<k;j++)); do imgs+="$(thumb "${kept[$j]}")"; [ $j -lt $((k-1)) ] && imgs+=", "; done
     printf '    {"splat":"acc_%d.splat","images":[%s],"n":%d},\n' "$k" "$imgs" "$k"
   done
-  printf '    {"splat":"acc_fused.splat","images":[%s],"n":%d,"label":"consensus-fused — single-view floaters removed"}\n' "$allimgs" "$n"
+  allimgs=""
+  for ((j=0;j<nkept;j++)); do allimgs+="$(thumb "${kept[$j]}")"; [ $j -lt $((nkept-1)) ] && allimgs+=", "; done
+  printf '    {"splat":"acc_fused.splat","images":[%s],"n":%d,"label":"consensus-fused — best-frame, parallax-gated"}\n' "$allimgs" "$nkept"
   echo '  ] }'
 } > "$OUT/manifest.json"
+echo "kept frames (input indices): ${kept[*]}  of ${#FRAMES[@]}"
 
 # 4) the viewer, self-contained next to its assets
 cp "$ROOT/web/accumulate.html" "$OUT/index.html"
diff --git a/web/README.md b/web/README.md
index f974b95..8884613 100644
--- a/web/README.md
+++ b/web/README.md
@@ -49,27 +49,40 @@ consensus voxel (dense); `best` keeps only the most-confident frame per voxel
 (dense AND de-ghosted); `averaged` collapses each voxel to one denoised point
 (cleaner but sparse — only worthwhile with many overlapping frames).
 
-**De-ghosting.** Pairwise chaining leaves per-object misregistration, so
-overlapping frames show doubled objects. `--refine` (on by default in the bake;
-`REFINE=0` to disable) runs a gaussian-level consensus refinement that
-non-rigidly pulls each point toward the agreement of the other frames — ~5× less
-ghosting on real clips. (A rigid per-frame pose bundle-adjust does **not** help
-here: the residual is per-object reconstruction disagreement, not camera-pose
-error.)
+**De-ghosting.** Pairwise chaining overlays each frame's reconstruction in world
+space, so where two frames disagree about a surface you see doubled ("ghosted")
+copies. The fused step removes them by **best-frame selection** (`--fuse-mode
+best`): within each voxel it keeps only the single most-confident frame's
+gaussians, so disagreeing copies aren't stacked — one frame represents each
+region. This de-ghosts by *selection* (no averaging → no blur), at the cost of
+visible seams where adjacent voxels pick different frames. The two artifact-free
+alternatives are more expensive: fixing the registration (only helps when the
+disagreement is a *rigid* pose drift — usually it isn't, it's per-pair stereo
+depth disagreement baked into the network output, which no rigid alignment can
+undo) or photometric 3DGS re-optimization. A gaussian-level averaging pass
+(`--refine`, **off** by default) exists but is **not recommended**: it averages
+points that already share a voxel — blurring the in-register surface — while
+leaving ghosts that are more than a voxel apart untouched, so it trades sharpness
+for no real de-ghosting.
 
 **Pick frames with lateral baseline.** Two-view depth needs the camera to
 *translate sideways* — a pure forward dolly (the camera moving along the direction
 it faces) gives near-zero parallax and reconstructs blurry. Frames a few apart
 from an orbiting / strafing clip work far better than tightly-spaced frames of a
 walk-forward clip. Rule of thumb: the per-pair lateral baseline should be a few %
-of scene depth or more (`scripts/` had a `baseline` probe in git history).
-
-**Pick frames with lateral baseline.** Two-view depth needs the camera to
-*translate sideways* — a pure forward dolly (the camera moving along the direction
-it faces) gives near-zero parallax and reconstructs blurry. Frames a few apart
-from an orbiting / strafing clip work far better than tightly-spaced frames of a
-walk-forward clip. Rule of thumb: the per-pair lateral baseline should be a few %
-of scene depth or more (`scripts/` had a `baseline` probe in git history).
+of scene depth or more.
+
+**…or let the gate pick them (`MIN_PARALLAX`).** The bake passes
+`--min-parallax DEG` (default 8°, `MIN_PARALLAX=0` to disable): a candidate frame
+is folded in only if its triangulation angle vs the last *kept* frame clears the
+threshold — otherwise its depth is ill-conditioned and the model invents it.
+Skipped frames re-anchor against the last kept one (keyframe selection), so you can
+feed a long, dense frame stream and let the gate curate the well-conditioned
+subset. The threshold is the engine's *after-inference* angle
+(`free_splatter-cli --parallax …`), which over-reports, so keep it well above
+COLMAP's 1–2°. `scripts/parallax_ref.py` (cv2, dev-shell only) is the independent
+geometric cross-check — agreement on a pair validates it; a large gap (geometric
+angle tiny, model angle confident) flags the model hallucinating depth.
 
 ## Sanity-checking the data separately
 

From 65bde62e0902642bb8bb141b08e4d4f2c7f39af9 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sun, 28 Jun 2026 10:45:22 +0100
Subject: [PATCH 29/33] demo: auto-bake videos from demo-vids/
 (scripts/demo/bake-vids.sh)

Drop a clip under demo-vids/<name>/ (mirroring the demo-photos/ convention) and
this samples it to frames, lets the --min-parallax gate curate the well-conditioned
keyframes, and bakes a growing-reconstruction demo (make_accumulate_demo.sh) to
.cache/demo/<name>/. Time-lapse / slow-pan clips are the sweet spot: the gate drops
the near-duplicate tight frames and keeps ~10-14deg-parallax steps.

Verified on two clips: flower-bed (81 frames -> 21 sampled -> 11 kept, a clean
10-step growing flower bed) and office-corner (9 -> 5 kept, 4 steps). gitignore the
demo-vids/ drop folder like demo-photos/ (user media, not committed).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .gitignore                |  3 +-
 scripts/demo/bake-vids.sh | 65 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100755 scripts/demo/bake-vids.sh

diff --git a/.gitignore b/.gitignore
index 3c76e18..15c3e90 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,5 +53,6 @@ timeout-*
 /server/fsserver
 /server/server
 
-# Demo-video drop folder (user photos; baked into .cache/demo)
+# Demo drop folders (user photos / videos; baked into .cache/demo)
 /demo-photos/
+/demo-vids/
diff --git a/scripts/demo/bake-vids.sh b/scripts/demo/bake-vids.sh
new file mode 100755
index 0000000..4c8635e
--- /dev/null
+++ b/scripts/demo/bake-vids.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+# Auto-discover videos and bake each into an accumulating-reconstruction demo.
+#
+# Mirrors the drop-a-folder convenience of demo-photos/ (scripts/demo/bake.sh),
+# but for VIDEO: drop one folder per scene under demo-vids/ with a clip inside —
+#   demo-vids/office-corner/clip.mp4
+#   demo-vids/flower-bed/clip.mp4
+# — then run this. Each clip is sampled into frames, the parallax gate curates the
+# well-conditioned subset (scripts/make_accumulate_demo.sh --min-parallax), and the
+# result is a self-contained growing-reconstruction demo (accumulate.html) under
+# .cache/demo/<name>/.
+#
+#   nix develop -c scripts/demo/bake-vids.sh                 # bake every demo-vids/ folder
+#   MIN_PARALLAX=6 MAXFRAMES=30 nix develop -c scripts/demo/bake-vids.sh
+#
+# Env: VIDS (demo-vids/), OUTBASE (.cache/demo), MAXFRAMES (frames sampled per
+# clip, default 24), MIN_PARALLAX (gate degrees, default 8; 0 disables).
+set -euo pipefail
+cd "$(dirname "$0")/../.."
+ROOT=$(pwd)
+VIDS=${VIDS:-$ROOT/demo-vids}
+OUTBASE=${OUTBASE:-$ROOT/.cache/demo}
+MAXFRAMES=${MAXFRAMES:-24}
+export MIN_PARALLAX=${MIN_PARALLAX:-8}
+
+command -v ffmpeg >/dev/null || { echo "ffmpeg required" >&2; exit 1; }
+mkdir -p "$VIDS"
+[ -f "$VIDS/README.txt" ] || cat >"$VIDS/README.txt" <<'TXT'
+Drop one folder per scene here with a video clip inside (a time-lapse / slow pan
+works best), then run scripts/demo/bake-vids.sh:
+  office-corner/  clip.mp4
+  flower-bed/     clip.mp4
+Each clip is sampled to frames, the parallax gate keeps the well-conditioned ones,
+and the result is a growing-reconstruction demo under .cache/demo/<name>/.
+TXT
+
+shopt -s nullglob
+baked=()
+for D in "$VIDS"/*/; do
+  D=${D%/}; name=$(basename "$D")
+  vid=$(find "$D" -maxdepth 1 -type f \( -iname '*.mp4' -o -iname '*.mov' -o -iname '*.webm' -o -iname '*.mkv' -o -iname '*.m4v' \) | sort | head -1)
+  [ -n "$vid" ] || { echo "  no video in $name/ — skipping"; continue; }
+  safe=$(printf '%s' "$name" | tr -c 'A-Za-z0-9_-' '_')
+
+  # Sample frames: decode all, then take an even stride down to ~MAXFRAMES. (Time-
+  # lapse clips are short, so decoding all is cheap and avoids frame-count guessing.)
+  fdir="$OUTBASE/_frames/$safe"; rm -rf "$fdir"; mkdir -p "$fdir"
+  ffmpeg -y -loglevel error -i "$vid" "$fdir/all%04d.png"
+  mapfile -t ALL < <(ls "$fdir"/all*.png | sort)
+  nb=${#ALL[@]}
+  [ "$nb" -ge 2 ] || { echo "  $name: <2 frames decoded — skipping"; continue; }
+  stride=$(( (nb + MAXFRAMES - 1) / MAXFRAMES )); [ "$stride" -lt 1 ] && stride=1
+  FR=(); for ((i=0;i<nb;i+=stride)); do FR+=("${ALL[$i]}"); done
+  echo "== $name: $(basename "$vid")  ($nb frames -> ${#FR[@]} sampled, stride $stride, gate ${MIN_PARALLAX}deg) =="
+
+  bash scripts/make_accumulate_demo.sh "$OUTBASE/$safe" "${FR[@]}"
+  baked+=("$safe")
+done
+
+if [ "${#baked[@]}" -eq 0 ]; then
+  echo "no video folders found under $VIDS (drop demo-vids/<name>/<clip>.mp4)"; exit 0
+fi
+echo
+echo "baked ${#baked[@]} demo(s): ${baked[*]}"
+echo "serve one with:  python3 -m http.server -d $OUTBASE/<name> 8080   # open http://localhost:8080/"

From e6cf432d54cd0b3c28225eaac310242d8ba0dafd Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sun, 28 Jun 2026 11:29:56 +0100
Subject: [PATCH 30/33] device: default to GPU/Vulkan, fail-closed; CPU is
 explicit opt-in
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A user could silently land on the ~50x slower CPU path. Now the library default
device is "vulkan" (src/options.h) and an empty/unset device resolves to vulkan
(free_splatter.cpp), so a caller who never sets one fails-closed if no GPU is
present rather than running on CPU. backend.cpp already errors for an explicit
vulkan request with no device — that path is now the default.

- CLI forwards the library default (vulkan); on a load failure with no explicit
  --device it prints guidance ("pass --device cpu to run on CPU"). Usage updated.
- bench: device label no longer assumes cpu when unset.
- make_accumulate_demo.sh: DEVICE defaults to vulkan and resolves/builds the vulkan
  CLI (build/vulkan/bin/free_splatter-cli) like serve.sh builds the .so; DEVICE=cpu
  + the release CLI stays as the explicit CPU escape (headless/CI bakes).

Tests are unaffected (every test passes an explicit device; default FREE_SPLATTER_DEVICE
is cpu). Verified: CPU build with no --device fails-closed with guidance; --device
cpu runs; ctest -LE model green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 bench/free_splatter-bench.cpp   |  4 ++--
 scripts/make_accumulate_demo.sh | 16 ++++++++++++++--
 src/free_splatter.cpp           |  5 ++++-
 src/options.h                   |  2 +-
 tools/free_splatter-cli.cpp     |  9 ++++++++-
 5 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/bench/free_splatter-bench.cpp b/bench/free_splatter-bench.cpp
index 8f3b165..e505433 100644
--- a/bench/free_splatter-bench.cpp
+++ b/bench/free_splatter-bench.cpp
@@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
     }
 
     std::printf("model: %s  device=%s  %dx%d  in=%d  gc=%d  views=%d  S=%lld  load=%.1f ms\n",
-                model, device ? device : "cpu", geo.image_width, geo.image_height,
+                model, device ? device : "vulkan", geo.image_width, geo.image_height,
                 geo.in_channels, geo.gaussian_channels, views,
                 (long long) ((int64_t) views * (geo.image_height/8) * (geo.image_width/8)), load_ms);
 
@@ -124,7 +124,7 @@ int main(int argc, char ** argv) {
                 "(%.2f views/s, %.2f scenes/s)\n", mn, med, mean, mx, vps, 1000.0 / med);
     std::printf("RESULT engine device=%s views=%d gc=%d load_ms=%.1f min_ms=%.1f "
                 "median_ms=%.1f mean_ms=%.1f max_ms=%.1f views_per_s=%.3f\n",
-                device ? device : "cpu", views, geo.gaussian_channels, load_ms,
+                device ? device : "vulkan", views, geo.gaussian_channels, load_ms,
                 mn, med, mean, mx, vps);
     return 0;
 }
diff --git a/scripts/make_accumulate_demo.sh b/scripts/make_accumulate_demo.sh
index adb7fc6..3cc109e 100755
--- a/scripts/make_accumulate_demo.sh
+++ b/scripts/make_accumulate_demo.sh
@@ -13,8 +13,20 @@ cd "$(dirname "$0")/.."
 ROOT=$(pwd)
 MODEL=${MODEL:-.cache/freesplatter-scene-f16.gguf}
 MAXSPLATS=${MAXSPLATS:-500000}
-DEVICE=${DEVICE:-cpu}
-CLI=${CLI:-$ROOT/build/release/bin/free_splatter-cli}
+# GPU by default (CPU is ~50x slower); DEVICE=cpu is the explicit opt-in. The CLI
+# is resolved to the matching build, and the vulkan CLI is built if missing (it
+# isn't part of the default CPU build) — mirroring scripts/serve.sh for the .so.
+DEVICE=${DEVICE:-vulkan}
+if [ "$DEVICE" = cpu ]; then
+  CLI=${CLI:-$ROOT/build/release/bin/free_splatter-cli}
+else
+  CLI=${CLI:-$ROOT/build/vulkan/bin/free_splatter-cli}
+  if [ ! -x "$CLI" ]; then
+    echo "building vulkan CLI ($CLI) ..." >&2
+    cmake -S "$ROOT" -B "$ROOT/build/vulkan" -DFREE_SPLATTER_VULKAN=ON -DFREE_SPLATTER_BUILD_SHARED=ON >&2
+    cmake --build "$ROOT/build/vulkan" --target free_splatter-cli >&2
+  fi
+fi
 
 [ "$#" -ge 3 ] || { echo "usage: $0 OUT_DIR FRAME0 FRAME1 [FRAME2 ...]  (>=2 frames)" >&2; exit 1; }
 OUT=$1; shift
diff --git a/src/free_splatter.cpp b/src/free_splatter.cpp
index 7305d21..ab7462a 100644
--- a/src/free_splatter.cpp
+++ b/src/free_splatter.cpp
@@ -92,7 +92,10 @@ free_splatter_ctx * free_splatter_load(const char * gguf_path, const free_splatt
     if (!ctx) return nullptr;
     if (opts) ctx->opts = opts->o;
 
-    if (!ctx->m.load(gguf_path, ctx->opts.device.empty() ? "cpu" : ctx->opts.device,
+    // Default to GPU and fail-closed: an unset/empty device resolves to vulkan, so
+    // a caller who never set one can't silently land on the (~50x slower) CPU path.
+    // CPU is opt-in only (explicit "cpu"). backend.cpp errors if no GPU is present.
+    if (!ctx->m.load(gguf_path, ctx->opts.device.empty() ? "vulkan" : ctx->opts.device,
                      ctx->opts.n_threads)) {
         ctx->error = ctx->m.error;
     }
diff --git a/src/options.h b/src/options.h
index e531e29..4bf8d1f 100644
--- a/src/options.h
+++ b/src/options.h
@@ -7,7 +7,7 @@
 namespace free_splatter {
 
 struct options {
-    std::string device       = "cpu";  // "" | cpu | gpu | cuda | vulkan [:N]
+    std::string device       = "vulkan";  // "" | cpu | gpu | cuda | vulkan [:N]; default GPU, fail-closed
     int         n_threads    = 0;       // <= 0 => auto (CPU)
     std::string dump_taps_dir;          // empty => tap dumping disabled
 };
diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp
index f6d3c14..af96083 100644
--- a/tools/free_splatter-cli.cpp
+++ b/tools/free_splatter-cli.cpp
@@ -127,6 +127,8 @@ static int usage(const char * a0) {
         "usage: %s [--device DEV] [--splat OUT.splat] [--out OUT.f32]\n"
         "          [--opacity-threshold T] [--max-splats N] [--dump-taps DIR]\n"
         "          MODEL.gguf (IMAGES... | INPUT.f32)\n"
+        "  --device defaults to GPU/vulkan and FAILS if none is present; pass\n"
+        "  --device cpu to run on CPU explicitly.\n"
         "\n"
         "accumulate mode (>=2 images -> one world from the photo stream):\n"
         "  %s --accumulate [--splat-prefix P] [--fuse] [--voxel V] [--fuse-k K]\n"
@@ -192,7 +194,12 @@ int main(int argc, char ** argv) {
     free_splatter_options_free(opts);
     if (!ctx) { std::fprintf(stderr, "load: out of memory\n"); return 1; }
     if (const char * err = free_splatter_last_error(ctx)) {
-        std::fprintf(stderr, "load failed: %s\n", err); free_splatter_free(ctx); return 1;
+        std::fprintf(stderr, "load failed: %s\n", err);
+        // Default device is GPU/Vulkan (fail-closed): if none was requested
+        // explicitly, point the user at the CPU opt-in instead of guessing.
+        if (!device) std::fprintf(stderr,
+            "  (no GPU device — pass --device cpu to run on CPU, or build with FREE_SPLATTER_VULKAN)\n");
+        free_splatter_free(ctx); return 1;
     }
 
     free_splatter_geometry geo;

From 8b731af11bb5ffe7a4ec379c94be7a2d6a9309e0 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sun, 28 Jun 2026 11:47:51 +0100
Subject: [PATCH 31/33] server: scene switcher + upload-a-video-to-make-a-scene
 (in-process GPU bake)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The accumulating-reconstruction demos were separate static dirs on separate ports;
now the web app switches between them and can generate a new one from a video in
the UI.

- /api/scenes lists the growing-world scenes (server/scenes.go scanScenes: any
  scenesDir subdir whose manifest.json has a "steps" array — the storyboard's
  "stations" manifest is excluded). Served at /scenes-assets/. New `-scenes-dir`
  flag (defaults to -demo-dir). No in-memory registry: re-scanned per request, so
  externally-baked and uploaded scenes appear with no restart.
- POST /api/scene/from-video (+ GET /api/scene/status/{job}): samples frames
  (ffmpeg), runs the keyframe parallax gate + accumulate + best-frame fuse
  IN-PROCESS on the loaded GPU engine, and writes the scene dir incrementally.
  Async job + polling; one bake at a time (429 if busy). engine.go binds the
  accumulator + pair_parallax C API (with point/parallax ABI-mirror structs and
  copy helpers); convert.go adds encodeCloudSplat (mirrors write_cloud_splat +
  splat.h — note the cloud colour is already linear, NOT SH-transformed).
- server/web/accumulate.html: the viewer gains a scene dropdown + a "make scene
  from video" upload widget with live progress. It is now the SINGLE source (moved
  from web/; standalone baked dirs fall back to ./manifest.json when /api/scenes is
  absent). make_accumulate_demo.sh copies it from server/web. index.html links to it.

Verified end-to-end on Vulkan (AMD RADV): /api/scenes lists the 6 baked scenes;
uploading office-corner1.mp4 baked a 5-step scene (gate kept 5/9, same as the
offline bake-vids.sh) that renders identically to the CLI result and appears in the
dropdown live. Go build + vet clean; ctest -LE model green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 scripts/make_accumulate_demo.sh     |   2 +-
 server/README.md                    |  21 +-
 server/convert.go                   |  46 ++++
 server/engine.go                    |  81 +++++++
 server/main.go                      |  32 ++-
 server/scenes.go                    | 364 ++++++++++++++++++++++++++++
 {web => server/web}/accumulate.html | 108 ++++++++-
 server/web/index.html               |   1 +
 8 files changed, 646 insertions(+), 9 deletions(-)
 create mode 100644 server/scenes.go
 rename {web => server/web}/accumulate.html (77%)

diff --git a/scripts/make_accumulate_demo.sh b/scripts/make_accumulate_demo.sh
index 3cc109e..e837418 100755
--- a/scripts/make_accumulate_demo.sh
+++ b/scripts/make_accumulate_demo.sh
@@ -83,6 +83,6 @@ thumb() { printf '"view_%s.jpg"' "$1"; }       # thumbnail referenced by origina
 echo "kept frames (input indices): ${kept[*]}  of ${#FRAMES[@]}"
 
 # 4) the viewer, self-contained next to its assets
-cp "$ROOT/web/accumulate.html" "$OUT/index.html"
+cp "$ROOT/server/web/accumulate.html" "$OUT/index.html"   # single source; standalone-fallback to ./manifest.json
 echo "demo baked -> $OUT"
 echo "serve: (cd $OUT && python3 -m http.server 8080)  then open http://localhost:8080/"
diff --git a/server/README.md b/server/README.md
index e85b60d..a66521c 100644
--- a/server/README.md
+++ b/server/README.md
@@ -26,7 +26,26 @@ CGO_ENABLED=0 go run . \
 `net/http` the pure-Go resolver so the server builds on a bare `PATH`.
 
 Flags: `-addr` `-lib` `-models` (comma list of `name=path.gguf`; the first is the
-default) `-device` (`cpu`|`vulkan`|`vulkan:N`) `-max-splats` `-opacity-threshold`.
+default) `-device` (`vulkan` by default and **fail-closed** — it errors if no GPU
+is present rather than silently using the slow CPU path; pass `cpu` to opt in)
+`-max-splats` `-opacity-threshold` `-demo-dir` `-scenes-dir`.
+
+## Accumulating scenes + video upload (`/accumulate.html`)
+
+A second page plays the **growing-world** demo: pick a scene from the dropdown and
+watch its reconstruction build up as photos are folded in. The scenes are the
+accumulating-reconstruction dirs under `-scenes-dir` (default: the `-demo-dir`
+value) — any subdir whose `manifest.json` has a `steps` array, produced by
+`scripts/demo/bake-vids.sh` or by an upload. `GET /api/scenes` lists them (no
+restart needed when new ones appear).
+
+**Upload a video → it makes the scene.** The page's "make scene from video" control
+POSTs a clip to `POST /api/scene/from-video`; the server samples frames, runs the
+keyframe **parallax gate** (`--min-parallax`, default 8°) and the accumulate +
+best-frame fuse **in-process on the loaded GPU engine** (no shell-out, no second
+model load), and writes the scene dir. Progress streams via
+`GET /api/scene/status/{job}`; one bake runs at a time (HTTP 429 if busy). The new
+scene then appears in the dropdown automatically. Needs `ffmpeg` on `PATH`.
 
 ## Models
 
diff --git a/server/convert.go b/server/convert.go
index 2655e2b..da0ca52 100644
--- a/server/convert.go
+++ b/server/convert.go
@@ -142,6 +142,52 @@ func encodeSplat(g []float32, opacThr float32, maxSplats int) ([]byte, int) {
 	return buf, m
 }
 
+// encodeCloudSplat converts an accumulator cloud ([]point) to antimatter15 .splat
+// bytes — mirroring tools/free_splatter-cli.cpp write_cloud_splat + src/splat.h
+// encode_splat_record. CRITICAL: unlike encodeSplat (raw gaussians), the cloud's
+// R/G/B are ALREADY activated linear colour in [0,1] — do NOT apply the SH-DC
+// transform. Importance = opacity*sx*sy*sz; sort+cap only when capping (else keep
+// order). Scale is multiplied by scaleMult; pos/quat get the OpenCV->OpenGL flip.
+func encodeCloudSplat(pts []point, maxSplats int, scaleMult float32) []byte {
+	n := len(pts)
+	idx := make([]int, n)
+	for i := range idx {
+		idx[i] = i
+	}
+	imp := func(i int) float64 {
+		p := pts[i]
+		vol := float64(maxf(p.Sx, 1e-9)) * float64(maxf(p.Sy, 1e-9)) * float64(maxf(p.Sz, 1e-9))
+		return float64(maxf(p.Opacity, 0)) * vol
+	}
+	m := n
+	if maxSplats > 0 && n > maxSplats {
+		sort.Slice(idx, func(a, b int) bool { return imp(idx[a]) > imp(idx[b]) })
+		m = maxSplats
+		idx = idx[:m]
+	}
+	buf := make([]byte, m*32)
+	for k, i := range idx {
+		p := pts[i]
+		o := k * 32
+		putf(buf[o:], p.X)
+		putf(buf[o+4:], -p.Y)
+		putf(buf[o+8:], -p.Z)
+		putf(buf[o+12:], scaleMult*p.Sx)
+		putf(buf[o+16:], scaleMult*p.Sy)
+		putf(buf[o+20:], scaleMult*p.Sz)
+		buf[o+24] = u8(clamp01(float64(p.R)) * 255.0)
+		buf[o+25] = u8(clamp01(float64(p.G)) * 255.0)
+		buf[o+26] = u8(clamp01(float64(p.B)) * 255.0)
+		buf[o+27] = u8(clamp01(float64(p.Opacity)) * 255.0)
+		q := [4]float64{-float64(p.Qx), float64(p.Qw), -float64(p.Qz), float64(p.Qy)}
+		nrm := math.Sqrt(q[0]*q[0]+q[1]*q[1]+q[2]*q[2]+q[3]*q[3]) + 1e-12
+		for c := 0; c < 4; c++ {
+			buf[o+28+c] = u8(q[c]/nrm*128.0 + 128.0)
+		}
+	}
+	return buf
+}
+
 func putf(b []byte, v float32) { binary.LittleEndian.PutUint32(b, math.Float32bits(v)) }
 
 func maxf(a, b float32) float32 {
diff --git a/server/engine.go b/server/engine.go
index 6166005..010b675 100644
--- a/server/engine.go
+++ b/server/engine.go
@@ -22,6 +22,19 @@ type geometry struct {
 	gaussianChannels int32
 }
 
+// point mirrors free_splatter_point exactly: 14 float32 + 1 int32 = 60 bytes, all
+// 4-byte aligned (no padding). Field ORDER is load-bearing across the C ABI.
+type point struct {
+	X, Y, Z, R, G, B, Opacity, Sx, Sy, Sz, Qw, Qx, Qy, Qz float32
+	Frame                                                 int32
+}
+
+// parallax mirrors free_splatter_parallax: 6 float32 + 1 int32 = 28 bytes.
+type parallax struct {
+	TriAngleDeg, LateralAngleDeg, BaselineOverDepth, Baseline, MedianDepth, Focal float32
+	NPoints                                                                       int32
+}
+
 // Lib is the dlopen'd shared library with the C API bound by name.
 type Lib struct {
 	handle uintptr
@@ -36,6 +49,15 @@ type Lib struct {
 	run        func(uintptr, unsafe.Pointer, int32, int32, int32, unsafe.Pointer, unsafe.Pointer) int32
 	bufFree    func(uintptr)
 	abiVersion func() int32
+
+	// accumulator + parallax (for in-process video->scene baking)
+	accNew      func(int32, int32, float32) uintptr
+	accFree     func(uintptr)
+	accAddPair  func(uintptr, unsafe.Pointer, int32) int32
+	accFrameCnt func(uintptr) int32
+	accCloud    func(uintptr, unsafe.Pointer, unsafe.Pointer) int32
+	accFuse     func(uintptr, float32, int32, int32, unsafe.Pointer, unsafe.Pointer) int32
+	parallaxFn  func(unsafe.Pointer, int32, int32, int32, int32, float32, unsafe.Pointer) int32
 }
 
 // Engine is one loaded model (context) belonging to a Lib.
@@ -79,6 +101,13 @@ func OpenLib(libPath string) (*Lib, error) {
 	purego.RegisterLibFunc(&l.run, handle, "free_splatter_run")
 	purego.RegisterLibFunc(&l.bufFree, handle, "free_splatter_buf_free")
 	purego.RegisterLibFunc(&l.abiVersion, handle, "free_splatter_abi_version")
+	purego.RegisterLibFunc(&l.accNew, handle, "free_splatter_accumulator_new")
+	purego.RegisterLibFunc(&l.accFree, handle, "free_splatter_accumulator_free")
+	purego.RegisterLibFunc(&l.accAddPair, handle, "free_splatter_accumulator_add_pair")
+	purego.RegisterLibFunc(&l.accFrameCnt, handle, "free_splatter_accumulator_frame_count")
+	purego.RegisterLibFunc(&l.accCloud, handle, "free_splatter_accumulator_cloud")
+	purego.RegisterLibFunc(&l.accFuse, handle, "free_splatter_accumulator_fuse")
+	purego.RegisterLibFunc(&l.parallaxFn, handle, "free_splatter_pair_parallax")
 	if v := l.abiVersion(); v != 1 {
 		purego.Dlclose(handle)
 		return nil, fmt.Errorf("ABI mismatch: library reports %d, expected 1", v)
@@ -149,3 +178,55 @@ func (e *Engine) Close() {
 		e.ctx = 0
 	}
 }
+
+// --- accumulator + parallax helpers (in-process video->scene baking) ---
+
+// pointsFromC copies a malloc'd free_splatter_point[n] to a Go slice and frees it.
+func (l *Lib) pointsFromC(ptr uintptr, n uint64) []point {
+	if ptr == 0 {
+		return nil
+	}
+	out := make([]point, int(n))
+	if n > 0 {
+		copy(out, unsafe.Slice((*point)(unsafe.Pointer(ptr)), int(n)))
+	}
+	l.bufFree(ptr)
+	return out
+}
+
+// accCloudCopy returns a copy of the accumulator's current cloud.
+func (l *Lib) accCloudCopy(acc uintptr) []point {
+	var ptr uintptr
+	var n uint64
+	if l.accCloud(acc, unsafe.Pointer(&ptr), unsafe.Pointer(&n)) != 0 {
+		return nil
+	}
+	return l.pointsFromC(ptr, n)
+}
+
+// accFuseCopy returns the consensus-fused cloud (voxelFrac, k, mode: 0 avg/1 kept/2 best).
+func (l *Lib) accFuseCopy(acc uintptr, voxelFrac float32, k, mode int32) []point {
+	var ptr uintptr
+	var n uint64
+	if l.accFuse(acc, voxelFrac, k, mode, unsafe.Pointer(&ptr), unsafe.Pointer(&n)) != 0 {
+		return nil
+	}
+	return l.pointsFromC(ptr, n)
+}
+
+// pairParallax computes the after-inference parallax of a 2-view gaussian buffer
+// (the layout Engine.Run returns for nViews=2).
+func (l *Lib) pairParallax(g []float32, h, w, gc int32, thr float32) (parallax, bool) {
+	var px parallax
+	rc := l.parallaxFn(unsafe.Pointer(&g[0]), 2, h, w, gc, thr, unsafe.Pointer(&px))
+	runtime.KeepAlive(g)
+	return px, rc == 0
+}
+
+// accAddPairSlice folds one pair's gaussians into the accumulator (the C side
+// copies them, so g may be reused after). Returns 0 on success.
+func (l *Lib) accAddPairSlice(acc uintptr, g []float32, gc int32) int32 {
+	rc := l.accAddPair(acc, unsafe.Pointer(&g[0]), gc)
+	runtime.KeepAlive(g)
+	return rc
+}
diff --git a/server/main.go b/server/main.go
index 8afee0b..7cc91ba 100644
--- a/server/main.go
+++ b/server/main.go
@@ -6,10 +6,14 @@
 //   cd server && go run . [-models scene=A.gguf,object=B.gguf] [-device vulkan] [-addr :8080]
 //
 // REST API:
-//   GET  /api/models         -> [{name,label,hint,views,size}, ...]
-//   POST /api/reconstruct     multipart "images" (>=1) + optional "model" -> {id,model,n_views,n_splats,size,seconds}
-//   GET  /api/splat/{id}      -> the .splat bytes (importance-sorted, 32B/splat)
-//   GET  /                    -> embedded frontend (web/)
+//   GET  /api/models          -> [{name,label,hint,views,size}, ...]
+//   POST /api/reconstruct      multipart "images" (>=1) + optional "model" -> {id,model,n_views,n_splats,size,seconds}
+//   GET  /api/splat/{id}       -> the .splat bytes (importance-sorted, 32B/splat)
+//   GET  /api/scenes           -> accumulating-reconstruction scenes [{name,label,steps,thumb,source}, ...]
+//   POST /api/scene/from-video  multipart "video" + "name" -> {job}  (gated in-process accumulate)
+//   GET  /api/scene/status/{job} -> {state,total,done,kept,scene,error}
+//   GET  /scenes-assets/...    -> a scene's manifest.json + .splat + thumbnails
+//   GET  / , /accumulate.html  -> embedded frontend (reconstruct page + scene viewer)
 package main
 
 import (
@@ -53,12 +57,17 @@ type server struct {
 	opts        options
 	bgRemoveCmd string // external matting command ("" = feature disabled)
 	demoDir     string // static dir served at /demo-assets/ (manifest + splats + images); "" = off
+	scenesDir   string // accumulating-reconstruction scenes (per-subdir steps-manifest); "" = off
 	workDir     string // scratch root for demo video frames + encoded MP4s
 
 	mu      sync.Mutex
 	results map[string][]byte
 	order2  []string
 	counter int64
+
+	jobsMu  sync.Mutex
+	jobs    map[string]*sceneJob
+	bakeSem chan struct{} // capacity 1: at most one video->scene bake at a time
 }
 
 func respondJSON(w http.ResponseWriter, v any) {
@@ -255,6 +264,7 @@ func main() {
 	bgCmd := flag.String("bgremove-cmd", "",
 		"external background-removal command for the object path ({in}/{out} are batch dirs), e.g. 'rembg p -m u2netp {in} {out}'")
 	demoDir := flag.String("demo-dir", "", "directory of baked demo assets (manifest.json + .splat + images) served at /demo-assets/")
+	scenesDir := flag.String("scenes-dir", "", "root of accumulating-reconstruction scenes (per-subdir steps-manifest); listed at /api/scenes, served at /scenes-assets/, written by video uploads. Default: the -demo-dir value")
 	workDir := flag.String("work-dir", filepath.Join(os.TempDir(), "freesplatter-demo"), "scratch root for demo video frames + encoded MP4s")
 	flag.Parse()
 
@@ -274,13 +284,20 @@ func main() {
 	if *demoDir != "" {
 		demoAbs, _ = filepath.Abs(*demoDir)
 	}
+	scenesAbs := demoAbs // default: scenes live alongside the baked demo assets
+	if *scenesDir != "" {
+		scenesAbs, _ = filepath.Abs(*scenesDir)
+	}
 	s := &server{
 		models:      map[string]*Engine{},
 		opts:        options{maxSplats: *maxSplats, opacThr: float32(*opacThr)},
 		bgRemoveCmd: *bgCmd,
 		demoDir:     demoAbs,
+		scenesDir:   scenesAbs,
 		workDir:     *workDir,
 		results:     map[string][]byte{},
+		jobs:        map[string]*sceneJob{},
+		bakeSem:     make(chan struct{}, 1),
 	}
 	if *bgCmd != "" {
 		log.Printf("background removal enabled: %s", *bgCmd)
@@ -309,10 +326,17 @@ func main() {
 	mux.HandleFunc("/api/demo/frame", s.handleDemoFrame)
 	mux.HandleFunc("/api/demo/encode", s.handleDemoEncode)
 	mux.HandleFunc("/api/demo/video/", s.handleDemoVideo)
+	mux.HandleFunc("/api/scenes", s.handleScenes)
+	mux.HandleFunc("/api/scene/from-video", s.handleSceneFromVideo)
+	mux.HandleFunc("/api/scene/status/", s.handleSceneStatus)
 	if s.demoDir != "" {
 		mux.Handle("/demo-assets/", http.StripPrefix("/demo-assets/", http.FileServer(http.Dir(s.demoDir))))
 		log.Printf("serving demo assets from %s at /demo-assets/ (work dir %s)", s.demoDir, s.workDir)
 	}
+	if s.scenesDir != "" {
+		mux.Handle("/scenes-assets/", http.StripPrefix("/scenes-assets/", http.FileServer(http.Dir(s.scenesDir))))
+		log.Printf("serving scenes from %s at /scenes-assets/ (list: /api/scenes)", s.scenesDir)
+	}
 	mux.Handle("/", http.FileServer(http.FS(sub)))
 
 	log.Printf("serving %d model(s) %v on http://localhost%s", len(s.order), s.order, *addr)
diff --git a/server/scenes.go b/server/scenes.go
new file mode 100644
index 0000000..36dd3de
--- /dev/null
+++ b/server/scenes.go
@@ -0,0 +1,364 @@
+// scenes.go — the accumulating-reconstruction side of the web app: list the baked
+// "growing world" scenes, and generate a new one from an uploaded video in-process
+// (reuse the already-loaded GPU engine, no shell-out, no second model load).
+//
+//   GET  /api/scenes                 -> [{name,label,steps,thumb,source}, ...]
+//   POST /api/scene/from-video        multipart "video" + "name" -> {job}
+//   GET  /api/scene/status/{job}      -> {state,total,done,kept,scene,error}
+//   GET  /scenes-assets/<name>/...    static scene files (manifest.json, *.splat, view_*.jpg)
+//
+// A scene is a subdir of scenesDir whose manifest.json has a top-level "steps"
+// array (acc_2.splat, acc_3.splat, ... + a fused step) — the format produced by
+// scripts/make_accumulate_demo.sh and by handleSceneFromVideo here. The viewer
+// (server/web/accumulate.html) plays the steps as the reconstruction grows.
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+// --- manifest format (shared with server/web/accumulate.html + make_accumulate_demo.sh) ---
+
+type manifestStep struct {
+	Splat  string   `json:"splat"`
+	Images []string `json:"images"`
+	N      int      `json:"n"`
+	Label  string   `json:"label,omitempty"`
+}
+type sceneManifest struct {
+	Steps []manifestStep `json:"steps"`
+}
+
+// --- listing -------------------------------------------------------------------
+
+type sceneInfo struct {
+	Name   string `json:"name"`
+	Label  string `json:"label"`
+	Steps  int    `json:"steps"`
+	Thumb  string `json:"thumb"` // /scenes-assets/<name>/<first image>, for the picker
+	Source string `json:"source"`
+}
+
+// scanScenes lists the accumulating-reconstruction scenes under scenesDir: each
+// immediate subdir with a manifest.json that has a non-empty "steps" array. Cheap
+// enough to run per request, so externally-baked and uploaded scenes appear with
+// no restart.
+func (s *server) scanScenes() []sceneInfo {
+	if s.scenesDir == "" {
+		return nil
+	}
+	ents, err := os.ReadDir(s.scenesDir)
+	if err != nil {
+		return nil
+	}
+	out := []sceneInfo{}
+	for _, e := range ents {
+		if !e.IsDir() {
+			continue
+		}
+		name := e.Name()
+		raw, err := os.ReadFile(filepath.Join(s.scenesDir, name, "manifest.json"))
+		if err != nil {
+			continue
+		}
+		var m sceneManifest
+		if json.Unmarshal(raw, &m) != nil || len(m.Steps) == 0 {
+			continue // storyboard manifests ("stations") and malformed dirs are skipped
+		}
+		thumb := ""
+		if len(m.Steps[0].Images) > 0 {
+			thumb = "/scenes-assets/" + name + "/" + m.Steps[0].Images[0]
+		}
+		label := strings.ReplaceAll(name, "-", " ")
+		out = append(out, sceneInfo{
+			Name: name, Label: label, Steps: len(m.Steps), Thumb: thumb, Source: "baked",
+		})
+	}
+	sort.Slice(out, func(a, b int) bool { return out[a].Name < out[b].Name })
+	return out
+}
+
+func (s *server) handleScenes(w http.ResponseWriter, r *http.Request) {
+	respondJSON(w, map[string]any{"scenes": s.scanScenes()})
+}
+
+// --- video -> scene (async) ----------------------------------------------------
+
+type sceneJob struct {
+	State string `json:"state"` // "running" | "done" | "error"
+	Total int    `json:"total"` // candidate frames
+	Done  int    `json:"done"`  // candidates processed
+	Kept  int    `json:"kept"`  // frames folded in so far
+	Scene string `json:"scene"` // slug, set on done
+	Err   string `json:"error,omitempty"`
+}
+
+func (s *server) putJob(id string, mutate func(*sceneJob)) {
+	s.jobsMu.Lock()
+	defer s.jobsMu.Unlock()
+	j := s.jobs[id]
+	if j == nil {
+		j = &sceneJob{}
+		s.jobs[id] = j
+	}
+	mutate(j)
+}
+
+func (s *server) handleSceneStatus(w http.ResponseWriter, r *http.Request) {
+	id := strings.TrimPrefix(r.URL.Path, "/api/scene/status/")
+	s.jobsMu.Lock()
+	j := s.jobs[id]
+	var cp sceneJob
+	if j != nil {
+		cp = *j
+	}
+	s.jobsMu.Unlock()
+	if j == nil {
+		http.Error(w, "no such job", http.StatusNotFound)
+		return
+	}
+	respondJSON(w, cp)
+}
+
+// handleSceneFromVideo accepts a clip, returns a job id immediately, and bakes the
+// gated accumulating scene in a background goroutine (max one at a time).
+func (s *server) handleSceneFromVideo(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodPost {
+		http.Error(w, "POST only", http.StatusMethodNotAllowed)
+		return
+	}
+	if s.scenesDir == "" {
+		http.Error(w, "no scenes dir configured (-scenes-dir / -demo-dir)", http.StatusServiceUnavailable)
+		return
+	}
+	if err := r.ParseMultipartForm(256 << 20); err != nil {
+		http.Error(w, "bad multipart form: "+err.Error(), http.StatusBadRequest)
+		return
+	}
+	name, ok := safeToken(r.FormValue("name"))
+	if !ok {
+		http.Error(w, "bad or missing scene name (a-z0-9-_ only)", http.StatusBadRequest)
+		return
+	}
+	minParallax := float32(8)
+	if v, err := strconv.ParseFloat(r.FormValue("min_parallax"), 32); err == nil {
+		minParallax = float32(v)
+	}
+	maxFrames := 24
+	if v, err := strconv.Atoi(r.FormValue("max_frames")); err == nil && v >= 2 && v <= 200 {
+		maxFrames = v
+	}
+	file, _, err := r.FormFile("video")
+	if err != nil {
+		http.Error(w, `no video (expected multipart field "video")`, http.StatusBadRequest)
+		return
+	}
+	defer file.Close()
+
+	// one bake at a time — a bake monopolizes the single engine for many GPU-seconds.
+	select {
+	case s.bakeSem <- struct{}{}:
+	default:
+		http.Error(w, "a scene bake is already running; try again shortly", http.StatusTooManyRequests)
+		return
+	}
+
+	jobDir := filepath.Join(s.workDir, "scene", name)
+	if err := os.MkdirAll(jobDir, 0o755); err != nil {
+		<-s.bakeSem
+		http.Error(w, "mkdir: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	inPath := filepath.Join(jobDir, "input")
+	in, err := os.Create(inPath)
+	if err != nil {
+		<-s.bakeSem
+		http.Error(w, "save upload: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	if _, err := io.Copy(in, file); err != nil {
+		in.Close()
+		<-s.bakeSem
+		http.Error(w, "save upload: "+err.Error(), http.StatusInternalServerError)
+		return
+	}
+	in.Close()
+
+	jobID := name // one bake at a time + scene name as slug -> stable status URL
+	s.putJob(jobID, func(j *sceneJob) { *j = sceneJob{State: "running"} })
+
+	go func() {
+		defer func() { <-s.bakeSem }()
+		scene, err := s.bakeSceneFromVideo(name, inPath, jobDir, minParallax, maxFrames)
+		s.putJob(jobID, func(j *sceneJob) {
+			if err != nil {
+				j.State, j.Err = "error", err.Error()
+				return
+			}
+			j.State, j.Scene = "done", scene
+		})
+	}()
+
+	respondJSON(w, map[string]any{"job": jobID, "name": name})
+}
+
+// bakeSceneFromVideo: sample frames -> gated in-process accumulate -> scene dir.
+func (s *server) bakeSceneFromVideo(name, inPath, jobDir string, minParallax float32, maxFrames int) (string, error) {
+	eng := s.models[s.order[0]] // scene model is the default/first
+	if eng == nil {
+		return "", fmt.Errorf("no scene model loaded")
+	}
+	sz := int(eng.Geo.imageWidth)
+	gc := int(eng.Geo.gaussianChannels)
+
+	// 1) decode all frames, then even-stride down to <= maxFrames (time-lapse clips
+	//    are short; decoding all avoids frame-count guessing — mirrors bake-vids.sh).
+	frameDir := filepath.Join(jobDir, "frames")
+	os.RemoveAll(frameDir)
+	if err := os.MkdirAll(frameDir, 0o755); err != nil {
+		return "", err
+	}
+	if err := runFFmpeg("-i", inPath, filepath.Join(frameDir, "all%04d.png")); err != nil {
+		return "", fmt.Errorf("frame extraction: %w", err)
+	}
+	all, _ := filepath.Glob(filepath.Join(frameDir, "all*.png"))
+	sort.Strings(all)
+	if len(all) < 2 {
+		return "", fmt.Errorf("video decoded to %d frames (need >= 2)", len(all))
+	}
+	stride := (len(all) + maxFrames - 1) / maxFrames
+	if stride < 1 {
+		stride = 1
+	}
+	var pngs []string
+	for i := 0; i < len(all); i += stride {
+		pngs = append(pngs, all[i])
+	}
+
+	// 2) preprocess sampled frames to model input (CHW [0,1]).
+	frames := make([][]float32, len(pngs))
+	for i, p := range pngs {
+		data, err := os.ReadFile(p)
+		if err != nil {
+			return "", err
+		}
+		px, err := preprocessImage(data, sz)
+		if err != nil {
+			return "", fmt.Errorf("preprocess %s: %w", filepath.Base(p), err)
+		}
+		frames[i] = px
+	}
+	s.putJob(name, func(j *sceneJob) { j.Total = len(frames) - 1 })
+
+	// 3) gated in-process accumulate (mirrors free_splatter-cli.cpp keyframe loop).
+	acc := eng.lib.accNew(int32(sz), int32(sz), s.opts.opacThr)
+	if acc == 0 {
+		return "", fmt.Errorf("accumulator alloc failed")
+	}
+	defer eng.lib.accFree(acc)
+
+	sceneDir := filepath.Join(s.scenesDir, name)
+	if err := os.MkdirAll(sceneDir, 0o755); err != nil {
+		return "", err
+	}
+	// frame 0 is the anchor; thumbnail it as view_0.jpg.
+	if err := thumbnail(pngs[0], filepath.Join(sceneDir, "view_0.jpg")); err != nil {
+		return "", err
+	}
+
+	keptIdx := []int{0}
+	lastKept := 0
+	for j := 1; j < len(frames); j++ {
+		pair := make([]float32, 0, 2*len(frames[0]))
+		pair = append(pair, frames[lastKept]...)
+		pair = append(pair, frames[j]...)
+		g, err := eng.Run(pair, 2)
+		if err != nil {
+			return "", fmt.Errorf("inference pair (%d,%d): %w", lastKept, j, err)
+		}
+		if minParallax > 0 {
+			if px, ok := eng.lib.pairParallax(g, int32(sz), int32(sz), int32(gc), s.opts.opacThr); ok && px.TriAngleDeg < minParallax {
+				s.putJob(name, func(jb *sceneJob) { jb.Done = j })
+				continue // re-anchor: lastKept unchanged, try the next candidate
+			}
+		}
+		if eng.lib.accAddPairSlice(acc, g, int32(gc)) != 0 {
+			return "", fmt.Errorf("add_pair (%d,%d) failed", lastKept, j)
+		}
+		nframes := int(eng.lib.accFrameCnt(acc))
+		cloud := eng.lib.accCloudCopy(acc)
+		splat := encodeCloudSplat(cloud, s.opts.maxSplats, 1.0)
+		if err := os.WriteFile(filepath.Join(sceneDir, fmt.Sprintf("acc_%d.splat", nframes)), splat, 0o644); err != nil {
+			return "", err
+		}
+		keptIdx = append(keptIdx, j)
+		if err := thumbnail(pngs[j], filepath.Join(sceneDir, fmt.Sprintf("view_%d.jpg", len(keptIdx)-1))); err != nil {
+			return "", err
+		}
+		lastKept = j
+		s.putJob(name, func(jb *sceneJob) { jb.Done, jb.Kept = j, len(keptIdx) })
+	}
+	if len(keptIdx) < 2 {
+		return "", fmt.Errorf("only %d frame kept (min-parallax %.0f too high for this clip?)", len(keptIdx), minParallax)
+	}
+
+	// 4) consensus-fused surface (best-frame, k=2, voxel 0.02 — same as the CLI bake).
+	fused := encodeCloudSplat(eng.lib.accFuseCopy(acc, 0.02, 2, 2), s.opts.maxSplats, 1.0)
+	if err := os.WriteFile(filepath.Join(sceneDir, "acc_fused.splat"), fused, 0o644); err != nil {
+		return "", err
+	}
+
+	// 5) manifest: one step per kept frame (acc_2..acc_N) + the fused step.
+	if err := writeSceneManifest(sceneDir, len(keptIdx)); err != nil {
+		return "", err
+	}
+	return name, nil
+}
+
+// writeSceneManifest emits the steps manifest from `nkept` kept frames, referencing
+// view_0.jpg .. view_<k-1>.jpg per step — identical shape to make_accumulate_demo.sh.
+func writeSceneManifest(dir string, nkept int) error {
+	var m sceneManifest
+	imgs := func(k int) []string {
+		out := make([]string, k)
+		for j := 0; j < k; j++ {
+			out[j] = fmt.Sprintf("view_%d.jpg", j)
+		}
+		return out
+	}
+	for k := 2; k <= nkept; k++ {
+		m.Steps = append(m.Steps, manifestStep{Splat: fmt.Sprintf("acc_%d.splat", k), Images: imgs(k), N: k})
+	}
+	m.Steps = append(m.Steps, manifestStep{
+		Splat: "acc_fused.splat", Images: imgs(nkept), N: nkept,
+		Label: "consensus-fused — best-frame, parallax-gated",
+	})
+	raw, err := json.MarshalIndent(m, "", "  ")
+	if err != nil {
+		return err
+	}
+	return os.WriteFile(filepath.Join(dir, "manifest.json"), raw, 0o644)
+}
+
+// thumbnail center-crops src to a square and scales to 360px, mirroring the engine
+// preprocessing (so the filmstrip shows exactly the reconstructed view).
+func thumbnail(src, dst string) error {
+	return runFFmpeg("-i", src, "-vf", "crop='min(iw,ih)':'min(iw,ih)',scale=360:360", dst)
+}
+
+func runFFmpeg(args ...string) error {
+	full := append([]string{"-y", "-loglevel", "error"}, args...)
+	if out, err := exec.Command("ffmpeg", full...).CombinedOutput(); err != nil {
+		return fmt.Errorf("ffmpeg: %v: %s", err, strings.TrimSpace(string(out)))
+	}
+	return nil
+}
diff --git a/web/accumulate.html b/server/web/accumulate.html
similarity index 77%
rename from web/accumulate.html
rename to server/web/accumulate.html
index 230b083..e45c8e3 100644
--- a/web/accumulate.html
+++ b/server/web/accumulate.html
@@ -63,6 +63,20 @@
     border: 1px solid rgba(255,255,255,.25); box-shadow: 0 8px 26px rgba(0,0,0,.5); }
   #banner.show { display: block; opacity: 1; transform: translateX(-50%) translateY(0); }
   #banner b { font-weight: 750; }
+  /* scene switcher + video upload (server mode only) */
+  #server-ui { display: none; margin-top: 10px; border-top: 1px solid rgba(255,255,255,.08); padding-top: 10px; }
+  #server-ui .srow { display: flex; align-items: center; gap: 8px; margin-bottom: 8px; }
+  #server-ui label { color: #8b93a1; font-size: 11.5px; }
+  #server-ui select, #server-ui input[type=text] { flex: 1; min-width: 0; background: rgba(255,255,255,.06);
+    color: #e8eaed; border: 1px solid rgba(255,255,255,.12); border-radius: 8px; padding: 4px 8px; font-size: 12px; }
+  #server-ui .up { display: flex; flex-direction: column; gap: 6px; }
+  #server-ui input[type=file] { font-size: 11px; color: #9aa3b2; }
+  #server-ui #vid-go { padding: 5px 8px; font-size: 12px; color: #e8eaed; cursor: pointer;
+    background: rgba(91,140,255,.18); border: 1px solid rgba(122,162,255,.5); border-radius: 8px; }
+  #server-ui #vid-go:hover { background: rgba(91,140,255,.3); }
+  #server-ui #vid-go:disabled { opacity: .5; cursor: default; }
+  #vid-bar { height: 5px; background: rgba(255,255,255,.08); border-radius: 99px; overflow: hidden; display: none; }
+  #vid-bar > div { height: 100%; width: 0; background: linear-gradient(90deg,#5b8cff,#22a05a); transition: width .3s ease; }
 </style>
 </head>
 <body>
@@ -76,6 +90,15 @@ <h1>accumulating reconstruction</h1>
   <div class="stat"><span>fps</span><span id="s-fps">—</span></div>
   <div id="status">starting…</div>
   <div id="ctl"><button id="prev">‹ prev</button><button id="play">⏸ pause</button><button id="next">next ›</button></div>
+  <div id="server-ui">
+    <div class="srow"><label>scene</label><select id="scene-sel"></select></div>
+    <div class="up">
+      <input type="file" id="vid" accept="video/*"/>
+      <input type="text" id="vid-name" placeholder="scene name (a-z0-9-)"/>
+      <button id="vid-go">＋ make scene from video</button>
+      <div id="vid-bar"><div></div></div>
+    </div>
+  </div>
 </div>
 <div id="strip"></div>
 <div id="banner"></div>
@@ -84,7 +107,11 @@ <h1>accumulating reconstruction</h1>
 <script type="module">
 const ROW = 32;                       // bytes per splat
 const _q = new URLSearchParams(location.search);
-const BASE = (_q.get("dir") || ".").replace(/\/$/, "");
+// BASE is the dir holding manifest.json + splats. Set by ?dir= (standalone baked
+// dir), else chosen at runtime: server mode = /scenes-assets/<scene>, standalone
+// fallback = "." (manifest.json next to this page).
+let BASE = (_q.get("dir") || "").replace(/\/$/, "");
+let serverMode = false;
 const STEP_MS = +(_q.get("ms") || 2600);
 const START_STEP = parseInt(_q.get("start") || "0", 10);   // begin at this step
 const AUTOPLAY = _q.get("auto") !== "0";                    // ?auto=0 = no advance
@@ -350,17 +377,92 @@ <h1>accumulating reconstruction</h1>
 $("next").onclick = ()=>{ playing=false; $("play").textContent="▶ play"; loadStep(Math.min(manifest.steps.length-1,cur+1)); };
 $("play").onclick = ()=>{ playing=!playing; $("play").textContent = playing?"⏸ pause":"▶ play"; schedule(); };
 
-(async function start(){
-  requestAnimationFrame(render);
+// load the manifest at the current BASE and start (re-entrant: resets per scene).
+async function loadManifestAndPlay(){
+  clearTimeout(timer);
+  manifest=null; cur=-1; framed=false;
   try {
     setStatus("loading manifest…");
     manifest = await (await fetch(`${BASE}/manifest.json`, { cache: "no-store" })).json();
     if(!manifest.steps || !manifest.steps.length) throw new Error("manifest has no steps");
     const s0 = Math.max(0, Math.min(manifest.steps.length-1, START_STEP));
+    playing = AUTOPLAY;
     await loadStep(s0);
     $("play").textContent = playing ? "⏸ pause" : "▶ play";
     schedule();
   } catch(err){ setStatus("manifest.json not found — "+err.message, true); console.error(err); }
+}
+
+// ---------- scene switching (server mode) ----------
+function populateScenes(scenes){
+  const sel=$("scene-sel"); sel.innerHTML="";
+  for (const sc of scenes){
+    const o=document.createElement("option"); o.value=sc.name;
+    o.textContent=`${sc.label} · ${sc.steps} steps`; sel.appendChild(o);
+  }
+}
+async function selectScene(name){
+  if(!name) return;
+  $("scene-sel").value = name;
+  BASE = "/scenes-assets/" + name;
+  await loadManifestAndPlay();
+}
+$("scene-sel").onchange = () => selectScene($("scene-sel").value);
+
+// ---------- upload a video -> bake a scene ----------
+function slug(s){ return (s||"").toLowerCase().replace(/[^a-z0-9_-]+/g,"-").replace(/^-+|-+$/g,"").slice(0,48); }
+$("vid").onchange = () => { if(!$("vid-name").value && $("vid").files[0])
+  $("vid-name").value = slug($("vid").files[0].name.replace(/\.[^.]+$/,"")); };
+$("vid-go").onclick = async () => {
+  const f = $("vid").files[0];
+  const nm = slug($("vid-name").value || (f && f.name.replace(/\.[^.]+$/,"")));
+  if(!f){ setStatus("pick a video first", true); return; }
+  if(!nm){ setStatus("enter a scene name", true); return; }
+  const fd = new FormData(); fd.append("video", f); fd.append("name", nm);
+  const go=$("vid-go"), bar=$("vid-bar"); go.disabled=true; bar.style.display="block"; bar.firstElementChild.style.width="0%";
+  setStatus(`uploading ${f.name}…`);
+  let resp;
+  try { resp = await fetch("/api/scene/from-video", { method:"POST", body:fd }); }
+  catch(e){ setStatus("upload failed: "+e.message, true); go.disabled=false; return; }
+  if(!resp.ok){ setStatus("rejected: "+(await resp.text()).trim(), true); go.disabled=false; bar.style.display="none"; return; }
+  const { job } = await resp.json();
+  const poll = setInterval(async () => {
+    let st; try { st = await (await fetch("/api/scene/status/"+encodeURIComponent(job), {cache:"no-store"})).json(); }
+    catch(e){ return; }
+    if(st.state==="running"){
+      setStatus(`baking ${nm}: ${st.done}/${st.total} frames · ${st.kept} kept`);
+      bar.firstElementChild.style.width = st.total ? Math.round(st.done/st.total*100)+"%" : "0%";
+    } else if(st.state==="done"){
+      clearInterval(poll); go.disabled=false; bar.firstElementChild.style.width="100%";
+      const scenes=(await (await fetch("/api/scenes",{cache:"no-store"})).json()).scenes;
+      populateScenes(scenes); await selectScene(st.scene);
+      setTimeout(()=>{ bar.style.display="none"; }, 1200);
+    } else if(st.state==="error"){
+      clearInterval(poll); go.disabled=false; bar.style.display="none";
+      setStatus("bake failed: "+st.error, true);
+    }
+  }, 1200);
+};
+
+// ---------- init: server mode (scene list) vs standalone (?dir / local manifest) ----------
+(async function init(){
+  requestAnimationFrame(render);
+  if (BASE) { await loadManifestAndPlay(); return; }   // ?dir= : explicit standalone dir
+  let scenes = null;
+  try { const r = await fetch("/api/scenes", { cache:"no-store" }); if(r.ok) scenes = (await r.json()).scenes; } catch(e){}
+  if (scenes !== null) {                                // served by the Go app
+    serverMode = true;
+    $("server-ui").style.display = "block";
+    populateScenes(scenes);
+    if (scenes.length) {
+      const want = _q.get("scene");
+      const pick = scenes.find(s => s.name === want) ? want : scenes[0].name;
+      await selectScene(pick);
+    } else setStatus("no scenes yet — upload a video to make one");
+    return;
+  }
+  BASE = ".";                                           // standalone baked dir: manifest.json next to this page
+  await loadManifestAndPlay();
 })();
 </script>
 </body>
diff --git a/server/web/index.html b/server/web/index.html
index 2c0cb09..d2c0d79 100644
--- a/server/web/index.html
+++ b/server/web/index.html
@@ -73,6 +73,7 @@
     <h1>Photos → 3D gaussian splat</h1>
     <select id="model" title="reconstruction model"></select>
     <div class="sub" id="sub">Drop photos of the same scene from different angles — they must overlap. One feed-forward pass on the GPU (Vulkan).</div>
+    <div class="sub" style="margin-top:6px"><a href="/accumulate.html" style="color:#7aa2ff;text-decoration:none">▸ accumulating scenes &amp; build one from a video →</a></div>
     <div id="zone">
       <div class="big">📷 Drag photos here, or click to choose</div>
       <div class="small" id="zone-hint">JPG / PNG · overlapping views</div>

From 1dd5b2f338bdf0ac177570b9e27a05716404e9a9 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Sun, 28 Jun 2026 12:01:10 +0100
Subject: [PATCH 32/33] server: / is a menu of the demo pages (one server, one
 port)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pages were already one server, but / was the reconstruct page and the baked
accumulating demos were being served as separate static instances on other ports.
Now / is a landing menu and everything is reachable from this one server:

- server/web/index.html: new menu — cards link to /reconstruct.html (photos -> splat),
  /accumulate.html (accumulating scenes + video upload), /demo.html (storyboard).
- the reconstruct page moves to /reconstruct.html (was index.html); each page gets a
  "‹ all demos" link back to the menu.
- the standalone per-scene static servers are obsolete: those scenes are listed by
  /api/scenes and switched in /accumulate.html, so there's nothing left to run on a
  separate port.

Verified on the single Vulkan server: /, /reconstruct.html, /accumulate.html,
/demo.html, /api/scenes all 200; the menu renders three cards linking the pages.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 server/README.md            |   4 +
 server/main.go              |   2 +-
 server/web/accumulate.html  |   1 +
 server/web/demo.html        |   1 +
 server/web/index.html       | 581 +++---------------------------------
 server/web/reconstruct.html | 556 ++++++++++++++++++++++++++++++++++
 6 files changed, 606 insertions(+), 539 deletions(-)
 create mode 100644 server/web/reconstruct.html

diff --git a/server/README.md b/server/README.md
index a66521c..ec424e4 100644
--- a/server/README.md
+++ b/server/README.md
@@ -9,6 +9,10 @@ This is a thin Go HTTP server that binds the `free_splatter` C API with
 engine — the numerical port, validation, and build details live in the repo root
 `CLAUDE.md`.
 
+One server, one port. `/` is a **menu** linking to the demo pages, all served from
+here: `/reconstruct.html` (photos → splat), `/accumulate.html` (accumulating scenes
++ video upload), `/demo.html` (storyboard).
+
 ## Run
 
 ```sh
diff --git a/server/main.go b/server/main.go
index 7cc91ba..981f2a0 100644
--- a/server/main.go
+++ b/server/main.go
@@ -13,7 +13,7 @@
 //   POST /api/scene/from-video  multipart "video" + "name" -> {job}  (gated in-process accumulate)
 //   GET  /api/scene/status/{job} -> {state,total,done,kept,scene,error}
 //   GET  /scenes-assets/...    -> a scene's manifest.json + .splat + thumbnails
-//   GET  / , /accumulate.html  -> embedded frontend (reconstruct page + scene viewer)
+//   GET  /                     -> menu; /reconstruct.html, /accumulate.html, /demo.html are the pages
 package main
 
 import (
diff --git a/server/web/accumulate.html b/server/web/accumulate.html
index e45c8e3..5b5365a 100644
--- a/server/web/accumulate.html
+++ b/server/web/accumulate.html
@@ -82,6 +82,7 @@
 <body>
 <canvas id="gl"></canvas>
 <div id="panel">
+  <a href="/" style="display:inline-block;color:#7aa2ff;text-decoration:none;font-size:11.5px;margin-bottom:6px">‹ all demos</a>
   <h1>accumulating reconstruction</h1>
   <div class="sub">free-splatter.cpp — one world from a growing photo stream.</div>
   <div id="bar"><div></div></div>
diff --git a/server/web/demo.html b/server/web/demo.html
index f0549c2..ab78579 100644
--- a/server/web/demo.html
+++ b/server/web/demo.html
@@ -45,6 +45,7 @@
 <canvas id="gl"></canvas>
 
 <div class="card" id="ui">
+  <a href="/" style="display:inline-block;color:#7aa2ff;text-decoration:none;font-size:11.5px;margin-bottom:6px">‹ all demos</a>
   <h1>free-splatter.cpp · demo render</h1>
   <div class="sub" id="sub">loading storyboard…</div>
   <div class="row">
diff --git a/server/web/index.html b/server/web/index.html
index d2c0d79..cc73877 100644
--- a/server/web/index.html
+++ b/server/web/index.html
@@ -1,555 +1,60 @@
 <!doctype html>
 <!--
-  free-splatter.cpp web app.
-  Drop photos -> POST /api/reconstruct (Go server, purego -> Vulkan engine) ->
-  fetch the .splat -> render in WebGL2 with depth-sorted alpha blending.
-  The .splat is importance-sorted, so the "build-up" reveal slider/animation
-  shows the scene assembling from its most important gaussians first — the honest
-  analogue of a "step count" for a feed-forward (one-pass) reconstruction.
-  EWA splat shaders follow 3DGS (Kerbl et al.) / antimatter15's WebGL formulation.
+  free-splatter.cpp — landing menu. One server, one port: this page links to the
+  demo pages (each its own route on this same server). Add a card here when you add
+  a page.
 -->
 <html lang="en">
 <head>
 <meta charset="utf-8"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>free-splatter.cpp — photos to gaussian splat</title>
+<title>free-splatter.cpp — demos</title>
 <style>
   :root { color-scheme: dark; }
   * { margin: 0; box-sizing: border-box; }
-  html, body { height: 100%; background: #0b0d10; overflow: hidden; font-family: ui-sans-serif, system-ui, sans-serif; color: #e8eaed; }
-  #gl { position: fixed; inset: 0; width: 100%; height: 100%; display: block; }
-
-  .card { background: rgba(16,19,24,.86); backdrop-filter: blur(10px);
-    border: 1px solid rgba(255,255,255,.08); border-radius: 14px; box-shadow: 0 10px 40px rgba(0,0,0,.5); }
-
-  /* upload overlay (initial + "new photos") */
-  #drop { position: fixed; inset: 0; z-index: 20; display: flex; align-items: center; justify-content: center; }
-  #drop .card { width: min(440px, 92vw); padding: 26px 26px 22px; text-align: center; }
-  #drop h1 { font-size: 18px; font-weight: 680; letter-spacing: .2px; }
-  #model { width: 100%; margin-top: 12px; padding: 7px 9px; font-size: 12.5px; color: #e8eaed;
-    background: rgba(255,255,255,.06); border: 1px solid rgba(255,255,255,.12); border-radius: 8px; cursor: pointer; }
-  #model:hover { background: rgba(255,255,255,.1); }
-  #bgrow { display: flex; align-items: center; gap: 6px; font-size: 12px; color: #b9c0cc; margin-top: 12px; cursor: pointer; }
-  #bgrow input { accent-color: #6f9bff; }
-  #bgrow .bgnote { color: #8b93a1; font-size: 11px; }
-  #drop .sub { color: #8b93a1; font-size: 12.5px; margin-top: 4px; line-height: 1.5; }
-  #zone { margin-top: 18px; border: 1.5px dashed rgba(255,255,255,.18); border-radius: 12px;
-    padding: 26px 16px; cursor: pointer; transition: border-color .15s, background .15s; }
-  #zone.hot { border-color: #6f9bff; background: rgba(91,140,255,.08); }
-  #zone .big { font-size: 13.5px; }
-  #zone .small { color: #8b93a1; font-size: 11.5px; margin-top: 4px; }
-  #thumbs { display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin-top: 14px; }
-  #thumbs img { width: 62px; height: 62px; object-fit: cover; border-radius: 8px; border: 1px solid rgba(255,255,255,.12); }
-  .btn { margin-top: 16px; width: 100%; padding: 11px; font-size: 13.5px; font-weight: 640; color: #fff; cursor: pointer;
-    border: none; border-radius: 10px; background: linear-gradient(90deg,#5b8cff,#8a5bff); }
-  .btn:disabled { opacity: .45; cursor: not-allowed; }
-  .btn.ghost { background: rgba(255,255,255,.08); }
-  #drop .err { color: #ff8a8a; font-size: 12px; margin-top: 10px; min-height: 1em; }
-
-  /* viewer panel (after reconstruction) */
-  #panel { position: fixed; top: 16px; left: 16px; z-index: 10; width: 264px; padding: 14px 16px; font-size: 13px; line-height: 1.45; display: none; }
-  #panel h1 { font-size: 14px; font-weight: 650; }
-  #panel .sub { color: #8b93a1; font-size: 11.5px; margin-top: 2px; }
-  .stat { display: flex; justify-content: space-between; font-variant-numeric: tabular-nums; margin-top: 3px; }
-  .stat span:first-child { color: #8b93a1; }
-  #status { margin-top: 9px; color: #9fd3a0; font-size: 11.5px; min-height: 1.2em; }
-  #status.err { color: #ff8a8a; }
-  #autores { display: flex; align-items: center; gap: 6px; font-size: 11.5px; color: #8b93a1; margin-top: 7px; cursor: pointer; }
-  #autores input { accent-color: #6f9bff; }
-  .rev { margin-top: 12px; border-top: 1px solid rgba(255,255,255,.08); padding-top: 11px; }
-  .rev .lab { display: flex; justify-content: space-between; font-size: 11.5px; color: #8b93a1; }
-  #reveal { width: 100%; margin: 7px 0 4px; accent-color: #6f9bff; }
-  .row { display: flex; gap: 8px; margin-top: 8px; }
-  .row .btn { margin-top: 0; padding: 8px; font-size: 12px; }
-  #hint { position: fixed; bottom: 14px; left: 50%; transform: translateX(-50%); z-index: 10;
-    color: #6b7280; font-size: 11.5px; background: rgba(16,19,24,.7); padding: 6px 12px; border-radius: 8px; display: none; }
+  body { min-height: 100vh; background: radial-gradient(1200px 620px at 50% -8%, #171b24, #0b0d10);
+    color: #e8eaed; font-family: ui-sans-serif, system-ui, sans-serif;
+    display: flex; flex-direction: column; align-items: center; justify-content: center;
+    padding: 48px 20px; gap: 30px; }
+  .head { text-align: center; }
+  .head h1 { font-size: 27px; font-weight: 680; letter-spacing: .2px; }
+  .head p { color: #8b93a1; font-size: 14px; margin-top: 7px; }
+  .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(244px, 1fr));
+    gap: 18px; width: 100%; max-width: 880px; }
+  a.card { display: block; text-decoration: none; color: inherit; background: rgba(16,19,24,.72);
+    border: 1px solid rgba(255,255,255,.08); border-radius: 14px; padding: 22px;
+    box-shadow: 0 8px 30px rgba(0,0,0,.35);
+    transition: transform .15s ease, border-color .15s ease, background .15s ease; }
+  a.card:hover { transform: translateY(-3px); border-color: rgba(122,162,255,.6); background: rgba(20,24,32,.92); }
+  a.card .ico { font-size: 27px; line-height: 1; }
+  a.card h2 { font-size: 16px; font-weight: 640; margin-top: 12px; }
+  a.card p { color: #8b93a1; font-size: 12.5px; line-height: 1.5; margin-top: 7px; }
+  .foot { color: #5b6473; font-size: 11.5px; }
+  .foot a { color: #6b7787; }
 </style>
 </head>
 <body>
-<canvas id="gl"></canvas>
-
-<div id="drop">
-  <div class="card">
-    <h1>Photos → 3D gaussian splat</h1>
-    <select id="model" title="reconstruction model"></select>
-    <div class="sub" id="sub">Drop photos of the same scene from different angles — they must overlap. One feed-forward pass on the GPU (Vulkan).</div>
-    <div class="sub" style="margin-top:6px"><a href="/accumulate.html" style="color:#7aa2ff;text-decoration:none">▸ accumulating scenes &amp; build one from a video →</a></div>
-    <div id="zone">
-      <div class="big">📷 Drag photos here, or click to choose</div>
-      <div class="small" id="zone-hint">JPG / PNG · overlapping views</div>
-    </div>
-    <label id="bgrow" style="display:none"><input type="checkbox" id="bgremove" checked/> remove background <span class="bgnote">(recommended; ~2s/photo)</span></label>
-    <div id="thumbs"></div>
-    <button id="go" class="btn" disabled>Reconstruct</button>
-    <div id="err" class="err"></div>
-    <input id="file" type="file" accept="image/*" multiple hidden/>
+  <div class="head">
+    <h1>free-splatter.cpp</h1>
+    <p>FreeSplatter, in C++/GGML on the GPU — pick a demo.</p>
   </div>
-</div>
-
-<div id="panel" class="card">
-  <h1>free-splatter.cpp</h1>
-  <div class="sub">3D gaussian scene from your photos.</div>
-  <div class="stat"><span>views</span><span id="s-views">—</span></div>
-  <div class="stat"><span>splats</span><span id="s-splats">0</span></div>
-  <div class="stat"><span>reconstruct</span><span id="s-time">—</span></div>
-  <div class="stat"><span>fps</span><span id="s-fps">—</span></div>
-  <div class="stat"><span>render</span><span id="s-render">100%</span></div>
-  <label id="autores"><input type="checkbox" id="autores-cb" checked/> auto quality (smooth navigation)</label>
-  <label id="vibrow" style="display:flex;align-items:center;gap:8px;font-size:11.5px;color:#8b93a1;margin-top:7px;">
-    <span>vibrance</span><input id="vibrance" type="range" min="0.6" max="2.2" step="0.05" value="1.45" style="flex:1;accent-color:#6f9bff"/>
-  </label>
-  <div id="status">…</div>
-  <div class="rev">
-    <div class="lab"><span>build-up</span><span id="s-reveal">100%</span></div>
-    <input id="reveal" type="range" min="0" max="1000" value="1000"/>
-    <div class="row">
-      <button id="replay" class="btn ghost">▶ replay</button>
-      <button id="again" class="btn ghost">＋ new photos</button>
-    </div>
+  <div class="grid">
+    <a class="card" href="/reconstruct.html">
+      <div class="ico">📷</div>
+      <h2>Photos → 3D splat</h2>
+      <p>Drop a few overlapping photos of a scene; one feed-forward GPU pass returns a 3D gaussian splat you can orbit.</p>
+    </a>
+    <a class="card" href="/accumulate.html">
+      <div class="ico">🌱</div>
+      <h2>Accumulating scenes</h2>
+      <p>Watch one world grow as photos are folded in — switch between scenes, or upload a video and it builds the scene for you.</p>
+    </a>
+    <a class="card" href="/demo.html">
+      <div class="ico">🎬</div>
+      <h2>Storyboard</h2>
+      <p>A guided tour through reconstructions — also the offline, deterministic video-render harness.</p>
+    </a>
   </div>
-</div>
-
-<div id="hint">drag to orbit · scroll to zoom · WASD/arrows to move</div>
-
-<script type="module">
-const ROW = 32;                       // bytes per splat
-const $ = (id) => document.getElementById(id);
-const P = new URLSearchParams(location.search);   // ?id=&yaw=&pitch=&dist=&reveal=
-const setStatus = (t, err=false) => { const s=$("status"); s.textContent=t; s.classList.toggle("err",err); };
-
-const canvas = $("gl");
-const gl = canvas.getContext("webgl2", { antialias: false, premultipliedAlpha: true, alpha: false });
-if (!gl) { $("err").textContent = "WebGL2 not available in this browser."; throw new Error("no webgl2"); }
-
-// ---------- shaders (EWA splatting) ----------
-const VERT = `#version 300 es
-precision highp float; precision highp int;
-uniform highp usampler2D u_tex;
-uniform mat4 u_proj, u_view;
-uniform vec2 u_focal, u_viewport;
-in vec2 a_corner;      // quad corner in [-2,2]
-in uint a_index;       // sorted splat index
-out vec4 v_color;
-out vec2 v_pos;
-void main() {
-  uint t = a_index * 2u;
-  ivec2 wh = textureSize(u_tex, 0);
-  uvec4 A = texelFetch(u_tex, ivec2(int(t % uint(wh.x)), int(t / uint(wh.x))), 0);
-  uvec4 B = texelFetch(u_tex, ivec2(int((t+1u) % uint(wh.x)), int((t+1u) / uint(wh.x))), 0);
-  vec3 center = uintBitsToFloat(A.xyz);
-  vec3 scale  = vec3(uintBitsToFloat(A.w), uintBitsToFloat(B.x), uintBitsToFloat(B.y));
-  uint rgba = B.z, rotp = B.w;
-  v_color = vec4(float(rgba & 0xffu), float((rgba>>8)&0xffu), float((rgba>>16)&0xffu), float((rgba>>24)&0xffu)) / 255.0;
-  vec4 q = (vec4(float(rotp&0xffu), float((rotp>>8)&0xffu), float((rotp>>16)&0xffu), float((rotp>>24)&0xffu)) - 128.0) / 128.0;
-  q = normalize(q);                    // (w,x,y,z)
-  float w=q.x, x=q.y, y=q.z, z=q.w;
-  mat3 R = mat3(
-    1.-2.*(y*y+z*z), 2.*(x*y+w*z),   2.*(x*z-w*y),
-    2.*(x*y-w*z),    1.-2.*(x*x+z*z),2.*(y*z+w*x),
-    2.*(x*z+w*y),    2.*(y*z-w*x),   1.-2.*(x*x+y*y));
-  mat3 S = mat3(scale.x,0.,0., 0.,scale.y,0., 0.,0.,scale.z);
-  mat3 M = R * S;
-  mat3 cov3d = M * transpose(M);
-
-  vec4 cam = u_view * vec4(center, 1.0);
-  if (cam.z >= -0.2) { gl_Position = vec4(0.0,0.0,2.0,1.0); return; }
-  vec4 clip = u_proj * cam;
-  if (clip.w <= 0.0 || abs(clip.x) > 1.3*clip.w || abs(clip.y) > 1.3*clip.w) {
-    gl_Position = vec4(0.0,0.0,2.0,1.0); return;
-  }
-  mat3 J = mat3(
-    u_focal.x/cam.z, 0.0, -(u_focal.x*cam.x)/(cam.z*cam.z),
-    0.0, -u_focal.y/cam.z, (u_focal.y*cam.y)/(cam.z*cam.z),
-    0.0, 0.0, 0.0);
-  mat3 W = transpose(mat3(u_view));
-  mat3 T = W * J;
-  mat3 cov2 = transpose(T) * cov3d * T;
-  float a = cov2[0][0] + 0.3, b = cov2[0][1], c = cov2[1][1] + 0.3;
-  float det = a*c - b*b;
-  if (det <= 0.0) { gl_Position = vec4(0.0,0.0,2.0,1.0); return; }
-  float mid = 0.5*(a+c);
-  float rad = sqrt(max(0.0, mid*mid - det));
-  float l1 = mid + rad;
-  float l2 = max(mid - rad, 0.1);
-  vec2 d = vec2(b, l1 - a);
-  vec2 e1 = (dot(d,d) > 1e-9) ? normalize(d) : vec2(1.0, 0.0);
-  float MAXPX = 0.20 * min(u_viewport.x, u_viewport.y);
-  vec2 v1 = min(sqrt(2.0*l1), MAXPX) * e1;
-  vec2 v2 = min(sqrt(2.0*l2), MAXPX) * vec2(e1.y, -e1.x);
-
-  vec2 ndc = clip.xy / clip.w;
-  vec2 offset = (a_corner.x*v1 + a_corner.y*v2) / u_viewport * 2.0;
-  v_pos = a_corner;
-  gl_Position = vec4(ndc + offset, clip.z/clip.w, 1.0);
-}`;
-
-const FRAG = `#version 300 es
-precision highp float;
-uniform float u_sat;     // saturation/vibrance boost (1.0 = as-decoded)
-in vec4 v_color; in vec2 v_pos;
-out vec4 o;
-void main() {
-  float p = -dot(v_pos, v_pos);
-  if (p < -4.0) discard;
-  float alpha = exp(p) * v_color.a;
-  // SH-DC-only color is flat/desaturated; push each splat's color away from its
-  // luma before the premultiplied blend (the blend itself also greys overlaps).
-  vec3 rgb = v_color.rgb;
-  float luma = dot(rgb, vec3(0.2126, 0.7152, 0.0722));
-  rgb = clamp(mix(vec3(luma), rgb, u_sat), 0.0, 1.0);
-  o = vec4(rgb * alpha, alpha);
-}`;
-
-function shader(type, src){ const s=gl.createShader(type); gl.shaderSource(s,src); gl.compileShader(s);
-  if(!gl.getShaderParameter(s,gl.COMPILE_STATUS)) throw new Error(gl.getShaderInfoLog(s)); return s; }
-const prog = gl.createProgram();
-gl.attachShader(prog, shader(gl.VERTEX_SHADER, VERT));
-gl.attachShader(prog, shader(gl.FRAGMENT_SHADER, FRAG));
-gl.linkProgram(prog);
-if(!gl.getProgramParameter(prog, gl.LINK_STATUS)) throw new Error(gl.getProgramInfoLog(prog));
-gl.useProgram(prog);
-
-const quad = new Float32Array([-2,-2, 2,-2, 2,2, -2,-2, 2,2, -2,2]);
-const quadBuf = gl.createBuffer(); gl.bindBuffer(gl.ARRAY_BUFFER, quadBuf); gl.bufferData(gl.ARRAY_BUFFER, quad, gl.STATIC_DRAW);
-const aCorner = gl.getAttribLocation(prog, "a_corner");
-const idxBuf = gl.createBuffer();
-const aIndex = gl.getAttribLocation(prog, "a_index");
-const tex = gl.createTexture();
-const TEX_W = 2048;
-const U = {
-  proj: gl.getUniformLocation(prog,"u_proj"), view: gl.getUniformLocation(prog,"u_view"),
-  focal: gl.getUniformLocation(prog,"u_focal"), viewport: gl.getUniformLocation(prog,"u_viewport"),
-  sat: gl.getUniformLocation(prog,"u_sat"),
-};
-gl.uniform1i(gl.getUniformLocation(prog,"u_tex"), 0);
-gl.disable(gl.DEPTH_TEST);
-gl.enable(gl.BLEND);
-gl.blendFuncSeparate(gl.ONE, gl.ONE_MINUS_SRC_ALPHA, gl.ONE, gl.ONE_MINUS_SRC_ALPHA);
-
-// ---------- splat store ----------
-let count = 0;                         // total splats ingested
-let revealCount = 0;                   // top-K (importance-sorted) actually shown
-let followFull = true;                 // reveal tracks count as data streams in
-// default render ceiling: show the top 300k for smooth navigation even when the
-// splat holds up to ~1M; the slider still reaches the full count. ?maxsplats= overrides.
-const VIEW_CAP = parseInt(P.get("maxsplats") ?? "300000", 10);
-// ease-in reveal: front-loaded importance-sorted splats fill fast then stall, so
-// reveal count slowly at first to even out the visible build-up. ?buildcurve= tunes.
-const BUILD_CURVE = parseFloat(P.get("buildcurve") ?? "3");
-let posSent = false;                   // positions uploaded to the sort worker?
-let positions = new Float32Array(0);   // xyz per splat (for sorting)
-let texData = new Uint32Array(0);      // 8 uint32 per splat
-
-function ensureCapacity(n) {
-  if (n*8 <= texData.length) return;
-  const cap = Math.max(n, Math.ceil(texData.length/8*1.5), 65536);
-  const td = new Uint32Array(cap*8); td.set(texData); texData = td;
-  const ps = new Float32Array(cap*3); ps.set(positions); positions = ps;
-}
-function uploadTexture() {
-  const rows = Math.ceil((count*2)/TEX_W);
-  const buf = new Uint32Array(TEX_W*rows*4);
-  buf.set(texData.subarray(0, count*8));
-  gl.activeTexture(gl.TEXTURE0); gl.bindTexture(gl.TEXTURE_2D, tex);
-  gl.texImage2D(gl.TEXTURE_2D, 0, gl.RGBA32UI, TEX_W, rows, 0, gl.RGBA_INTEGER, gl.UNSIGNED_INT, buf);
-  gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MIN_FILTER, gl.NEAREST);
-  gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MAG_FILTER, gl.NEAREST);
-}
-function ingest(bytes) {
-  const n = Math.floor(bytes.byteLength / ROW);
-  ensureCapacity(n);
-  const f32 = new Float32Array(bytes.buffer, bytes.byteOffset, n*8);
-  const u32 = new Uint32Array(bytes.buffer, bytes.byteOffset, n*8);
-  texData.set(u32.subarray(0, n*8), 0);
-  for (let i=0;i<n;i++){ positions[i*3]=f32[i*8]; positions[i*3+1]=f32[i*8+1]; positions[i*3+2]=f32[i*8+2]; }
-  count = n;
-  posSent = false;                     // new positions -> resend to the sort worker
-  if (followFull) revealCount = Math.min(count, VIEW_CAP);
-  uploadTexture();
-  $("s-splats").textContent = count.toLocaleString();
-  syncRevealUI();
-  if (count > 5000) frameToData();
-}
-
-// ---------- reveal ----------
-function setReveal(k) {
-  revealCount = Math.max(0, Math.min(count, k|0));
-  followFull = revealCount >= count;
-  sortDirty = true;
-  syncRevealUI();
-}
-function syncRevealUI() {
-  const r = $("reveal");
-  r.max = Math.max(1, count);
-  r.value = revealCount;
-  const pct = count ? Math.round(revealCount/count*100) : 0;
-  $("s-reveal").textContent = revealCount.toLocaleString() + " · " + pct + "%";
-}
-$("reveal").addEventListener("input", e => setReveal(+e.target.value));
-
-let buildupRAF = 0;
-function playBuildup(ms=3500) {
-  if (count === 0) return;
-  cancelAnimationFrame(buildupRAF);
-  const cap = Math.min(count, VIEW_CAP);         // build up to the render ceiling, not the full 1M
-  const t0 = performance.now(), start = Math.max(1, Math.floor(cap*0.012));
-  const step = (now) => {
-    const u = Math.min(1, (now - t0) / ms);
-    const e = Math.pow(u, BUILD_CURVE);          // ease-in (even visible build-up)
-    setReveal(Math.floor(start + (cap-start)*e));
-    if (u < 1) buildupRAF = requestAnimationFrame(step); else setReveal(cap);
-  };
-  buildupRAF = requestAnimationFrame(step);
-}
-$("replay").addEventListener("click", () => playBuildup());
-
-// ---------- depth-sort worker ----------
-const worker = new Worker(URL.createObjectURL(new Blob([`
-let pos=null, n=0;
-onmessage = (e) => {
-  if (e.data.pos) { pos = e.data.pos; n = e.data.n; return; }
-  if (!pos) return;
-  const view = e.data.view, N = Math.min(n, e.data.n);
-  const d = new Float32Array(N); let dmin=Infinity, dmax=-Infinity;
-  for (let i=0;i<N;i++){
-    const z = view[2]*pos[i*3] + view[6]*pos[i*3+1] + view[10]*pos[i*3+2];
-    d[i]=z; if(z<dmin)dmin=z; if(z>dmax)dmax=z;
-  }
-  const B=65536, cnt=new Uint32Array(B);
-  const scale = dmax>dmin ? (B-1)/(dmax-dmin) : 0;
-  const key=new Uint16Array(N);
-  for(let i=0;i<N;i++){ let k=((d[i]-dmin)*scale)|0; k=k<0?0:k>65535?65535:k; key[i]=k; cnt[k]++; }
-  for(let i=1;i<B;i++) cnt[i]+=cnt[i-1];
-  const order=new Uint32Array(N);
-  for(let i=N-1;i>=0;i--) order[--cnt[key[i]]]=i;
-  postMessage({order, n:N}, [order.buffer]);
-};
-`], {type:"text/javascript"})));
-
-let sortedIndex = new Uint32Array(0), sortedCount = 0, sorting = false, sortDirty = true;
-worker.onmessage = (e) => {
-  sortedIndex = e.data.order; sortedCount = e.data.n; sorting = false;
-  gl.bindBuffer(gl.ARRAY_BUFFER, idxBuf);
-  gl.bufferData(gl.ARRAY_BUFFER, sortedIndex, gl.DYNAMIC_DRAW);
-};
-function requestSort(view) {
-  if (sorting || revealCount === 0) return;
-  sorting = true;
-  // positions change only on ingest; resend them then, not every camera move
-  // (the worker keeps the full set and sorts the first revealCount of them).
-  if (!posSent) { worker.postMessage({ pos: positions.slice(0, count*3), n: count }); posSent = true; }
-  worker.postMessage({ view, n: revealCount });
-}
-
-// ---------- camera ----------
-// yaw/pitch/dist overridable via query (shareable views + headless screenshots).
-let target = [0,0,0], framed = false;
-let yaw   = parseFloat(P.get("yaw")   ?? "0.0");
-let pitch = parseFloat(P.get("pitch") ?? "0.05");
-let dist  = parseFloat(P.get("dist")  ?? "2.0");
-function frameToData() {
-  if (framed || count === 0) return;
-  let lo = [1e30,1e30,1e30], hi = [-1e30,-1e30,-1e30];
-  for (let i=0;i<count;i++) for (let k=0;k<3;k++){ const v=positions[i*3+k]; if(v<lo[k])lo[k]=v; if(v>hi[k])hi[k]=v; }
-  const diag = Math.hypot(hi[0]-lo[0], hi[1]-lo[1], hi[2]-lo[2]);
-  target = [(lo[0]+hi[0])/2, (lo[1]+hi[1])/2, (lo[2]+hi[2])/2];
-  if (!P.has("dist")) dist = Math.max(0.5, 0.7*diag);
-  framed = true; sortDirty = true;
-}
-const keys = {};
-addEventListener("keydown", e => { if (drop.style.display==="none") keys[e.key.toLowerCase()] = true; });
-addEventListener("keyup",   e => keys[e.key.toLowerCase()] = false);
-let drag=false, lx=0, ly=0;
-canvas.addEventListener("pointerdown", e => { drag=true; lx=e.clientX; ly=e.clientY; canvas.setPointerCapture(e.pointerId); });
-canvas.addEventListener("pointerup",   e => drag=false);
-canvas.addEventListener("pointermove", e => { if(!drag)return; yaw -= (e.clientX-lx)*0.005; pitch -= (e.clientY-ly)*0.005;
-  pitch = Math.max(-1.5, Math.min(1.5, pitch)); lx=e.clientX; ly=e.clientY; sortDirty=true; });
-canvas.addEventListener("wheel", e => { dist *= Math.exp(e.deltaY*0.001); dist=Math.max(0.2,Math.min(10,dist)); sortDirty=true; e.preventDefault(); }, {passive:false});
-
-function mat4Mul(a,b){ const o=new Float32Array(16); for(let i=0;i<4;i++)for(let j=0;j<4;j++){let s=0;for(let k=0;k<4;k++)s+=a[k*4+j]*b[i*4+k];o[i*4+j]=s;} return o; }
-function perspective(fovy, aspect, n, f){ const t=1/Math.tan(fovy/2); return new Float32Array([t/aspect,0,0,0, 0,t,0,0, 0,0,(f+n)/(n-f),-1, 0,0,(2*f*n)/(n-f),0]); }
-function lookAt(eye, c){ const z=norm(sub(eye,c)), x=norm(cross([0,1,0],z)), y=cross(z,x);
-  return new Float32Array([x[0],y[0],z[0],0, x[1],y[1],z[1],0, x[2],y[2],z[2],0, -dot(x,eye),-dot(y,eye),-dot(z,eye),1]); }
-const sub=(a,b)=>[a[0]-b[0],a[1]-b[1],a[2]-b[2]], dot=(a,b)=>a[0]*b[0]+a[1]*b[1]+a[2]*b[2];
-const cross=(a,b)=>[a[1]*b[2]-a[2]*b[1],a[2]*b[0]-a[0]*b[2],a[0]*b[1]-a[1]*b[0]];
-const norm=(a)=>{const l=Math.hypot(...a)||1;return [a[0]/l,a[1]/l,a[2]/l];};
-
-// adaptive resolution: overdraw (not geometry) is the bottleneck when you move
-// into the cloud, so trade internal render scale for framerate and restore it
-// when FPS recovers. The CSS size stays full; only the GL backing shrinks.
-let renderScale = 1.0, autoRes = true;
-// vibrance (saturation) — cosmetic, in the demo layer only; ?sat= overrides so
-// headless/video renders match what you dialed in.
-let saturation = parseFloat(P.get("sat") ?? "1.45");
-{ const vib = $("vibrance"); vib.value = saturation;
-  vib.addEventListener("input", e => { saturation = +e.target.value; }); }
-function resize(){ const dpr=Math.min(devicePixelRatio,2)*renderScale;
-  canvas.width=Math.max(1,Math.round(innerWidth*dpr)); canvas.height=Math.max(1,Math.round(innerHeight*dpr));
-  gl.viewport(0,0,canvas.width,canvas.height); }
-addEventListener("resize", resize); resize();
-$("autores-cb").addEventListener("change", e => {
-  autoRes = e.target.checked;
-  if (!autoRes) { renderScale = 1.0; resize(); $("s-render").textContent = "100%"; }
-});
-
-let frames=0, fpsT=performance.now();
-function render(){
-  const speed = dist*0.02;
-  const fwd=[Math.sin(yaw)*Math.cos(pitch),Math.sin(pitch),Math.cos(yaw)*Math.cos(pitch)];
-  const right=norm(cross([0,1,0],fwd));
-  if(keys["w"]||keys["arrowup"]) { for(let i=0;i<3;i++) target[i]-=fwd[i]*speed; sortDirty=true; }
-  if(keys["s"]||keys["arrowdown"]){ for(let i=0;i<3;i++) target[i]+=fwd[i]*speed; sortDirty=true; }
-  if(keys["a"]||keys["arrowleft"]){ for(let i=0;i<3;i++) target[i]-=right[i]*speed; sortDirty=true; }
-  if(keys["d"]||keys["arrowright"]){for(let i=0;i<3;i++) target[i]+=right[i]*speed; sortDirty=true; }
-
-  const eye=[target[0]+dist*Math.sin(yaw)*Math.cos(pitch), target[1]+dist*Math.sin(pitch), target[2]+dist*Math.cos(yaw)*Math.cos(pitch)];
-  const view=lookAt(eye,target);
-  const aspect=canvas.width/canvas.height, fovy=1.0;
-  const proj=perspective(fovy, aspect, 0.05, 100);
-  const fy=canvas.height/(2*Math.tan(fovy/2)), fx=fy;
-
-  if(sortDirty){ requestSort(view); sortDirty=false; }
-
-  gl.clearColor(0.043,0.051,0.063,1); gl.clear(gl.COLOR_BUFFER_BIT);
-  gl.useProgram(prog);
-  gl.uniformMatrix4fv(U.view,false,view); gl.uniformMatrix4fv(U.proj,false,proj);
-  gl.uniform2f(U.focal,fx,fy); gl.uniform2f(U.viewport,canvas.width,canvas.height);
-  gl.uniform1f(U.sat, saturation);
-  gl.bindBuffer(gl.ARRAY_BUFFER,quadBuf); gl.enableVertexAttribArray(aCorner); gl.vertexAttribPointer(aCorner,2,gl.FLOAT,false,0,0);
-  if(sortedCount>0){
-    gl.bindBuffer(gl.ARRAY_BUFFER,idxBuf); gl.enableVertexAttribArray(aIndex);
-    gl.vertexAttribIPointer(aIndex,1,gl.UNSIGNED_INT,0,0); gl.vertexAttribDivisor(aIndex,1);
-    gl.drawArraysInstanced(gl.TRIANGLES,0,6,sortedCount);
-  }
-  frames++; const now=performance.now();
-  if(now-fpsT>500){
-    const fps=Math.round(frames*1000/(now-fpsT)); frames=0; fpsT=now;
-    $("s-fps").textContent=fps;
-    if(autoRes){                       // hold ~30fps by trading resolution
-      if(fps<26 && renderScale>0.35){ renderScale=Math.max(0.35,renderScale-0.12); resize(); }
-      else if(fps>52 && renderScale<1.0){ renderScale=Math.min(1.0,renderScale+0.12); resize(); }
-    }
-    $("s-render").textContent=Math.round(renderScale*100)+"%";
-  }
-  requestAnimationFrame(render);
-}
-requestAnimationFrame(render);
-
-// ---------- splat load (progressive) ----------
-async function loadSplat(url){
-  framed = false; followFull = true; count = 0; revealCount = 0; posSent = false;
-  const resp = await fetch(url);
-  if(!resp.ok) throw new Error("HTTP "+resp.status+" for splat");
-  const total = +resp.headers.get("content-length") || 0;
-  const reader = resp.body.getReader();
-  let chunks=[], received=0, lastIngest=0;
-  while(true){
-    const {done,value}=await reader.read();
-    if(done) break;
-    chunks.push(value); received+=value.byteLength;
-    if(received-lastIngest > 4096*ROW || !total){
-      const all=new Uint8Array(received); let o=0; for(const c of chunks){all.set(c,o); o+=c.byteLength;}
-      ingest(all); sortDirty=true; lastIngest=received;
-    }
-  }
-  const all=new Uint8Array(received); let o=0; for(const c of chunks){all.set(c,o); o+=c.byteLength;}
-  ingest(all); sortDirty=true;
-}
-
-// ---------- upload UI ----------
-const drop = $("drop");
-let selected = [];
-function showThumbs() {
-  const t = $("thumbs"); t.innerHTML = "";
-  for (const f of selected) { const im = document.createElement("img"); im.src = URL.createObjectURL(f); t.appendChild(im); }
-  $("go").disabled = selected.length === 0;
-}
-function pick(files) {
-  selected = [...files].filter(f => f.type.startsWith("image/")).slice(0, 8);
-  $("err").textContent = "";
-  showThumbs();
-}
-// model selector: populate from /api/models; update the hint per model. Hidden
-// when only one model is loaded.
-let MODELS = [], BG_AVAILABLE = false;
-(async () => {
-  const sel = $("model");
-  try {
-    const data = await (await fetch("/api/models")).json();
-    MODELS = data.models || []; BG_AVAILABLE = !!data.bg_remove;
-  } catch { MODELS = []; }
-  if (MODELS.length <= 1) sel.style.display = "none";
-  for (const m of MODELS) {
-    const o = document.createElement("option");
-    o.value = m.name; o.textContent = `${m.label} · ${m.views} views`;
-    sel.appendChild(o);
-  }
-  const applyHint = () => {
-    const m = MODELS.find(x => x.name === sel.value) || MODELS[0];
-    if (!m) return;
-    $("sub").textContent = m.hint;
-    $("zone-hint").textContent = `JPG / PNG · best with ${m.views} overlapping views`;
-    // background removal only helps the object path (isolate one object on white)
-    $("bgrow").style.display = (BG_AVAILABLE && sel.value === "object") ? "flex" : "none";
-  };
-  sel.addEventListener("change", applyHint);
-  applyHint();
-})();
-
-$("zone").addEventListener("click", () => $("file").click());
-$("file").addEventListener("change", e => pick(e.target.files));
-$("zone").addEventListener("dragover", e => { e.preventDefault(); $("zone").classList.add("hot"); });
-$("zone").addEventListener("dragleave", () => $("zone").classList.remove("hot"));
-$("zone").addEventListener("drop", e => { e.preventDefault(); $("zone").classList.remove("hot"); pick(e.dataTransfer.files); });
-addEventListener("dragover", e => e.preventDefault());
-addEventListener("drop", e => e.preventDefault());
-
-$("again").addEventListener("click", () => { drop.style.display = "flex"; $("panel").style.display = "none"; $("hint").style.display = "none"; });
-
-// deep-link: ?id=<splat> skips the uploader (shareable result + headless render).
-(async () => {
-  const id = P.get("id");
-  if (!id) return;
-  drop.style.display = "none"; $("panel").style.display = "block"; $("hint").style.display = "block";
-  try {
-    await loadSplat("/api/splat/" + id);
-    const rv = P.get("reveal");
-    if (rv != null) setReveal(Math.round(parseFloat(rv) * count)); else playBuildup();
-    setStatus(`ready · ${count.toLocaleString()} splats`);
-    $("s-splats").textContent = count.toLocaleString();
-  } catch (e) { setStatus(String(e.message || e), true); }
-})();
-
-$("go").addEventListener("click", async () => {
-  if (selected.length === 0) return;
-  $("go").disabled = true; $("err").textContent = "";
-  $("go").textContent = "Uploading…";
-  try {
-    const removeBg = $("bgremove").checked && $("bgrow").style.display !== "none";
-    const fd = new FormData();
-    for (const f of selected) fd.append("images", f);
-    fd.append("model", $("model").value || "");
-    if (removeBg) fd.append("remove_bg", "1");
-    const t0 = performance.now();
-    $("go").textContent = removeBg ? "Removing background + reconstructing…" : "Reconstructing… (Vulkan)";
-    const resp = await fetch("/api/reconstruct", { method: "POST", body: fd });
-    if (!resp.ok) throw new Error(await resp.text());
-    const meta = await resp.json();
-    // reveal the viewer
-    drop.style.display = "none"; $("panel").style.display = "block"; $("hint").style.display = "block";
-    $("s-views").textContent = meta.n_views;
-    $("s-time").textContent = meta.seconds.toFixed(2) + "s";
-    setStatus(`reconstructed ${meta.n_splats.toLocaleString()} gaussians`);
-    await loadSplat("/api/splat/" + meta.id);
-    setStatus(`ready · ${count.toLocaleString()} splats`);
-    playBuildup();
-  } catch (err) {
-    $("err").textContent = String(err.message || err);
-    console.error(err);
-  } finally {
-    $("go").disabled = selected.length === 0; $("go").textContent = "Reconstruct";
-  }
-});
-</script>
+  <div class="foot">one server · Vulkan engine · WebGL render</div>
 </body>
 </html>
diff --git a/server/web/reconstruct.html b/server/web/reconstruct.html
new file mode 100644
index 0000000..9ab2a73
--- /dev/null
+++ b/server/web/reconstruct.html
@@ -0,0 +1,556 @@
+<!doctype html>
+<!--
+  free-splatter.cpp web app.
+  Drop photos -> POST /api/reconstruct (Go server, purego -> Vulkan engine) ->
+  fetch the .splat -> render in WebGL2 with depth-sorted alpha blending.
+  The .splat is importance-sorted, so the "build-up" reveal slider/animation
+  shows the scene assembling from its most important gaussians first — the honest
+  analogue of a "step count" for a feed-forward (one-pass) reconstruction.
+  EWA splat shaders follow 3DGS (Kerbl et al.) / antimatter15's WebGL formulation.
+-->
+<html lang="en">
+<head>
+<meta charset="utf-8"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>free-splatter.cpp — photos to gaussian splat</title>
+<style>
+  :root { color-scheme: dark; }
+  * { margin: 0; box-sizing: border-box; }
+  html, body { height: 100%; background: #0b0d10; overflow: hidden; font-family: ui-sans-serif, system-ui, sans-serif; color: #e8eaed; }
+  #gl { position: fixed; inset: 0; width: 100%; height: 100%; display: block; }
+
+  .card { background: rgba(16,19,24,.86); backdrop-filter: blur(10px);
+    border: 1px solid rgba(255,255,255,.08); border-radius: 14px; box-shadow: 0 10px 40px rgba(0,0,0,.5); }
+
+  /* upload overlay (initial + "new photos") */
+  #drop { position: fixed; inset: 0; z-index: 20; display: flex; align-items: center; justify-content: center; }
+  #drop .card { width: min(440px, 92vw); padding: 26px 26px 22px; text-align: center; }
+  #drop h1 { font-size: 18px; font-weight: 680; letter-spacing: .2px; }
+  #model { width: 100%; margin-top: 12px; padding: 7px 9px; font-size: 12.5px; color: #e8eaed;
+    background: rgba(255,255,255,.06); border: 1px solid rgba(255,255,255,.12); border-radius: 8px; cursor: pointer; }
+  #model:hover { background: rgba(255,255,255,.1); }
+  #bgrow { display: flex; align-items: center; gap: 6px; font-size: 12px; color: #b9c0cc; margin-top: 12px; cursor: pointer; }
+  #bgrow input { accent-color: #6f9bff; }
+  #bgrow .bgnote { color: #8b93a1; font-size: 11px; }
+  #drop .sub { color: #8b93a1; font-size: 12.5px; margin-top: 4px; line-height: 1.5; }
+  #zone { margin-top: 18px; border: 1.5px dashed rgba(255,255,255,.18); border-radius: 12px;
+    padding: 26px 16px; cursor: pointer; transition: border-color .15s, background .15s; }
+  #zone.hot { border-color: #6f9bff; background: rgba(91,140,255,.08); }
+  #zone .big { font-size: 13.5px; }
+  #zone .small { color: #8b93a1; font-size: 11.5px; margin-top: 4px; }
+  #thumbs { display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin-top: 14px; }
+  #thumbs img { width: 62px; height: 62px; object-fit: cover; border-radius: 8px; border: 1px solid rgba(255,255,255,.12); }
+  .btn { margin-top: 16px; width: 100%; padding: 11px; font-size: 13.5px; font-weight: 640; color: #fff; cursor: pointer;
+    border: none; border-radius: 10px; background: linear-gradient(90deg,#5b8cff,#8a5bff); }
+  .btn:disabled { opacity: .45; cursor: not-allowed; }
+  .btn.ghost { background: rgba(255,255,255,.08); }
+  #drop .err { color: #ff8a8a; font-size: 12px; margin-top: 10px; min-height: 1em; }
+
+  /* viewer panel (after reconstruction) */
+  #panel { position: fixed; top: 16px; left: 16px; z-index: 10; width: 264px; padding: 14px 16px; font-size: 13px; line-height: 1.45; display: none; }
+  #panel h1 { font-size: 14px; font-weight: 650; }
+  #panel .sub { color: #8b93a1; font-size: 11.5px; margin-top: 2px; }
+  .stat { display: flex; justify-content: space-between; font-variant-numeric: tabular-nums; margin-top: 3px; }
+  .stat span:first-child { color: #8b93a1; }
+  #status { margin-top: 9px; color: #9fd3a0; font-size: 11.5px; min-height: 1.2em; }
+  #status.err { color: #ff8a8a; }
+  #autores { display: flex; align-items: center; gap: 6px; font-size: 11.5px; color: #8b93a1; margin-top: 7px; cursor: pointer; }
+  #autores input { accent-color: #6f9bff; }
+  .rev { margin-top: 12px; border-top: 1px solid rgba(255,255,255,.08); padding-top: 11px; }
+  .rev .lab { display: flex; justify-content: space-between; font-size: 11.5px; color: #8b93a1; }
+  #reveal { width: 100%; margin: 7px 0 4px; accent-color: #6f9bff; }
+  .row { display: flex; gap: 8px; margin-top: 8px; }
+  .row .btn { margin-top: 0; padding: 8px; font-size: 12px; }
+  #hint { position: fixed; bottom: 14px; left: 50%; transform: translateX(-50%); z-index: 10;
+    color: #6b7280; font-size: 11.5px; background: rgba(16,19,24,.7); padding: 6px 12px; border-radius: 8px; display: none; }
+</style>
+</head>
+<body>
+<canvas id="gl"></canvas>
+
+<div id="drop">
+  <div class="card">
+    <a href="/" style="display:inline-block;color:#7aa2ff;text-decoration:none;font-size:11.5px;margin-bottom:8px">‹ all demos</a>
+    <h1>Photos → 3D gaussian splat</h1>
+    <select id="model" title="reconstruction model"></select>
+    <div class="sub" id="sub">Drop photos of the same scene from different angles — they must overlap. One feed-forward pass on the GPU (Vulkan).</div>
+    <div id="zone">
+      <div class="big">📷 Drag photos here, or click to choose</div>
+      <div class="small" id="zone-hint">JPG / PNG · overlapping views</div>
+    </div>
+    <label id="bgrow" style="display:none"><input type="checkbox" id="bgremove" checked/> remove background <span class="bgnote">(recommended; ~2s/photo)</span></label>
+    <div id="thumbs"></div>
+    <button id="go" class="btn" disabled>Reconstruct</button>
+    <div id="err" class="err"></div>
+    <input id="file" type="file" accept="image/*" multiple hidden/>
+  </div>
+</div>
+
+<div id="panel" class="card">
+  <a href="/" style="display:inline-block;color:#7aa2ff;text-decoration:none;font-size:11.5px;margin-bottom:6px">‹ all demos</a>
+  <h1>free-splatter.cpp</h1>
+  <div class="sub">3D gaussian scene from your photos.</div>
+  <div class="stat"><span>views</span><span id="s-views">—</span></div>
+  <div class="stat"><span>splats</span><span id="s-splats">0</span></div>
+  <div class="stat"><span>reconstruct</span><span id="s-time">—</span></div>
+  <div class="stat"><span>fps</span><span id="s-fps">—</span></div>
+  <div class="stat"><span>render</span><span id="s-render">100%</span></div>
+  <label id="autores"><input type="checkbox" id="autores-cb" checked/> auto quality (smooth navigation)</label>
+  <label id="vibrow" style="display:flex;align-items:center;gap:8px;font-size:11.5px;color:#8b93a1;margin-top:7px;">
+    <span>vibrance</span><input id="vibrance" type="range" min="0.6" max="2.2" step="0.05" value="1.45" style="flex:1;accent-color:#6f9bff"/>
+  </label>
+  <div id="status">…</div>
+  <div class="rev">
+    <div class="lab"><span>build-up</span><span id="s-reveal">100%</span></div>
+    <input id="reveal" type="range" min="0" max="1000" value="1000"/>
+    <div class="row">
+      <button id="replay" class="btn ghost">▶ replay</button>
+      <button id="again" class="btn ghost">＋ new photos</button>
+    </div>
+  </div>
+</div>
+
+<div id="hint">drag to orbit · scroll to zoom · WASD/arrows to move</div>
+
+<script type="module">
+const ROW = 32;                       // bytes per splat
+const $ = (id) => document.getElementById(id);
+const P = new URLSearchParams(location.search);   // ?id=&yaw=&pitch=&dist=&reveal=
+const setStatus = (t, err=false) => { const s=$("status"); s.textContent=t; s.classList.toggle("err",err); };
+
+const canvas = $("gl");
+const gl = canvas.getContext("webgl2", { antialias: false, premultipliedAlpha: true, alpha: false });
+if (!gl) { $("err").textContent = "WebGL2 not available in this browser."; throw new Error("no webgl2"); }
+
+// ---------- shaders (EWA splatting) ----------
+const VERT = `#version 300 es
+precision highp float; precision highp int;
+uniform highp usampler2D u_tex;
+uniform mat4 u_proj, u_view;
+uniform vec2 u_focal, u_viewport;
+in vec2 a_corner;      // quad corner in [-2,2]
+in uint a_index;       // sorted splat index
+out vec4 v_color;
+out vec2 v_pos;
+void main() {
+  uint t = a_index * 2u;
+  ivec2 wh = textureSize(u_tex, 0);
+  uvec4 A = texelFetch(u_tex, ivec2(int(t % uint(wh.x)), int(t / uint(wh.x))), 0);
+  uvec4 B = texelFetch(u_tex, ivec2(int((t+1u) % uint(wh.x)), int((t+1u) / uint(wh.x))), 0);
+  vec3 center = uintBitsToFloat(A.xyz);
+  vec3 scale  = vec3(uintBitsToFloat(A.w), uintBitsToFloat(B.x), uintBitsToFloat(B.y));
+  uint rgba = B.z, rotp = B.w;
+  v_color = vec4(float(rgba & 0xffu), float((rgba>>8)&0xffu), float((rgba>>16)&0xffu), float((rgba>>24)&0xffu)) / 255.0;
+  vec4 q = (vec4(float(rotp&0xffu), float((rotp>>8)&0xffu), float((rotp>>16)&0xffu), float((rotp>>24)&0xffu)) - 128.0) / 128.0;
+  q = normalize(q);                    // (w,x,y,z)
+  float w=q.x, x=q.y, y=q.z, z=q.w;
+  mat3 R = mat3(
+    1.-2.*(y*y+z*z), 2.*(x*y+w*z),   2.*(x*z-w*y),
+    2.*(x*y-w*z),    1.-2.*(x*x+z*z),2.*(y*z+w*x),
+    2.*(x*z+w*y),    2.*(y*z-w*x),   1.-2.*(x*x+y*y));
+  mat3 S = mat3(scale.x,0.,0., 0.,scale.y,0., 0.,0.,scale.z);
+  mat3 M = R * S;
+  mat3 cov3d = M * transpose(M);
+
+  vec4 cam = u_view * vec4(center, 1.0);
+  if (cam.z >= -0.2) { gl_Position = vec4(0.0,0.0,2.0,1.0); return; }
+  vec4 clip = u_proj * cam;
+  if (clip.w <= 0.0 || abs(clip.x) > 1.3*clip.w || abs(clip.y) > 1.3*clip.w) {
+    gl_Position = vec4(0.0,0.0,2.0,1.0); return;
+  }
+  mat3 J = mat3(
+    u_focal.x/cam.z, 0.0, -(u_focal.x*cam.x)/(cam.z*cam.z),
+    0.0, -u_focal.y/cam.z, (u_focal.y*cam.y)/(cam.z*cam.z),
+    0.0, 0.0, 0.0);
+  mat3 W = transpose(mat3(u_view));
+  mat3 T = W * J;
+  mat3 cov2 = transpose(T) * cov3d * T;
+  float a = cov2[0][0] + 0.3, b = cov2[0][1], c = cov2[1][1] + 0.3;
+  float det = a*c - b*b;
+  if (det <= 0.0) { gl_Position = vec4(0.0,0.0,2.0,1.0); return; }
+  float mid = 0.5*(a+c);
+  float rad = sqrt(max(0.0, mid*mid - det));
+  float l1 = mid + rad;
+  float l2 = max(mid - rad, 0.1);
+  vec2 d = vec2(b, l1 - a);
+  vec2 e1 = (dot(d,d) > 1e-9) ? normalize(d) : vec2(1.0, 0.0);
+  float MAXPX = 0.20 * min(u_viewport.x, u_viewport.y);
+  vec2 v1 = min(sqrt(2.0*l1), MAXPX) * e1;
+  vec2 v2 = min(sqrt(2.0*l2), MAXPX) * vec2(e1.y, -e1.x);
+
+  vec2 ndc = clip.xy / clip.w;
+  vec2 offset = (a_corner.x*v1 + a_corner.y*v2) / u_viewport * 2.0;
+  v_pos = a_corner;
+  gl_Position = vec4(ndc + offset, clip.z/clip.w, 1.0);
+}`;
+
+const FRAG = `#version 300 es
+precision highp float;
+uniform float u_sat;     // saturation/vibrance boost (1.0 = as-decoded)
+in vec4 v_color; in vec2 v_pos;
+out vec4 o;
+void main() {
+  float p = -dot(v_pos, v_pos);
+  if (p < -4.0) discard;
+  float alpha = exp(p) * v_color.a;
+  // SH-DC-only color is flat/desaturated; push each splat's color away from its
+  // luma before the premultiplied blend (the blend itself also greys overlaps).
+  vec3 rgb = v_color.rgb;
+  float luma = dot(rgb, vec3(0.2126, 0.7152, 0.0722));
+  rgb = clamp(mix(vec3(luma), rgb, u_sat), 0.0, 1.0);
+  o = vec4(rgb * alpha, alpha);
+}`;
+
+function shader(type, src){ const s=gl.createShader(type); gl.shaderSource(s,src); gl.compileShader(s);
+  if(!gl.getShaderParameter(s,gl.COMPILE_STATUS)) throw new Error(gl.getShaderInfoLog(s)); return s; }
+const prog = gl.createProgram();
+gl.attachShader(prog, shader(gl.VERTEX_SHADER, VERT));
+gl.attachShader(prog, shader(gl.FRAGMENT_SHADER, FRAG));
+gl.linkProgram(prog);
+if(!gl.getProgramParameter(prog, gl.LINK_STATUS)) throw new Error(gl.getProgramInfoLog(prog));
+gl.useProgram(prog);
+
+const quad = new Float32Array([-2,-2, 2,-2, 2,2, -2,-2, 2,2, -2,2]);
+const quadBuf = gl.createBuffer(); gl.bindBuffer(gl.ARRAY_BUFFER, quadBuf); gl.bufferData(gl.ARRAY_BUFFER, quad, gl.STATIC_DRAW);
+const aCorner = gl.getAttribLocation(prog, "a_corner");
+const idxBuf = gl.createBuffer();
+const aIndex = gl.getAttribLocation(prog, "a_index");
+const tex = gl.createTexture();
+const TEX_W = 2048;
+const U = {
+  proj: gl.getUniformLocation(prog,"u_proj"), view: gl.getUniformLocation(prog,"u_view"),
+  focal: gl.getUniformLocation(prog,"u_focal"), viewport: gl.getUniformLocation(prog,"u_viewport"),
+  sat: gl.getUniformLocation(prog,"u_sat"),
+};
+gl.uniform1i(gl.getUniformLocation(prog,"u_tex"), 0);
+gl.disable(gl.DEPTH_TEST);
+gl.enable(gl.BLEND);
+gl.blendFuncSeparate(gl.ONE, gl.ONE_MINUS_SRC_ALPHA, gl.ONE, gl.ONE_MINUS_SRC_ALPHA);
+
+// ---------- splat store ----------
+let count = 0;                         // total splats ingested
+let revealCount = 0;                   // top-K (importance-sorted) actually shown
+let followFull = true;                 // reveal tracks count as data streams in
+// default render ceiling: show the top 300k for smooth navigation even when the
+// splat holds up to ~1M; the slider still reaches the full count. ?maxsplats= overrides.
+const VIEW_CAP = parseInt(P.get("maxsplats") ?? "300000", 10);
+// ease-in reveal: front-loaded importance-sorted splats fill fast then stall, so
+// reveal count slowly at first to even out the visible build-up. ?buildcurve= tunes.
+const BUILD_CURVE = parseFloat(P.get("buildcurve") ?? "3");
+let posSent = false;                   // positions uploaded to the sort worker?
+let positions = new Float32Array(0);   // xyz per splat (for sorting)
+let texData = new Uint32Array(0);      // 8 uint32 per splat
+
+function ensureCapacity(n) {
+  if (n*8 <= texData.length) return;
+  const cap = Math.max(n, Math.ceil(texData.length/8*1.5), 65536);
+  const td = new Uint32Array(cap*8); td.set(texData); texData = td;
+  const ps = new Float32Array(cap*3); ps.set(positions); positions = ps;
+}
+function uploadTexture() {
+  const rows = Math.ceil((count*2)/TEX_W);
+  const buf = new Uint32Array(TEX_W*rows*4);
+  buf.set(texData.subarray(0, count*8));
+  gl.activeTexture(gl.TEXTURE0); gl.bindTexture(gl.TEXTURE_2D, tex);
+  gl.texImage2D(gl.TEXTURE_2D, 0, gl.RGBA32UI, TEX_W, rows, 0, gl.RGBA_INTEGER, gl.UNSIGNED_INT, buf);
+  gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MIN_FILTER, gl.NEAREST);
+  gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MAG_FILTER, gl.NEAREST);
+}
+function ingest(bytes) {
+  const n = Math.floor(bytes.byteLength / ROW);
+  ensureCapacity(n);
+  const f32 = new Float32Array(bytes.buffer, bytes.byteOffset, n*8);
+  const u32 = new Uint32Array(bytes.buffer, bytes.byteOffset, n*8);
+  texData.set(u32.subarray(0, n*8), 0);
+  for (let i=0;i<n;i++){ positions[i*3]=f32[i*8]; positions[i*3+1]=f32[i*8+1]; positions[i*3+2]=f32[i*8+2]; }
+  count = n;
+  posSent = false;                     // new positions -> resend to the sort worker
+  if (followFull) revealCount = Math.min(count, VIEW_CAP);
+  uploadTexture();
+  $("s-splats").textContent = count.toLocaleString();
+  syncRevealUI();
+  if (count > 5000) frameToData();
+}
+
+// ---------- reveal ----------
+function setReveal(k) {
+  revealCount = Math.max(0, Math.min(count, k|0));
+  followFull = revealCount >= count;
+  sortDirty = true;
+  syncRevealUI();
+}
+function syncRevealUI() {
+  const r = $("reveal");
+  r.max = Math.max(1, count);
+  r.value = revealCount;
+  const pct = count ? Math.round(revealCount/count*100) : 0;
+  $("s-reveal").textContent = revealCount.toLocaleString() + " · " + pct + "%";
+}
+$("reveal").addEventListener("input", e => setReveal(+e.target.value));
+
+let buildupRAF = 0;
+function playBuildup(ms=3500) {
+  if (count === 0) return;
+  cancelAnimationFrame(buildupRAF);
+  const cap = Math.min(count, VIEW_CAP);         // build up to the render ceiling, not the full 1M
+  const t0 = performance.now(), start = Math.max(1, Math.floor(cap*0.012));
+  const step = (now) => {
+    const u = Math.min(1, (now - t0) / ms);
+    const e = Math.pow(u, BUILD_CURVE);          // ease-in (even visible build-up)
+    setReveal(Math.floor(start + (cap-start)*e));
+    if (u < 1) buildupRAF = requestAnimationFrame(step); else setReveal(cap);
+  };
+  buildupRAF = requestAnimationFrame(step);
+}
+$("replay").addEventListener("click", () => playBuildup());
+
+// ---------- depth-sort worker ----------
+const worker = new Worker(URL.createObjectURL(new Blob([`
+let pos=null, n=0;
+onmessage = (e) => {
+  if (e.data.pos) { pos = e.data.pos; n = e.data.n; return; }
+  if (!pos) return;
+  const view = e.data.view, N = Math.min(n, e.data.n);
+  const d = new Float32Array(N); let dmin=Infinity, dmax=-Infinity;
+  for (let i=0;i<N;i++){
+    const z = view[2]*pos[i*3] + view[6]*pos[i*3+1] + view[10]*pos[i*3+2];
+    d[i]=z; if(z<dmin)dmin=z; if(z>dmax)dmax=z;
+  }
+  const B=65536, cnt=new Uint32Array(B);
+  const scale = dmax>dmin ? (B-1)/(dmax-dmin) : 0;
+  const key=new Uint16Array(N);
+  for(let i=0;i<N;i++){ let k=((d[i]-dmin)*scale)|0; k=k<0?0:k>65535?65535:k; key[i]=k; cnt[k]++; }
+  for(let i=1;i<B;i++) cnt[i]+=cnt[i-1];
+  const order=new Uint32Array(N);
+  for(let i=N-1;i>=0;i--) order[--cnt[key[i]]]=i;
+  postMessage({order, n:N}, [order.buffer]);
+};
+`], {type:"text/javascript"})));
+
+let sortedIndex = new Uint32Array(0), sortedCount = 0, sorting = false, sortDirty = true;
+worker.onmessage = (e) => {
+  sortedIndex = e.data.order; sortedCount = e.data.n; sorting = false;
+  gl.bindBuffer(gl.ARRAY_BUFFER, idxBuf);
+  gl.bufferData(gl.ARRAY_BUFFER, sortedIndex, gl.DYNAMIC_DRAW);
+};
+function requestSort(view) {
+  if (sorting || revealCount === 0) return;
+  sorting = true;
+  // positions change only on ingest; resend them then, not every camera move
+  // (the worker keeps the full set and sorts the first revealCount of them).
+  if (!posSent) { worker.postMessage({ pos: positions.slice(0, count*3), n: count }); posSent = true; }
+  worker.postMessage({ view, n: revealCount });
+}
+
+// ---------- camera ----------
+// yaw/pitch/dist overridable via query (shareable views + headless screenshots).
+let target = [0,0,0], framed = false;
+let yaw   = parseFloat(P.get("yaw")   ?? "0.0");
+let pitch = parseFloat(P.get("pitch") ?? "0.05");
+let dist  = parseFloat(P.get("dist")  ?? "2.0");
+function frameToData() {
+  if (framed || count === 0) return;
+  let lo = [1e30,1e30,1e30], hi = [-1e30,-1e30,-1e30];
+  for (let i=0;i<count;i++) for (let k=0;k<3;k++){ const v=positions[i*3+k]; if(v<lo[k])lo[k]=v; if(v>hi[k])hi[k]=v; }
+  const diag = Math.hypot(hi[0]-lo[0], hi[1]-lo[1], hi[2]-lo[2]);
+  target = [(lo[0]+hi[0])/2, (lo[1]+hi[1])/2, (lo[2]+hi[2])/2];
+  if (!P.has("dist")) dist = Math.max(0.5, 0.7*diag);
+  framed = true; sortDirty = true;
+}
+const keys = {};
+addEventListener("keydown", e => { if (drop.style.display==="none") keys[e.key.toLowerCase()] = true; });
+addEventListener("keyup",   e => keys[e.key.toLowerCase()] = false);
+let drag=false, lx=0, ly=0;
+canvas.addEventListener("pointerdown", e => { drag=true; lx=e.clientX; ly=e.clientY; canvas.setPointerCapture(e.pointerId); });
+canvas.addEventListener("pointerup",   e => drag=false);
+canvas.addEventListener("pointermove", e => { if(!drag)return; yaw -= (e.clientX-lx)*0.005; pitch -= (e.clientY-ly)*0.005;
+  pitch = Math.max(-1.5, Math.min(1.5, pitch)); lx=e.clientX; ly=e.clientY; sortDirty=true; });
+canvas.addEventListener("wheel", e => { dist *= Math.exp(e.deltaY*0.001); dist=Math.max(0.2,Math.min(10,dist)); sortDirty=true; e.preventDefault(); }, {passive:false});
+
+function mat4Mul(a,b){ const o=new Float32Array(16); for(let i=0;i<4;i++)for(let j=0;j<4;j++){let s=0;for(let k=0;k<4;k++)s+=a[k*4+j]*b[i*4+k];o[i*4+j]=s;} return o; }
+function perspective(fovy, aspect, n, f){ const t=1/Math.tan(fovy/2); return new Float32Array([t/aspect,0,0,0, 0,t,0,0, 0,0,(f+n)/(n-f),-1, 0,0,(2*f*n)/(n-f),0]); }
+function lookAt(eye, c){ const z=norm(sub(eye,c)), x=norm(cross([0,1,0],z)), y=cross(z,x);
+  return new Float32Array([x[0],y[0],z[0],0, x[1],y[1],z[1],0, x[2],y[2],z[2],0, -dot(x,eye),-dot(y,eye),-dot(z,eye),1]); }
+const sub=(a,b)=>[a[0]-b[0],a[1]-b[1],a[2]-b[2]], dot=(a,b)=>a[0]*b[0]+a[1]*b[1]+a[2]*b[2];
+const cross=(a,b)=>[a[1]*b[2]-a[2]*b[1],a[2]*b[0]-a[0]*b[2],a[0]*b[1]-a[1]*b[0]];
+const norm=(a)=>{const l=Math.hypot(...a)||1;return [a[0]/l,a[1]/l,a[2]/l];};
+
+// adaptive resolution: overdraw (not geometry) is the bottleneck when you move
+// into the cloud, so trade internal render scale for framerate and restore it
+// when FPS recovers. The CSS size stays full; only the GL backing shrinks.
+let renderScale = 1.0, autoRes = true;
+// vibrance (saturation) — cosmetic, in the demo layer only; ?sat= overrides so
+// headless/video renders match what you dialed in.
+let saturation = parseFloat(P.get("sat") ?? "1.45");
+{ const vib = $("vibrance"); vib.value = saturation;
+  vib.addEventListener("input", e => { saturation = +e.target.value; }); }
+function resize(){ const dpr=Math.min(devicePixelRatio,2)*renderScale;
+  canvas.width=Math.max(1,Math.round(innerWidth*dpr)); canvas.height=Math.max(1,Math.round(innerHeight*dpr));
+  gl.viewport(0,0,canvas.width,canvas.height); }
+addEventListener("resize", resize); resize();
+$("autores-cb").addEventListener("change", e => {
+  autoRes = e.target.checked;
+  if (!autoRes) { renderScale = 1.0; resize(); $("s-render").textContent = "100%"; }
+});
+
+let frames=0, fpsT=performance.now();
+function render(){
+  const speed = dist*0.02;
+  const fwd=[Math.sin(yaw)*Math.cos(pitch),Math.sin(pitch),Math.cos(yaw)*Math.cos(pitch)];
+  const right=norm(cross([0,1,0],fwd));
+  if(keys["w"]||keys["arrowup"]) { for(let i=0;i<3;i++) target[i]-=fwd[i]*speed; sortDirty=true; }
+  if(keys["s"]||keys["arrowdown"]){ for(let i=0;i<3;i++) target[i]+=fwd[i]*speed; sortDirty=true; }
+  if(keys["a"]||keys["arrowleft"]){ for(let i=0;i<3;i++) target[i]-=right[i]*speed; sortDirty=true; }
+  if(keys["d"]||keys["arrowright"]){for(let i=0;i<3;i++) target[i]+=right[i]*speed; sortDirty=true; }
+
+  const eye=[target[0]+dist*Math.sin(yaw)*Math.cos(pitch), target[1]+dist*Math.sin(pitch), target[2]+dist*Math.cos(yaw)*Math.cos(pitch)];
+  const view=lookAt(eye,target);
+  const aspect=canvas.width/canvas.height, fovy=1.0;
+  const proj=perspective(fovy, aspect, 0.05, 100);
+  const fy=canvas.height/(2*Math.tan(fovy/2)), fx=fy;
+
+  if(sortDirty){ requestSort(view); sortDirty=false; }
+
+  gl.clearColor(0.043,0.051,0.063,1); gl.clear(gl.COLOR_BUFFER_BIT);
+  gl.useProgram(prog);
+  gl.uniformMatrix4fv(U.view,false,view); gl.uniformMatrix4fv(U.proj,false,proj);
+  gl.uniform2f(U.focal,fx,fy); gl.uniform2f(U.viewport,canvas.width,canvas.height);
+  gl.uniform1f(U.sat, saturation);
+  gl.bindBuffer(gl.ARRAY_BUFFER,quadBuf); gl.enableVertexAttribArray(aCorner); gl.vertexAttribPointer(aCorner,2,gl.FLOAT,false,0,0);
+  if(sortedCount>0){
+    gl.bindBuffer(gl.ARRAY_BUFFER,idxBuf); gl.enableVertexAttribArray(aIndex);
+    gl.vertexAttribIPointer(aIndex,1,gl.UNSIGNED_INT,0,0); gl.vertexAttribDivisor(aIndex,1);
+    gl.drawArraysInstanced(gl.TRIANGLES,0,6,sortedCount);
+  }
+  frames++; const now=performance.now();
+  if(now-fpsT>500){
+    const fps=Math.round(frames*1000/(now-fpsT)); frames=0; fpsT=now;
+    $("s-fps").textContent=fps;
+    if(autoRes){                       // hold ~30fps by trading resolution
+      if(fps<26 && renderScale>0.35){ renderScale=Math.max(0.35,renderScale-0.12); resize(); }
+      else if(fps>52 && renderScale<1.0){ renderScale=Math.min(1.0,renderScale+0.12); resize(); }
+    }
+    $("s-render").textContent=Math.round(renderScale*100)+"%";
+  }
+  requestAnimationFrame(render);
+}
+requestAnimationFrame(render);
+
+// ---------- splat load (progressive) ----------
+async function loadSplat(url){
+  framed = false; followFull = true; count = 0; revealCount = 0; posSent = false;
+  const resp = await fetch(url);
+  if(!resp.ok) throw new Error("HTTP "+resp.status+" for splat");
+  const total = +resp.headers.get("content-length") || 0;
+  const reader = resp.body.getReader();
+  let chunks=[], received=0, lastIngest=0;
+  while(true){
+    const {done,value}=await reader.read();
+    if(done) break;
+    chunks.push(value); received+=value.byteLength;
+    if(received-lastIngest > 4096*ROW || !total){
+      const all=new Uint8Array(received); let o=0; for(const c of chunks){all.set(c,o); o+=c.byteLength;}
+      ingest(all); sortDirty=true; lastIngest=received;
+    }
+  }
+  const all=new Uint8Array(received); let o=0; for(const c of chunks){all.set(c,o); o+=c.byteLength;}
+  ingest(all); sortDirty=true;
+}
+
+// ---------- upload UI ----------
+const drop = $("drop");
+let selected = [];
+function showThumbs() {
+  const t = $("thumbs"); t.innerHTML = "";
+  for (const f of selected) { const im = document.createElement("img"); im.src = URL.createObjectURL(f); t.appendChild(im); }
+  $("go").disabled = selected.length === 0;
+}
+function pick(files) {
+  selected = [...files].filter(f => f.type.startsWith("image/")).slice(0, 8);
+  $("err").textContent = "";
+  showThumbs();
+}
+// model selector: populate from /api/models; update the hint per model. Hidden
+// when only one model is loaded.
+let MODELS = [], BG_AVAILABLE = false;
+(async () => {
+  const sel = $("model");
+  try {
+    const data = await (await fetch("/api/models")).json();
+    MODELS = data.models || []; BG_AVAILABLE = !!data.bg_remove;
+  } catch { MODELS = []; }
+  if (MODELS.length <= 1) sel.style.display = "none";
+  for (const m of MODELS) {
+    const o = document.createElement("option");
+    o.value = m.name; o.textContent = `${m.label} · ${m.views} views`;
+    sel.appendChild(o);
+  }
+  const applyHint = () => {
+    const m = MODELS.find(x => x.name === sel.value) || MODELS[0];
+    if (!m) return;
+    $("sub").textContent = m.hint;
+    $("zone-hint").textContent = `JPG / PNG · best with ${m.views} overlapping views`;
+    // background removal only helps the object path (isolate one object on white)
+    $("bgrow").style.display = (BG_AVAILABLE && sel.value === "object") ? "flex" : "none";
+  };
+  sel.addEventListener("change", applyHint);
+  applyHint();
+})();
+
+$("zone").addEventListener("click", () => $("file").click());
+$("file").addEventListener("change", e => pick(e.target.files));
+$("zone").addEventListener("dragover", e => { e.preventDefault(); $("zone").classList.add("hot"); });
+$("zone").addEventListener("dragleave", () => $("zone").classList.remove("hot"));
+$("zone").addEventListener("drop", e => { e.preventDefault(); $("zone").classList.remove("hot"); pick(e.dataTransfer.files); });
+addEventListener("dragover", e => e.preventDefault());
+addEventListener("drop", e => e.preventDefault());
+
+$("again").addEventListener("click", () => { drop.style.display = "flex"; $("panel").style.display = "none"; $("hint").style.display = "none"; });
+
+// deep-link: ?id=<splat> skips the uploader (shareable result + headless render).
+(async () => {
+  const id = P.get("id");
+  if (!id) return;
+  drop.style.display = "none"; $("panel").style.display = "block"; $("hint").style.display = "block";
+  try {
+    await loadSplat("/api/splat/" + id);
+    const rv = P.get("reveal");
+    if (rv != null) setReveal(Math.round(parseFloat(rv) * count)); else playBuildup();
+    setStatus(`ready · ${count.toLocaleString()} splats`);
+    $("s-splats").textContent = count.toLocaleString();
+  } catch (e) { setStatus(String(e.message || e), true); }
+})();
+
+$("go").addEventListener("click", async () => {
+  if (selected.length === 0) return;
+  $("go").disabled = true; $("err").textContent = "";
+  $("go").textContent = "Uploading…";
+  try {
+    const removeBg = $("bgremove").checked && $("bgrow").style.display !== "none";
+    const fd = new FormData();
+    for (const f of selected) fd.append("images", f);
+    fd.append("model", $("model").value || "");
+    if (removeBg) fd.append("remove_bg", "1");
+    const t0 = performance.now();
+    $("go").textContent = removeBg ? "Removing background + reconstructing…" : "Reconstructing… (Vulkan)";
+    const resp = await fetch("/api/reconstruct", { method: "POST", body: fd });
+    if (!resp.ok) throw new Error(await resp.text());
+    const meta = await resp.json();
+    // reveal the viewer
+    drop.style.display = "none"; $("panel").style.display = "block"; $("hint").style.display = "block";
+    $("s-views").textContent = meta.n_views;
+    $("s-time").textContent = meta.seconds.toFixed(2) + "s";
+    setStatus(`reconstructed ${meta.n_splats.toLocaleString()} gaussians`);
+    await loadSplat("/api/splat/" + meta.id);
+    setStatus(`ready · ${count.toLocaleString()} splats`);
+    playBuildup();
+  } catch (err) {
+    $("err").textContent = String(err.message || err);
+    console.error(err);
+  } finally {
+    $("go").disabled = selected.length === 0; $("go").textContent = "Reconstruct";
+  }
+});
+</script>
+</body>
+</html>

From f045b948e8def8e664e276f98176042610f5f2f7 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe <io@richiejp.com>
Date: Mon, 29 Jun 2026 09:14:00 +0100
Subject: [PATCH 33/33] Tree-merge accumulation + vibrance slider for the
 accumulate demo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a batch hierarchical (balanced binary tree) alternative to the linear
Sim(3) accumulator, to cut the registration drift that smears the far end of a
long photo stream. Adjacent submaps of `block` frames overlap by `overlap`
frames and each merge fits one Sim(3) over all shared frames at once, so the
fit is over-determined and per-frame depth noise averages out — drift compounds
over ~log2(N) hops instead of N. block=2,overlap=1 is the plain overlap-by-one
tree.

- Engine (src/pose.{h,cpp}): tree_accumulate_overlap + consensus_fuse of an
  arbitrary frame-tagged cloud (best-frame mode de-ghosts the overlapping
  per-frame copies); shared fit_band_sim RANSAC registration primitive.
- C API: free_splatter_tree_overlap (full merge or staged side-by-side via
  block/overlap/max_levels/layout_spacing/per_node_cap) + free_splatter_fuse_cloud.
- CLI: --accumulate-tree (--tree-block/--tree-overlap, --fuse) and --tree-stages,
  which bakes one laid-out .splat per merge level + a consensus-fused final stage
  and a manifest the viewer steps through.
- Viewer (accumulate.html): vibrance/saturation slider (?sat= override), ported
  from reconstruct.html; reframes per stage for tree-stages manifests.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 include/free_splatter.h     |  32 ++++++
 server/web/accumulate.html  |  23 +++-
 src/free_splatter.cpp       |  62 +++++++++-
 src/pose.cpp                | 220 ++++++++++++++++++++++++++++++++++++
 src/pose.h                  |  40 +++++++
 tools/free_splatter-cli.cpp | 104 ++++++++++++++++-
 6 files changed, 477 insertions(+), 4 deletions(-)

diff --git a/include/free_splatter.h b/include/free_splatter.h
index d27c8f4..22ea889 100644
--- a/include/free_splatter.h
+++ b/include/free_splatter.h
@@ -171,6 +171,38 @@ FREE_SPLATTER_API int free_splatter_refine_cloud(
 FREE_SPLATTER_API int free_splatter_accumulator_refine(
     free_splatter_accumulator * acc, float voxel_frac, int32_t iters, float alpha);
 
+// Consensus-fuse an arbitrary point cloud (e.g. a tree root) using each point's
+// `frame` tag: keep voxels (size voxel_frac*extent) seen by >= k distinct frames,
+// emitting per `mode` (0 averaged, 1 kept, 2 best-frame — dense AND de-ghosted).
+// *out malloc'd (free with free_splatter_buf_free). Returns 0 on success.
+FREE_SPLATTER_API int free_splatter_fuse_cloud(
+    const free_splatter_point * pts, size_t n, float voxel_frac, int32_t k, int32_t mode,
+    free_splatter_point ** out, size_t * n_out);
+
+// Hierarchical (balanced binary tree) accumulation — batch alternative to the
+// linear accumulator. pairs[k] is one engine output [2*H*W*gc] (pair k = frames
+// k,k+1; pair k's view-1 and pair k+1's view-0 are the same frame). Adjacent
+// submaps of `block` frames overlap by `overlap` frames, and each merge fits its
+// Sim(3) over all shared frames at once (over-determined -> averages per-frame
+// noise) — so drift compounds over ~log2(N) hops, not N, and is spread evenly
+// instead of dumped on the last frame. block=2,overlap=1 is the plain
+// overlap-by-one (single shared boundary) tree.
+//
+// The remaining args are for visualising the merge side by side (pass max_levels=-1,
+// layout_spacing=0, per_node_cap=0, n_nodes=NULL for a plain full merge to the root):
+// max_levels stops after that many rounds (-1 = full), so 0 returns the independent
+// base submaps, 1 the first merge round, …; step 0..D to watch the tree collapse to
+// one scene. The remaining nodes are laid out side by side (layout_spacing: 0 = none,
+// <0 = auto, >0 = explicit units) and each capped to its most important
+// (opacity*radius) points (per_node_cap>0) so every scene renders at its own detail.
+// *n_nodes (optional) gets the remaining node count (1 once fully merged). *out
+// malloc'd as free_splatter_point[*n_out] (free with free_splatter_buf_free). 0 on success.
+FREE_SPLATTER_API int free_splatter_tree_overlap(
+    const float * const * pairs, int32_t n_pairs, int32_t gaussian_channels,
+    int32_t height, int32_t width, float opacity_threshold, int32_t block, int32_t overlap,
+    int32_t max_levels, float layout_spacing, int32_t per_node_cap,
+    free_splatter_point ** out, size_t * n_out, int32_t * n_nodes);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/server/web/accumulate.html b/server/web/accumulate.html
index 5b5365a..5b6692e 100644
--- a/server/web/accumulate.html
+++ b/server/web/accumulate.html
@@ -91,6 +91,9 @@ <h1>accumulating reconstruction</h1>
   <div class="stat"><span>fps</span><span id="s-fps">—</span></div>
   <div id="status">starting…</div>
   <div id="ctl"><button id="prev">‹ prev</button><button id="play">⏸ pause</button><button id="next">next ›</button></div>
+  <label id="vibrow" style="display:flex;align-items:center;gap:8px;font-size:11.5px;color:#8b93a1;margin-top:10px;">
+    <span>vibrance</span><input id="vibrance" type="range" min="0.6" max="2.2" step="0.05" value="1.45" style="flex:1;accent-color:#6f9bff"/>
+  </label>
   <div id="server-ui">
     <div class="srow"><label>scene</label><select id="scene-sel"></select></div>
     <div class="up">
@@ -184,13 +187,19 @@ <h1>accumulating reconstruction</h1>
 }`;
 const FRAG = `#version 300 es
 precision highp float;
+uniform float u_sat;     // saturation/vibrance boost (1.0 = as-decoded)
 in vec4 v_color; in vec2 v_pos;
 out vec4 o;
 void main() {
   float p = -dot(v_pos, v_pos);
   if (p < -4.0) discard;
   float alpha = exp(p) * v_color.a;
-  o = vec4(v_color.rgb * alpha, alpha);
+  // SH-DC-only color is flat/desaturated; push each splat's color away from its
+  // luma before the premultiplied blend (the blend itself also greys overlaps).
+  vec3 rgb = v_color.rgb;
+  float luma = dot(rgb, vec3(0.2126, 0.7152, 0.0722));
+  rgb = clamp(mix(vec3(luma), rgb, u_sat), 0.0, 1.0);
+  o = vec4(rgb * alpha, alpha);
 }`;
 function shader(type, src){ const s=gl.createShader(type); gl.shaderSource(s,src); gl.compileShader(s);
   if(!gl.getShaderParameter(s,gl.COMPILE_STATUS)) throw new Error(gl.getShaderInfoLog(s)); return s; }
@@ -212,6 +221,7 @@ <h1>accumulating reconstruction</h1>
 const U = {
   proj: gl.getUniformLocation(prog,"u_proj"), view: gl.getUniformLocation(prog,"u_view"),
   focal: gl.getUniformLocation(prog,"u_focal"), viewport: gl.getUniformLocation(prog,"u_viewport"),
+  sat: gl.getUniformLocation(prog,"u_sat"),
 };
 gl.uniform1i(gl.getUniformLocation(prog,"u_tex"), 0);
 gl.disable(gl.DEPTH_TEST);
@@ -312,6 +322,12 @@ <h1>accumulating reconstruction</h1>
 function resize(){ const dpr=Math.min(devicePixelRatio,2); canvas.width=innerWidth*dpr; canvas.height=innerHeight*dpr; gl.viewport(0,0,canvas.width,canvas.height); }
 addEventListener("resize", resize); resize();
 
+// vibrance (saturation) — cosmetic, demo-layer only; ?sat= overrides so headless/
+// video renders match what you dialed in. Same control as reconstruct.html.
+let saturation = parseFloat(_q.get("sat") ?? "1.45");
+{ const vib = $("vibrance"); vib.value = saturation;
+  vib.addEventListener("input", e => { saturation = +e.target.value; }); }
+
 let frames=0, fpsT=performance.now(), lastT=performance.now();
 function render(){
   const now=performance.now(), dt=(now-lastT)/1000; lastT=now;
@@ -326,6 +342,7 @@ <h1>accumulating reconstruction</h1>
   gl.useProgram(prog);
   gl.uniformMatrix4fv(U.view,false,view); gl.uniformMatrix4fv(U.proj,false,proj);
   gl.uniform2f(U.focal,fx,fy); gl.uniform2f(U.viewport,canvas.width,canvas.height);
+  gl.uniform1f(U.sat, saturation);
   gl.bindBuffer(gl.ARRAY_BUFFER,quadBuf); gl.enableVertexAttribArray(aCorner); gl.vertexAttribPointer(aCorner,2,gl.FLOAT,false,0,0);
   if(sortedCount>0){
     gl.bindBuffer(gl.ARRAY_BUFFER,idxBuf); gl.enableVertexAttribArray(aIndex);
@@ -363,7 +380,9 @@ <h1>accumulating reconstruction</h1>
     if(!resp.ok) throw new Error("HTTP "+resp.status+" for "+step.splat);
     const bytes = new Uint8Array(await resp.arrayBuffer());
     setSplat(bytes);
-    if(!framed) frameToData(0.9);                    // frame once; camera stays as it grows
+    // frame once (camera stays put as the cloud grows); but a tree-stages manifest
+    // sets "reframe":true so each stage (very different extents) refits the view.
+    if(!framed || (manifest && manifest.reframe)) frameToData(0.9);
     if (step.label) {
       banner.innerHTML = `<b>✓ ${step.label}</b> · ${count.toLocaleString()} splats`;
       setStatus(`${step.label} · ${count.toLocaleString()} splats`);
diff --git a/src/free_splatter.cpp b/src/free_splatter.cpp
index ab7462a..14b8595 100644
--- a/src/free_splatter.cpp
+++ b/src/free_splatter.cpp
@@ -5,6 +5,8 @@
 #include "options.h"
 #include "pose.h"
 
+#include <cmath>
+#include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <filesystem>
@@ -261,7 +263,20 @@ int free_splatter_accumulator_add_pair(free_splatter_accumulator * acc, const fl
                                        int32_t gaussian_channels) {
     if (!acc || !gaussians || gaussian_channels < 16) return -1;
     acc->gc = gaussian_channels;
-    acc->acc.add_pair(gaussians, gaussian_channels);
+    const free_splatter::pose::ChainLink link = acc->acc.add_pair(gaussians, gaussian_channels);
+    if (std::getenv("FREE_SPLATTER_DEBUG_LINKS")) {
+        // Surface the per-link registration diagnostics + the COMPOUNDED global drift
+        // (open-loop chain T_k = T_{k-1}.sim, so per-link error accumulates).
+        const auto & g = link.global;
+        const double tn = std::sqrt(g.t[0]*g.t[0] + g.t[1]*g.t[1] + g.t[2]*g.t[2]);
+        double c = (g.R(0,0) + g.R(1,1) + g.R(2,2) - 1.0) * 0.5;
+        c = c > 1.0 ? 1.0 : (c < -1.0 ? -1.0 : c);
+        std::fprintf(stderr,
+            "[link] pair=%d  local: s=%.4f inlier=%.3f valid=%.3f resid=%.4f | "
+            "global: s=%.4f rot=%.2fdeg |t|=%.4f\n",
+            acc->acc.pair_count(), link.sim.s, link.inlier_frac, link.valid_frac,
+            link.resid_frac, g.s, std::acos(c) * 57.29577951308232, tn);
+    }
     return 0;
 }
 
@@ -306,6 +321,51 @@ int free_splatter_accumulator_refine(free_splatter_accumulator * acc, float voxe
     return 0;
 }
 
+int free_splatter_fuse_cloud(const free_splatter_point * pts, size_t n, float voxel_frac,
+                             int32_t k, int32_t mode, free_splatter_point ** out, size_t * n_out) {
+    if (out) *out = nullptr;
+    if (n_out) *n_out = 0;
+    if (!pts || !out || !n_out || k < 1 || !(voxel_frac > 0)) return -1;
+    std::vector<free_splatter::pose::AccumPoint> cloud(n);
+    for (size_t i = 0; i < n; i++) {
+        const free_splatter_point & p = pts[i]; free_splatter::pose::AccumPoint & a = cloud[i];
+        a.x = p.x; a.y = p.y; a.z = p.z; a.r = p.r; a.g = p.g; a.b = p.b; a.opacity = p.opacity;
+        a.sx = p.sx; a.sy = p.sy; a.sz = p.sz; a.qw = p.qw; a.qx = p.qx; a.qy = p.qy; a.qz = p.qz;
+        a.frame = p.frame;
+    }
+    std::vector<free_splatter::pose::AccumPoint> fused;
+    free_splatter::pose::consensus_fuse(cloud, voxel_frac, k, fused, mode);
+    return emit_points(fused, out, n_out);
+}
+
+int free_splatter_tree_overlap(const float * const * pairs, int32_t n_pairs, int32_t gaussian_channels,
+                               int32_t height, int32_t width, float opacity_threshold,
+                               int32_t block, int32_t overlap,
+                               int32_t max_levels, float layout_spacing, int32_t per_node_cap,
+                               free_splatter_point ** out, size_t * n_out, int32_t * n_nodes) {
+    if (out) *out = nullptr;
+    if (n_out) *n_out = 0;
+    if (n_nodes) *n_nodes = 0;
+    if (!pairs || n_pairs < 1 || gaussian_channels < 16 || height < 1 || width < 1 || !out || !n_out)
+        return -1;
+    for (int32_t i = 0; i < n_pairs; i++) if (!pairs[i]) return -1;
+    std::vector<const float *> ps(pairs, pairs + n_pairs);
+    std::vector<free_splatter::pose::TreeMerge> merges;
+    int nn = 0;
+    std::vector<free_splatter::pose::AccumPoint> cloud =
+        free_splatter::pose::tree_accumulate_overlap(ps, height, width, gaussian_channels, opacity_threshold,
+                                                     block, overlap, 0.02, 300, 0, &merges,
+                                                     max_levels, layout_spacing, &nn, per_node_cap);
+    if (n_nodes) *n_nodes = nn;
+    if (std::getenv("FREE_SPLATTER_DEBUG_LINKS"))
+        for (const auto & m : merges)
+            std::fprintf(stderr,
+                "[tree] L%d range[%d..%d] sharedframes=%d  s=%.4f inlier=%.3f valid=%.3f resid=%.4f\n",
+                m.level, m.lo_frame, m.hi_frame, m.shared_frame,
+                m.scale, m.inlier_frac, m.valid_frac, m.resid_frac);
+    return emit_points(cloud, out, n_out);
+}
+
 int free_splatter_accumulator_camera_path(const free_splatter_accumulator * acc,
                                           float ** out, int32_t * n_frames) {
     if (out) *out = nullptr;
diff --git a/src/pose.cpp b/src/pose.cpp
index 15a15d7..7ba152e 100644
--- a/src/pose.cpp
+++ b/src/pose.cpp
@@ -1078,6 +1078,226 @@ std::vector<Mat4> Accumulator::camera_path() const {
     return path;
 }
 
+// ---- hierarchical tree accumulation ---------------------------------------
+
+// One gaussian (gc-channel engine layout) -> AccumPoint at identity (leaf frame).
+static AccumPoint tree_leaf_point(const float * a, int frame) {
+    AccumPoint p;
+    p.x = a[0]; p.y = a[1]; p.z = a[2];
+    p.r = clamp01((float) (a[3] * SH_C0 + 0.5));
+    p.g = clamp01((float) (a[4] * SH_C0 + 0.5));
+    p.b = clamp01((float) (a[5] * SH_C0 + 0.5));
+    p.opacity = a[15];
+    p.sx = a[16]; p.sy = a[17]; p.sz = a[18];
+    std::array<double,4> q = quat_normalize({ a[19], a[20], a[21], a[22] });
+    p.qw = (float) q[0]; p.qx = (float) q[1]; p.qy = (float) q[2]; p.qz = (float) q[3];
+    p.frame = frame;
+    return p;
+}
+
+// Apply a Sim(3) to a whole cloud (position s*(R@x)+t, scale *s, orientation qS⊗q).
+static void tree_apply_sim_cloud(std::vector<AccumPoint> & c, const Sim3 & S) {
+    const std::array<double,4> qS = mat3_to_quat(S.R);
+    for (auto & p : c) {
+        Vec3 w = sim_apply(S, { p.x, p.y, p.z });
+        p.x = (float) w[0]; p.y = (float) w[1]; p.z = (float) w[2];
+        p.sx = (float) (S.s * p.sx); p.sy = (float) (S.s * p.sy); p.sz = (float) (S.s * p.sz);
+        std::array<double,4> q = quat_normalize(quat_mul(qS, { p.qw, p.qx, p.qy, p.qz }));
+        p.qw = (float) q[0]; p.qx = (float) q[1]; p.qy = (float) q[2]; p.qz = (float) q[3];
+    }
+}
+static void tree_apply_sim_pts(std::vector<float> & pts, const Sim3 & S) {
+    for (size_t i = 0; i + 3 <= pts.size(); i += 3) {
+        Vec3 w = sim_apply(S, { pts[i], pts[i+1], pts[i+2] });
+        pts[i] = (float) w[0]; pts[i+1] = (float) w[1]; pts[i+2] = (float) w[2];
+    }
+}
+
+// Robustly register one band of shared-frame correspondences (XA -> XB, both flat
+// 3*n): RANSAC similarity at a scene-relative threshold, plus the inlier residual.
+// The single registration primitive for both tree merges (caller derives the
+// inlier/valid fractions, which it normalises differently per tree).
+struct BandFit { Sim3 S; int n_inl; double scene; double resid; };
+static BandFit fit_band_sim(const std::vector<double> & XA, const std::vector<double> & XB,
+                            double rthr, int riters, uint64_t seed) {
+    const int n = (int) (XA.size() / 3);
+    BandFit f{ sim_identity(), 0, 0.0, 0.0 };
+    if (n < 3) return f;
+    double mB[3]; mean3(XB.data(), n, mB);
+    std::vector<double> yc(3 * n);
+    for (int k = 0; k < n; k++) for (int c = 0; c < 3; c++) yc[3*k+c] = XB[3*k+c] - mB[c];
+    f.scene = rms3(yc.data(), n);
+    std::vector<char> inl;
+    f.S = fit_similarity_ransac(XA.data(), XB.data(), n, rthr * f.scene, riters, inl, true, seed);
+    std::vector<double> rA;
+    for (int k = 0; k < n; k++) if (inl[k]) {
+        f.n_inl++;
+        Vec3 sa = sim_apply(f.S, { XA[3*k], XA[3*k+1], XA[3*k+2] });
+        for (int c = 0; c < 3; c++) rA.push_back(sa[c] - XB[3*k+c]);
+    }
+    f.resid = (f.scene > 0 && f.n_inl > 0) ? rms3(rA.data(), f.n_inl) / f.scene : 0.0;
+    return f;
+}
+
+// ---- multi-frame-overlap tree ---------------------------------------------
+
+// A submap covering a contiguous global frame range, carrying the per-pixel point
+// arrays of every frame it spans (in the submap's local frame) so a neighbour can
+// align on the whole band of frames they share, not just one.
+struct ONode {
+    std::vector<AccumPoint> cloud;
+    int lo, hi;                              // inclusive global frame range
+    std::vector<std::vector<float>> fpts;    // [f-lo] -> P*3 per-pixel points
+    std::vector<std::vector<char>>  fmsk;    // [f-lo] -> P opacity mask
+};
+
+// One pair (frames a, a+1) as a 2-frame submap (identity local frame).
+static ONode opair(const float * g, int a, int P, int gc, double thr) {
+    ONode nd; nd.lo = a; nd.hi = a + 1;
+    nd.fpts.assign(2, {}); nd.fmsk.assign(2, {});
+    nd.fpts[0].resize(3*P); nd.fpts[1].resize(3*P);
+    nd.fmsk[0].assign(P, 0); nd.fmsk[1].assign(P, 0);
+    nd.cloud.reserve(2*P);
+    for (int i = 0; i < P; i++) {
+        const float * a0 = g + (size_t) i * gc;
+        const float * a1 = g + (size_t) (P + i) * gc;
+        for (int c = 0; c < 3; c++) { nd.fpts[0][3*i+c] = a0[c]; nd.fpts[1][3*i+c] = a1[c]; }
+        nd.fmsk[0][i] = (a0[15] > thr) ? 1 : 0;
+        nd.fmsk[1][i] = (a1[15] > thr) ? 1 : 0;
+        if (a0[15] > thr) nd.cloud.push_back(tree_leaf_point(a0, a));
+        if (a1[15] > thr) nd.cloud.push_back(tree_leaf_point(a1, a + 1));
+    }
+    return nd;
+}
+
+// Merge R into L's frame, fitting one Sim(3) over EVERY frame they share. Returns
+// the combined submap; optionally records the merge diagnostics.
+static ONode omerge(ONode & L, ONode & R, int P, double rthr, int riters, uint64_t seed,
+                    std::vector<TreeMerge> * merges, int level) {
+    const int so = std::max(L.lo, R.lo), sh = std::min(L.hi, R.hi);   // shared frame range
+    std::vector<double> XA, XB;
+    for (int f = so; f <= sh; f++) {
+        const std::vector<float> & ra = R.fpts[f - R.lo]; const std::vector<char> & rm = R.fmsk[f - R.lo];
+        const std::vector<float> & la = L.fpts[f - L.lo]; const std::vector<char> & lm = L.fmsk[f - L.lo];
+        for (int i = 0; i < P; i++) if (rm[i] && lm[i]) {
+            XA.push_back(ra[3*i]); XA.push_back(ra[3*i+1]); XA.push_back(ra[3*i+2]);
+            XB.push_back(la[3*i]); XB.push_back(la[3*i+1]); XB.push_back(la[3*i+2]);
+        }
+    }
+    const int n = (int) (XA.size() / 3);
+    const int shared = sh - so + 1;
+    BandFit fit = fit_band_sim(XA, XB, rthr, riters, seed);
+    const Sim3 & S = fit.S;
+    if (merges) merges->push_back(TreeMerge{ level, std::min(L.lo,R.lo), std::max(L.hi,R.hi), shared,
+        S.s, (n > 0) ? (double) fit.n_inl / n : 0.0, (double) n / ((double) shared * P), fit.resid });
+
+    tree_apply_sim_cloud(R.cloud, S);
+    for (auto & a : R.fpts) tree_apply_sim_pts(a, S);
+    ONode m; m.lo = std::min(L.lo, R.lo); m.hi = std::max(L.hi, R.hi);
+    const int F = m.hi - m.lo + 1;
+    m.fpts.assign(F, {}); m.fmsk.assign(F, {});
+    for (int f = L.lo; f <= L.hi; f++) { m.fpts[f-m.lo] = std::move(L.fpts[f-L.lo]); m.fmsk[f-m.lo] = std::move(L.fmsk[f-L.lo]); }
+    for (int f = R.lo; f <= R.hi; f++) if (m.fpts[f-m.lo].empty()) { m.fpts[f-m.lo] = std::move(R.fpts[f-R.lo]); m.fmsk[f-m.lo] = std::move(R.fmsk[f-R.lo]); }
+    m.cloud = std::move(L.cloud);
+    m.cloud.insert(m.cloud.end(), R.cloud.begin(), R.cloud.end());
+    return m;
+}
+
+std::vector<AccumPoint> tree_accumulate_overlap(const std::vector<const float *> & pairs,
+                                                int H, int W, int gc, double thr,
+                                                int block, int overlap,
+                                                double rthr, int riters, uint64_t seed,
+                                                std::vector<TreeMerge> * merges,
+                                                int max_levels, double layout_spacing,
+                                                int * n_nodes_out, int per_node_cap) {
+    const int P = H * W;
+    const int M = (int) pairs.size();
+    if (n_nodes_out) *n_nodes_out = 0;
+    if (M < 1) return {};
+    const int nframes = M + 1;
+    overlap = std::max(1, overlap);
+    block = std::max(block, overlap + 1);
+    block = std::min(block, nframes);
+    overlap = std::min(overlap, block - 1);
+    const int step = std::max(1, block - overlap);
+
+    // overlapping base submaps: each chains `block-1` consecutive pairs (single-frame
+    // internal links), adjacent submaps share `overlap` frames.
+    std::vector<ONode> nodes;
+    for (int s = 0; s < M; s += step) {
+        const int w = std::min(block, nframes - s);          // frames in this submap
+        if (w < 2) break;
+        ONode nd = opair(pairs[s], s, P, gc, thr);           // frames s, s+1
+        for (int p = s + 1; p <= s + w - 2; p++) {           // fold the rest of the block
+            ONode nx = opair(pairs[p], p, P, gc, thr);
+            nd = omerge(nd, nx, P, rthr, riters, seed, nullptr, -1);
+        }
+        nodes.push_back(std::move(nd));
+        if (s + block >= nframes) break;                     // last submap reached the end
+    }
+
+    // median base-submap extent — the auto layout pitch for side-by-side stages.
+    // Only the auto-layout (layout_spacing<0, staged demo) path reads it, so the
+    // full-merge callers skip these two passes over every submap.
+    double leaf_extent = 1.0;
+    if (layout_spacing < 0.0) {
+        std::vector<double> ex;
+        for (auto & nd : nodes) {
+            double m[3] = {0,0,0}; int64_t nf = 0;
+            for (auto & p : nd.cloud) if (std::isfinite(p.x) && std::isfinite(p.y) && std::isfinite(p.z)) {
+                m[0]+=p.x; m[1]+=p.y; m[2]+=p.z; nf++; }
+            if (nf == 0) continue;
+            for (int c = 0; c < 3; c++) m[c] /= nf;
+            double ss = 0;
+            for (auto & p : nd.cloud) if (std::isfinite(p.x) && std::isfinite(p.y) && std::isfinite(p.z)) {
+                double dx=p.x-m[0], dy=p.y-m[1], dz=p.z-m[2]; ss += dx*dx+dy*dy+dz*dz; }
+            ex.push_back(std::sqrt(ss / nf));
+        }
+        if (!ex.empty()) { std::sort(ex.begin(), ex.end()); leaf_extent = ex[ex.size()/2]; }
+    }
+
+    // balanced tree-merge of submaps; each merge fits over the overlap band. Stop
+    // after max_levels rounds (-1 = fully to the root) for the staged demo.
+    int level = 0;
+    while (nodes.size() > 1 && (max_levels < 0 || level < max_levels)) {
+        std::vector<ONode> nxt;
+        nxt.reserve(nodes.size()/2 + 1);
+        for (size_t i = 0; i + 1 < nodes.size(); i += 2)
+            nxt.push_back(omerge(nodes[i], nodes[i+1], P, rthr, riters, seed, merges, level));
+        if (nodes.size() % 2 == 1) nxt.push_back(std::move(nodes.back()));
+        nodes = std::move(nxt); level++;
+    }
+    if (n_nodes_out) *n_nodes_out = (int) nodes.size();
+    if (nodes.empty()) return {};
+
+    // concatenate remaining nodes, laid out side by side (layout_spacing != 0) and
+    // capped per node (opacity*radius) — same as tree_accumulate.
+    const double sigma = (layout_spacing < 0.0) ? 4.0 * leaf_extent : layout_spacing;
+    const double gcentre = 0.5 * M;
+    auto importance = [](const AccumPoint & p) -> double {
+        const double vol = (double) std::max(p.sx,1e-9f) * std::max(p.sy,1e-9f) * std::max(p.sz,1e-9f);
+        return (double) std::max(p.opacity, 0.0f) * std::cbrt(vol);
+    };
+    std::vector<AccumPoint> out;
+    size_t total = 0;
+    for (auto & nd : nodes)
+        total += (per_node_cap > 0) ? std::min((size_t) per_node_cap, nd.cloud.size()) : nd.cloud.size();
+    out.reserve(total);
+    for (auto & nd : nodes) {
+        const double dx = (sigma != 0.0) ? (0.5 * (nd.lo + nd.hi) - gcentre) * sigma : 0.0;
+        if (per_node_cap > 0 && (int) nd.cloud.size() > per_node_cap) {  // keep each scene's own detail
+            std::vector<size_t> idx(nd.cloud.size());
+            for (size_t i = 0; i < idx.size(); i++) idx[i] = i;
+            std::partial_sort(idx.begin(), idx.begin() + per_node_cap, idx.end(),
+                [&](size_t a, size_t b){ return importance(nd.cloud[a]) > importance(nd.cloud[b]); });
+            for (int i = 0; i < per_node_cap; i++) { AccumPoint p = nd.cloud[idx[i]]; p.x = (float) (p.x + dx); out.push_back(p); }
+        } else {
+            for (const auto & src : nd.cloud) { AccumPoint p = src; p.x = (float) (p.x + dx); out.push_back(p); }
+        }
+    }
+    return out;
+}
+
 // ---- consensus fusion -----------------------------------------------------
 
 FuseStats consensus_fuse(const std::vector<AccumPoint> & cloud, double voxel_frac, int k,
diff --git a/src/pose.h b/src/pose.h
index ba82a07..dd0e50a 100644
--- a/src/pose.h
+++ b/src/pose.h
@@ -206,6 +206,46 @@ class Accumulator {
     int next_frame_ = 0;
 };
 
+// ---- hierarchical tree accumulation (balanced binary merge) ----------------
+// Linear chaining composes N Sim(3)s in series, so registration drift compounds
+// over N hops and piles onto the last frame (the far end smears). This instead
+// builds short submaps and merges them pairwise up a balanced binary tree — so
+// drift compounds over only ~log2(N) hops and is spread evenly rather than dumped
+// on the end.
+struct TreeMerge {                 // diagnostics per merge (defs as ChainLink)
+    int level, lo_frame, hi_frame, shared_frame;
+    double scale, inlier_frac, valid_frac, resid_frac;
+};
+
+// Multi-frame-overlap tree. pairs[k]: one engine output [2*H*W*gc] (pair k = frames
+// k,k+1; pair k's view-1 and pair k+1's view-0 are the same physical frame).
+// Adjacent submaps of `block` frames overlap by `overlap` frames and each merge fits
+// its Sim(3) over ALL shared frames at once — the fit is over-determined, so the
+// per-frame depth-inconsistency noise (the reason a single-frame alignment is only
+// ~13% rigid-consistent) averages out and the compounded drift drops. Base submaps
+// are short windows of `block` frames (block > overlap), chained internally by
+// single-frame links, then merged in a balanced tree on their overlap bands. The
+// degenerate block=2,overlap=1 is the plain overlap-by-one (single shared boundary)
+// tree. Returns the merged root cloud in the first submap's frame; `merges`
+// (optional) gets one entry per merge.
+//
+// For the staged side-by-side demo: max_levels caps the merge rounds (-1 = full):
+// 0 returns the independent base submaps, 1 the first merge round, ... . The
+// remaining (>1) nodes are shifted apart along X by their frame-range midpoint when
+// layout_spacing != 0 (0 = none, <0 = auto 4x median submap extent, >0 = that
+// spacing), each capped to its top per_node_cap points by opacity*radius (0 =
+// uncapped) so every laid-out scene renders at its own detail. n_nodes_out
+// (optional) reports how many nodes remain (1 once fully merged).
+std::vector<AccumPoint> tree_accumulate_overlap(const std::vector<const float *> & pairs,
+                                                int H, int W, int gaussian_channels,
+                                                double opacity_threshold = 0.05,
+                                                int block = 4, int overlap = 2,
+                                                double ransac_thresh_frac = 0.02,
+                                                int ransac_iters = 300, uint64_t seed = 0,
+                                                std::vector<TreeMerge> * merges = nullptr,
+                                                int max_levels = -1, double layout_spacing = 0.0,
+                                                int * n_nodes_out = nullptr, int per_node_cap = 0);
+
 // ---- consensus fusion (mirrors fuse.py) -----------------------------------
 struct FuseStats {
     int64_t raw_points, voxels, kept_voxels, points_kept, points_dropped;
diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp
index af96083..36aee40 100644
--- a/tools/free_splatter-cli.cpp
+++ b/tools/free_splatter-cli.cpp
@@ -151,7 +151,8 @@ int main(int argc, char ** argv) {
     const char * model = nullptr, * splat_prefix = nullptr;
     float opac_thr = 5e-3f;
     long  max_splats = 0;
-    bool  accumulate = false, fuse = false, parallax = false;
+    bool  accumulate = false, fuse = false, parallax = false, tree = false, tree_stages = false;
+    int   tov_block = 4, tov_overlap = 2;        // tree: submap width / shared frames (2/1 = overlap-by-one)
     int   fuse_mode = 1;                         // 0 averaged, 1 kept, 2 best-frame
     float voxel = 0.02f, splat_scale = 1.0f;   // multiplier on the predicted gaussian radii
     int   fuse_k = 2;
@@ -168,6 +169,10 @@ int main(int argc, char ** argv) {
         else if (a == "--opacity-threshold" && i+1<argc) opac_thr = (float) atof(argv[++i]);
         else if (a == "--max-splats" && i+1 < argc)     max_splats = atol(argv[++i]);
         else if (a == "--accumulate")                   accumulate = true;
+        else if (a == "--accumulate-tree")            { accumulate = true; tree = true; }
+        else if (a == "--tree-stages")                { accumulate = true; tree_stages = true; }
+        else if (a == "--tree-block" && i+1 < argc)     tov_block = atoi(argv[++i]);
+        else if (a == "--tree-overlap" && i+1 < argc)   tov_overlap = atoi(argv[++i]);
         else if (a == "--parallax")                     parallax = true;
         else if (a == "--splat-prefix" && i+1 < argc)   splat_prefix = argv[++i];
         else if (a == "--fuse")                         fuse = true;
@@ -272,6 +277,10 @@ int main(int argc, char ** argv) {
         free_splatter_accumulator * acc = free_splatter_accumulator_new(sz, sz, opac_thr);
         if (!acc) { std::fprintf(stderr, "accumulator alloc failed\n"); free_splatter_free(ctx); return 1; }
 
+        // --accumulate-tree: collect the (gated) pair engine outputs and merge them
+        // up a balanced binary tree instead of chaining linearly (less compounded drift).
+        std::vector<std::vector<float>> tree_pairs;
+
         // emit the accumulated cloud after a pair is folded in (optional de-ghost)
         auto emit_step = [&](int nframes) {
             if (!splat_prefix) return;
@@ -300,6 +309,7 @@ int main(int argc, char ** argv) {
                                  (size_t)(bytes/sizeof(float)), sz, sz, gc);
                     free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; }
                 std::vector<float> dumpbuf(pair_floats); f.read((char *) dumpbuf.data(), bytes);
+                if (tree || tree_stages) { tree_pairs.push_back(std::move(dumpbuf)); std::printf("pair %zu collected\n", k); continue; }
                 free_splatter_accumulator_add_pair(acc, dumpbuf.data(), gc);
                 const int nframes = free_splatter_accumulator_frame_count(acc);
                 std::printf("pair %zu -> %d frames\n", k, nframes);
@@ -329,6 +339,12 @@ int main(int argc, char ** argv) {
                     }
                     std::printf("keep frame %zu: parallax %.1f deg (vs kept frame %zu)\n", j, px.tri_angle_deg, last_kept);
                 }
+                if (tree || tree_stages) {
+                    tree_pairs.emplace_back(runout, runout + ng);
+                    free_splatter_buf_free(runout);
+                    std::printf("pair (%zu,%zu) collected\n", last_kept, j);
+                    last_kept = j; continue;
+                }
                 free_splatter_accumulator_add_pair(acc, runout, gc);
                 free_splatter_buf_free(runout);
                 const int nframes = free_splatter_accumulator_frame_count(acc);
@@ -338,6 +354,92 @@ int main(int argc, char ** argv) {
             }
             if (skipped) std::printf("gated: skipped %d low-parallax frame(s)\n", skipped);
         }
+        if (tree || tree_stages) {
+            std::vector<const float *> ps; ps.reserve(tree_pairs.size());
+            for (auto & v : tree_pairs) ps.push_back(v.data());
+            if (ps.empty()) { std::fprintf(stderr, "tree: need >=1 pair\n");
+                free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; }
+            const std::string pfx = splat_prefix ? splat_prefix : "stage";
+
+            if (tree_stages) {
+                // write one laid-out .splat per merge level (stage 0 = the independent
+                // leaf scenes side by side, ... then the single merged world, then a
+                // consensus-fused clean scene) + a manifest the viewer steps through.
+                std::string dir = ".", base = pfx;
+                const size_t slash = pfx.find_last_of('/');
+                if (slash != std::string::npos) { dir = pfx.substr(0, slash); base = pfx.substr(slash + 1); }
+                // per-scene budget: each laid-out scene keeps its own detail (a single
+                // shared cap would starve every scene to ~1/N and fill it with floaters).
+                const int per_node = (max_splats > 0 && max_splats < 200000) ? (int) max_splats : 150000;
+                std::vector<std::string> labels;
+                auto stage_path = [&](char * p, size_t cap) { std::snprintf(p, cap, "%s_%zu.splat", pfx.c_str(), labels.size()); };
+                // multi-scene stages of the multi-frame-overlap tree: each laid-out
+                // scene gets its own per_node budget.
+                for (int L = 0; L <= 64; L++) {
+                    free_splatter_point * sc = nullptr; size_t nsc = 0; int nnodes = 0;
+                    if (free_splatter_tree_overlap(ps.data(), (int) ps.size(), gc, sz, sz,
+                            opac_thr, tov_block, tov_overlap, L, -1.0f /*auto layout*/, per_node, &sc, &nsc, &nnodes) != 0) {
+                        std::fprintf(stderr, "tree-stages failed at level %d\n", L);
+                        free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; }
+                    if (nnodes <= 1) { free_splatter_buf_free(sc); break; }   // single scene: full budget below
+                    char path[1024]; stage_path(path, sizeof path);
+                    write_cloud_splat(sc, nsc, 0 /*already per-node capped*/, splat_scale, path);
+                    free_splatter_buf_free(sc);
+                    std::printf("stage %zu: %d scenes\n", labels.size(), nnodes);
+                    labels.push_back(L == 0 ? (std::to_string(nnodes) + " independent scenes")
+                                   : ("merge " + std::to_string(L) + " — " + std::to_string(nnodes) + " scenes"));
+                }
+                // the single merged world, at full budget: first the raw union (the
+                // overlapping per-frame copies, ghosted), then the consensus-fused
+                // (best-frame) clean scene — same contrast as the linear demo's acc/fused.
+                free_splatter_point * root = nullptr; size_t nroot = 0;
+                if (free_splatter_tree_overlap(ps.data(), (int) ps.size(), gc, sz, sz, opac_thr,
+                                               tov_block, tov_overlap, -1, 0.0f, 0, &root, &nroot, nullptr) == 0) {
+                    char path[1024]; stage_path(path, sizeof path);
+                    write_cloud_splat(root, nroot, (size_t) max_splats, splat_scale, path);
+                    std::printf("stage %zu: merged raw union (%zu pts)\n", labels.size(), nroot);
+                    labels.push_back("merged — one scene (raw union)");
+                    free_splatter_point * fz = nullptr; size_t nfz = 0;
+                    if (free_splatter_fuse_cloud(root, nroot, voxel, fuse_k, 2 /*best-frame*/, &fz, &nfz) == 0 && nfz > 0) {
+                        char path2[1024]; stage_path(path2, sizeof path2);
+                        write_cloud_splat(fz, nfz, (size_t) max_splats, splat_scale, path2);
+                        std::printf("stage %zu: consensus-fused (%zu -> %zu pts)\n", labels.size(), nroot, nfz);
+                        labels.push_back("consensus-fused — one clean scene");
+                        free_splatter_buf_free(fz);
+                    }
+                    free_splatter_buf_free(root);
+                }
+                std::ofstream mf(dir + "/manifest.json");
+                mf << "{ \"reframe\": true, \"steps\": [\n";
+                for (size_t L = 0; L < labels.size(); L++)
+                    mf << "    {\"splat\":\"" << base << "_" << L << ".splat\",\"images\":[],\"n\":" << (L + 1)
+                       << ",\"label\":\"" << labels[L] << "\"}" << (L + 1 < labels.size() ? "," : "") << "\n";
+                mf << "  ] }\n";
+                std::printf("tree-stages: %zu stages -> %s/manifest.json\n", labels.size(), dir.c_str());
+            } else {
+                // one merged world from the overlap tree (block=tov_block, overlap=tov_overlap;
+                // 2/1 = the plain overlap-by-one tree). With --fuse, also the de-ghosted scene.
+                std::printf("tree-accumulate: %zu pairs, block=%d overlap=%d\n", ps.size(), tov_block, tov_overlap);
+                free_splatter_point * tc = nullptr; size_t ntc = 0;
+                if (free_splatter_tree_overlap(ps.data(), (int) ps.size(), gc, sz, sz, opac_thr,
+                                               tov_block, tov_overlap, -1, 0.0f, 0, &tc, &ntc, nullptr) != 0) {
+                    std::fprintf(stderr, "tree accumulate failed\n");
+                    free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; }
+                char path[1024];
+                std::snprintf(path, sizeof path, "%s_tree.splat", pfx.c_str());
+                write_cloud_splat(tc, ntc, (size_t) max_splats, splat_scale, path);
+                if (fuse) {                                  // best-frame consensus, de-ghosted
+                    free_splatter_point * fz = nullptr; size_t nfz = 0;
+                    if (free_splatter_fuse_cloud(tc, ntc, voxel, fuse_k, 2, &fz, &nfz) == 0 && nfz > 0) {
+                        std::snprintf(path, sizeof path, "%s_tree_fused.splat", pfx.c_str());
+                        write_cloud_splat(fz, nfz, (size_t) max_splats, splat_scale, path);
+                        free_splatter_buf_free(fz);
+                    }
+                }
+                free_splatter_buf_free(tc);
+            }
+            free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 0;
+        }
         if (fuse && splat_prefix) {
             if (refine) free_splatter_accumulator_refine(acc, refine_voxel, refine_iters, refine_alpha);
             free_splatter_point * fc = nullptr; size_t nf = 0;