From d13933747e947cd31ff1284c56a46df9afec5075 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 26 Jun 2026 15:28:37 +0100 Subject: [PATCH 01/33] pose/: downstream PnP + cross-run alignment prototype, verified vs upstream A self-contained, pure-Python prototype DOWNSTREAM of the engine seam ([N,H,W,23]); deliberately OUTSIDE the validated src/ and NOT wired into CMake/ctest. It prototypes live, accumulating reconstruction from a moving camera: recover each view's camera (PnP) and align successive runs. - focal.py / pnp.py: faithful port of FreeSplatter's scene estimate_poses -- Weiszfeld shared focal (view 0 only, all pixels; use_first_focal), integer pixels, cv2.solvePnPRansac(SQPNP, reprojErr=5, iters=10), cam2world=inv(w2c), and the runner's 1/baseline camera rescale. numpy DLT+RANSAC fallback runs without cv2. - align.py: Umeyama similarity fit + RANSAC + a residual ladder (diagnose) that detects whether cross-run mismatch is a uniform-scale similarity or a nonlinear warp; plus similarity chaining and loop-closure metrics. Verification (all green): - check_cv2_parity.py -- numpy solver == exact cv2.solvePnPRansac on synthetic ground truth (~1e-7 clean). - check_upstream_parity.py -- our WHOLE orchestration vs upstream estimate_poses on REAL engine output. Caught and fixed five divergences (the costly one: focal averaged over a low-overlap 2nd view gave 507 vs the correct 596, ~15% -> 1.35 deg pose error). Now bit-exact (0.00 deg) on 2 scene + 2 object dumps; the only residual is upstream's float32 K vs our float64 (a RANSAC inlier- boundary precision effect, <=0.5 deg on near-degenerate object data, 0 on scene), root-caused and documented, not papered over. - test_pose.py -- 28 asset-free golden tests (no model/fixtures/cv2). Empirical finding: cross-run mismatch is a uniform-scale similarity (~11% scene scale drift), no nonlinear warp -- a 7-DoF similarity is the right alignment model; diagnose() flags it if that ever changes. flake.nix: add opencv4 (cv2) for the exact-upstream PnP path; numpy-only fallback otherwise. Not needed by the engine build/test. Co-Authored-By: Claude Opus 4.8 (1M context) --- flake.nix | 5 +- pose/README.md | 95 +++++++++++++ pose/_upstream.py | 107 +++++++++++++++ pose/align.py | 214 +++++++++++++++++++++++++++++ pose/check_cv2_parity.py | 73 ++++++++++ pose/check_upstream_parity.py | 154 +++++++++++++++++++++ pose/empirical.py | 83 ++++++++++++ pose/focal.py | 48 +++++++ pose/pnp.py | 245 ++++++++++++++++++++++++++++++++++ pose/test_pose.py | 235 ++++++++++++++++++++++++++++++++ 10 files changed, 1258 insertions(+), 1 deletion(-) create mode 100644 pose/README.md create mode 100644 pose/_upstream.py create mode 100644 pose/align.py create mode 100644 pose/check_cv2_parity.py create mode 100644 pose/check_upstream_parity.py create mode 100644 pose/empirical.py create mode 100644 pose/focal.py create mode 100644 pose/pnp.py create mode 100644 pose/test_pose.py diff --git a/flake.nix b/flake.nix index 7976a20..5a252a5 100644 --- a/flake.nix +++ b/flake.nix @@ -11,7 +11,10 @@ # hf_dump.py) run in docker/Dockerfile.cuda alongside the GPU model. The # M0 gate synthesizes its test GGUF in C++ (tests/test_loader.cpp), so the # build + fast test tier need no Python at all. - pyEnv = pkgs.python3.withPackages (ps: with ps; [ numpy ]); + # opencv4 (cv2): the downstream pose/ prototype mirrors FreeSplatter's + # cv2.solvePnPRansac for the exact-upstream PnP path; numpy-only fallback + # otherwise. Not needed by the engine build/test. + pyEnv = pkgs.python3.withPackages (ps: with ps; [ numpy opencv4 ]); in { devShells.${system} = { diff --git a/pose/README.md b/pose/README.md new file mode 100644 index 0000000..aa048d9 --- /dev/null +++ b/pose/README.md @@ -0,0 +1,95 @@ +# pose/ — camera recovery + cross-run alignment (downstream prototype) + +**Status: research prototype, deliberately OUTSIDE the validated engine.** + +The free-splatter.cpp engine is pieces 1–3 only; CLAUDE.md keeps the PnP pose +solver out of `src/` on purpose. This directory is the *downstream consumer* of +the engine's `[N, H, W, 23]` output — a self-contained, pure-Python prototype, +**not wired into the CMake build or `ctest`**. It exists to prototype live, +accumulating reconstruction from a moving-camera feed. + +## Why it's needed + +Each `free_splatter_run` over a different photo pair produces gaussians in its +**own** coordinate system: positions are predicted in the first view's camera +frame, then the whole scene is rescaled by `s = 1/mean(‖point‖)` (see upstream +`estimate_poses`). To stitch successive splats into one world you need: + +1. **PnP** — recover each view's camera from the per-pixel 2D↔3D correspondences + the gaussians hand you for free. +2. **Cross-run alignment** — because two runs normalize by a *different* `1/d`, + the shared content comes out at a different **scale**. Aligning run B into run + A is therefore a **similarity** (rotation + translation + one uniform scale, + 7-DoF), not a rigid transform. + +## Upstream recipe being mirrored (TencentARC/FreeSplatter) + +`freesplatter/models/model.py::estimate_poses` → DUSt3R-derived `fast_pnp` +(`utils/recon_util.py`): + +Verified against the cached upstream source (`.cache/upstream/{model,runner}.py`) +and the fetched `recon_util.py`, the **scene** recipe (the live use case) is: + +| step | upstream (scene path) | here | +|---|---|---| +| correspondences | pixel ↔ gaussian center `X` (ch 0:3) | `pnp.pixel_grid` (integer px) + `points` | +| validity mask (PnP) | `sigmoid(opacity) > 0.05` | `estimate_poses(opacity_threshold=)` (engine output is already activated) | +| focal | Weiszfeld, **view 0 only** (`use_first_focal=True`), **all pixels** (`masks=None`) | `estimate_poses(use_first_focal=True)` → `focal.estimate_focal` | +| pose | `cv2.solvePnPRansac(reprojErr=5, SOLVEPNP_SQPNP, iters=10)`, `cam2world=inv(world2cam)` | `pnp.solve_pnp_cv2` (exact) / `solve_pnp_numpy` (reference) | +| anchor | none — view 0 is ~identity by construction (gaussians live in view 0's frame) | `estimate_poses` returns raw poses | +| scene scale | runner rescales **camera centers** so the view0→view1 **baseline = 1**: `s = 1/(‖t₁−t₀‖+1e-2)` | `pnp.rescale_to_baseline` / `estimate_poses(normalize=True)` | + +> An earlier draft had three of these wrong (focal averaged over all views, an +> `inv(c2w[0])` re-anchor, and a `1/mean‖pts‖` scene scale). Driving the *actual* +> upstream `estimate_poses` on real engine output (see `check_upstream_parity.py`) +> surfaced and fixed all three — the focal one mattered: averaging a low-overlap +> second view dragged focal 507 vs the correct 596, a 15% error → 1.35° pose error. + +`solve_pnp_cv2` is the byte-for-byte upstream call (needs `opencv`, in the CUDA +container or the nix shell). `solve_pnp_numpy` is a DLT + RANSAC reference with no +third-party deps, verified against analytic ground truth so the pipeline runs +anywhere numpy is. + +## Files + +- `focal.py` — Weiszfeld shared-focal estimation (mirrors upstream `estimate_focal`). +- `pnp.py` — DLT/RANSAC + cv2 PnP backends, `estimate_poses` orchestration, and + `rescale_to_baseline` (the runner's scene normalization). +- `align.py` — Umeyama similarity fit, RANSAC variant, the **residual ladder** + (`diagnose`: rigid→similarity→affine + depth-correlation) that detects whether a + single uniform scale suffices or the mismatch is non-uniform, and similarity + chaining / loop-closure metrics. +- `_upstream.py` — vendored upstream kernels (`fast_pnp` verbatim, `estimate_focal` + numpy-transcribed) used ONLY by the parity check; not our code. +- `test_pose.py` — asset-free golden tests (no model, no fixtures, no cv2). +- `check_cv2_parity.py` — `solve_pnp_numpy` vs the exact `cv2.solvePnPRansac` on + synthetic ground truth (needs cv2). +- `check_upstream_parity.py` — our whole orchestration vs the upstream recipe on + REAL engine output (needs cv2 + a dumped `[N,H,W,23]` `.f32`). +- `empirical.py` — cross-run residual-ladder on real engine output (the scale test). + +## Run the tests + +```sh +nix develop -c python3 pose/test_pose.py # golden, numpy only +nix develop -c python3 pose/check_cv2_parity.py # solver vs cv2 (needs cv2) +EMP_DIR=/path/to/dumps nix develop -c python3 \ + pose/check_upstream_parity.py A_scn.f32 # orchestration vs upstream +``` + +## Status + +- ✅ **Solver parity** — `solve_pnp_numpy` ≡ `cv2.solvePnPRansac` (~1e-7 clean). +- ✅ **Orchestration parity** — our `estimate_poses` is **bit-exact** to upstream's + scene recipe on real engine output (focal/PnP/relative-pose all 0.00° at matched + precision; the only residual is upstream's float32-K vs our float64, a RANSAC + inlier-boundary effect of ≤0.5° on near-degenerate object data, 0° on scene). +- ✅ **Empirical scale test** — cross-run mismatch is a uniform-scale similarity + (scene ~11% scale drift), no nonlinear warp; `diagnose` classifies it. + +## Not done yet (honest) + +- A **dense/known-good control** sequence (other system or GT poses) before + trusting the live path — the baseline-sweet-spot experiment. +- The live pipeline itself (sliding window, sim3 pose-graph to bound drift) and + consensus fusion (the floater-removal step) — separate, later. diff --git a/pose/_upstream.py b/pose/_upstream.py new file mode 100644 index 0000000..97f7199 --- /dev/null +++ b/pose/_upstream.py @@ -0,0 +1,107 @@ +"""Vendored upstream reference for the parity check -- NOT our code. + +These are the two numerical kernels FreeSplatter's `estimate_poses` calls, copied +from TencentARC/FreeSplatter `freesplatter/utils/recon_util.py` +(https://raw.githubusercontent.com/TencentARC/FreeSplatter/main/freesplatter/utils/recon_util.py, +fetched 2026-06-26). They exist here ONLY so `check_upstream_parity.py` can run +the original algorithm against our `focal.py`/`pnp.py` on real engine output. + + * `fast_pnp` -- copied VERBATIM (it is cv2 + numpy only; runs as-is in nix). + * `estimate_focal_np` -- a faithful numpy transcription of upstream + `estimate_focal` (the original is torch; the math is identical -- integer + `xy_grid` pixels, nan_to_num, mean/mean init, 10 IRLS reweights, the + `focal_base` clip which is a no-op at the default min/max). Kept line-by-line + so any divergence is a real algorithmic difference, not a re-derivation. + +Do not "improve" or refactor this file; its value is being unmodified upstream. +""" +import cv2 +import numpy as np + + +def xy_grid(W, H): + """Upstream xy_grid (numpy branch): (H,W,2) int, [...,0]=col(x), [...,1]=row(y).""" + tw, th = np.arange(W), np.arange(H) + gx, gy = np.meshgrid(tw, th, indexing="xy") # both (H,W) + return np.stack([gx, gy], axis=-1) + + +def estimate_focal_np(pts3d, pp=None, mask=None, min_focal=0.0, max_focal=np.inf): + """Faithful numpy transcription of upstream recon_util.estimate_focal.""" + H, W, THREE = pts3d.shape + assert THREE == 3 + if pp is None: + pp = np.array([W / 2.0, H / 2.0], dtype=float) + pp = np.asarray(pp, float) + + pixels = xy_grid(W, H).reshape(-1, 2).astype(float) - pp.reshape(1, 2) # (HW,2) + pts = pts3d.reshape(H * W, 3).astype(float) + + if mask is not None: + m = np.asarray(mask).ravel().astype(bool) + assert len(m) == pts.shape[0] + pts = pts[m] + pixels = pixels[m] + + xy_over_z = pts[:, :2] / pts[:, 2:3] + xy_over_z = np.nan_to_num(xy_over_z, posinf=0.0, neginf=0.0) + + dot_xy_px = (xy_over_z * pixels).sum(axis=-1) + dot_xy_xy = (xy_over_z ** 2).sum(axis=-1) + focal = dot_xy_px.mean() / dot_xy_xy.mean() + + for _ in range(10): + dis = np.linalg.norm(pixels - focal * xy_over_z, axis=-1) + w = 1.0 / np.clip(dis, 1e-8, None) + focal = (w * dot_xy_px).mean() / (w * dot_xy_xy).mean() + + focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2)) + focal = float(np.clip(focal, min_focal * focal_base, max_focal * focal_base)) + return focal + + +def fast_pnp(pts3d, mask, focal=None, pp=None, niter_PnP=10, k_dtype=np.float32): + """VERBATIM copy of upstream recon_util.fast_pnp (cv2 + numpy). + + The default `k_dtype=np.float32` reproduces upstream byte-for-byte (upstream + writes `K = np.float32(...)`). `check_upstream_parity.py` passes float64 to + isolate the one precision choice that differs from our wrapper: on near- + threshold-dense object data the float32 K nudges a few hundred points across + the 5px inlier boundary, shifting the pose ~0.5deg. Matched precision -> exact.""" + H, W, _ = pts3d.shape + pixels = np.mgrid[:W, :H].T.astype(float) + + if focal is None: + S = max(W, H) + tentative_focals = np.geomspace(S / 2, S * 3, 21) + else: + tentative_focals = [focal] + + if pp is None: + pp = (W / 2, H / 2) + + best = 0, + for focal in tentative_focals: + K = np.array([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)], dtype=k_dtype) + + success, R, T, inliers = cv2.solvePnPRansac( + pts3d[mask], pixels[mask], K, None, + iterationsCount=niter_PnP, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP) + if not success: + continue + + score = len(inliers) + if success and score > best[0]: + best = score, R, T, focal + + if not best[0]: + return None + + _, R, T, best_focal = best + R = cv2.Rodrigues(R)[0] # world to cam + world2cam = np.eye(4).astype(float) + world2cam[:3, :3] = R + world2cam[:3, 3] = T.reshape(3) + cam2world = np.linalg.inv(world2cam) + + return best_focal, cam2world diff --git a/pose/align.py b/pose/align.py new file mode 100644 index 0000000..55d8652 --- /dev/null +++ b/pose/align.py @@ -0,0 +1,214 @@ +"""Coordinate-system normalization: align one inference's world into another's. + +DOWNSTREAM of the engine seam. Each free_splatter_run over a different photo +pair invents its OWN coordinate system (gaussians in the first view's camera +frame, then rescaled by s = 1/mean-point-distance -- see FreeSplatter +estimate_poses). Two runs that share a photo therefore describe the SAME surface +at a DIFFERENT scale, because each run's 1/d normalization is computed over a +different point support. So aligning run B into run A is a SIMILARITY transform +(rotation + translation + ONE uniform scale, 7 DoF), not a rigid one. + +This module fits that similarity in closed form (Umeyama 1991), provides a +RANSAC variant for floaters, and -- crucially -- a residual ladder that *detects* +whether a single uniform scale actually suffices or whether the mismatch is +non-uniform (focal error / affine depth ambiguity), which is the open question. + +Pure numpy. A similarity is represented as the tuple T = (s, R, t) acting on +column-stacked points by x -> s * R @ x + t. +""" +import numpy as np + + +def apply_sim(T, X): + """Apply T=(s,R,t) to X (N,3) -> (N,3).""" + s, R, t = T + return (s * (R @ X.T)).T + t + + +def _rms(resid): + """Root-mean-square point error of an (N,3) residual array.""" + return float(np.sqrt((resid ** 2).sum(axis=1).mean())) + + +def fit_similarity(X, Y, with_scale=True): + """Umeyama: best (s,R,t) minimizing ||s R X + t - Y||^2 in closed form. + + with_scale=False degenerates to rigid Kabsch (s fixed at 1). Reflections are + rejected (proper rotation, det R = +1) exactly as Umeyama prescribes. + """ + X = np.asarray(X, float) + Y = np.asarray(Y, float) + n = X.shape[0] + mx = X.mean(0) + my = Y.mean(0) + Xc = X - mx + Yc = Y - my + Sigma = (Yc.T @ Xc) / n # 3x3 cross-covariance, maps X -> Y + U, D, Vt = np.linalg.svd(Sigma) + S = np.eye(3) + if np.linalg.det(U) * np.linalg.det(Vt) < 0: # reflection guard + S[-1, -1] = -1.0 + R = U @ S @ Vt + if with_scale: + var_x = (Xc ** 2).sum() / n + s = float((D * np.diag(S)).sum() / var_x) if var_x > 0 else 1.0 + else: + s = 1.0 + t = my - s * (R @ mx) + return s, R, t + + +def fit_rigid(X, Y): + """Rigid (6-DoF) fit -- similarity with the scale DoF removed.""" + return fit_similarity(X, Y, with_scale=False) + + +def fit_affine(X, Y): + """Full affine (12-DoF) fit: Y ~= A X + b. Returns ((A,b), prediction). + + Used only as the next rung of the residual ladder: how much of the leftover + error after a similarity is explained by *non-uniform* (shear/anisotropic) + deformation -- the signature of focal error or affine-invariant depth. + """ + n = X.shape[0] + Xh = np.hstack([X, np.ones((n, 1))]) # (n,4) + M, *_ = np.linalg.lstsq(Xh, Y, rcond=None) # (4,3) + pred = Xh @ M + A = M[:3].T + b = M[3] + return (A, b), pred + + +def fit_similarity_ransac(X, Y, thresh, iters=300, with_scale=True, seed=0): + """RANSAC similarity: robust to gross outliers (floaters in the overlap). + + Minimal sample is 3 point pairs (a similarity has 7 DoF; 3 pairs give 9 + constraints). Returns (s,R,t, inlier_mask). + """ + rng = np.random.default_rng(seed) + n = len(X) + best = np.zeros(n, bool) + for _ in range(iters): + idx = rng.choice(n, 3, replace=False) + try: + T = fit_similarity(X[idx], Y[idx], with_scale) + except np.linalg.LinAlgError: + continue + res = np.linalg.norm(apply_sim(T, X) - Y, axis=1) + inl = res < thresh + if inl.sum() > best.sum(): + best = inl + if best.sum() < 3: # fall back to plain fit + best = np.ones(n, bool) + T = fit_similarity(X[best], Y[best], with_scale) + return (*T, best) + + +def residual_ladder(X, Y): + """Fit rigid -> similarity -> affine and report each RMS residual. + + The DROP from rigid->similarity is how much was pure uniform scale; the drop + from similarity->affine is how much non-uniform warp remains. depth_corr is + the Pearson correlation between the per-point similarity residual magnitude + and the point's distance from the centroid -- a positive value means the + leftover error GROWS with depth, the fingerprint of focal/projective error. + All residuals are absolute (same units as Y); compare against `scene`. + """ + scene = _rms(Y - Y.mean(0)) # overall extent of the target + + Tr = fit_rigid(X, Y) + r_rigid = _rms(apply_sim(Tr, X) - Y) + + Ts = fit_similarity(X, Y, with_scale=True) + res_sim = apply_sim(Ts, X) - Y + r_sim = _rms(res_sim) + + _, pred_aff = fit_affine(X, Y) + r_aff = _rms(pred_aff - Y) + + depth = np.linalg.norm(X - X.mean(0), axis=1) + rmag = np.linalg.norm(res_sim, axis=1) + depth_corr = float(np.corrcoef(depth, rmag)[0, 1]) if rmag.std() > 1e-12 else 0.0 + + return { + "scene": scene, + "rigid": r_rigid, + "similarity": r_sim, + "affine": r_aff, + "scale": float(Ts[0]), + "depth_corr": depth_corr, + } + + +def diagnose(X, Y, tol=1e-3, corr_tol=0.3): + """Classify the A<->B mismatch from the residual ladder. + + Residuals are judged relative to scene extent. Verdicts: + rigid_ok : a rigid transform already fits -- same scale (rare across runs) + needs_scale : a SIMILARITY fits but rigid does not -- the expected case + needs_affine: even similarity fails but affine fits -- non-uniform warp + nonlinear : affine fails too -- genuinely nonlinear (e.g. quadratic depth) + `depth_corr` is reported as corroborating evidence of structure. + """ + L = residual_ladder(X, Y) + sc = L["scene"] or 1.0 + rr, rs, ra = L["rigid"] / sc, L["similarity"] / sc, L["affine"] / sc + # Is the leftover after a similarity STRUCTURED (a real warp) or just noise? + # Two independent signatures: affine meaningfully beats similarity, OR the + # residual correlates with depth. Neither => unstructured noise floor. + aff_gain = (L["similarity"] - L["affine"]) / L["similarity"] if L["similarity"] > 0 else 0.0 + structured = abs(L["depth_corr"]) > corr_tol or aff_gain > 0.25 + if rr < tol: + v = "rigid_ok" # same scale AND pose: rare across runs + elif rs < tol: + v = "needs_scale" # a clean similarity fits: the expected case + elif ra < tol: + v = "needs_affine" # similarity fails, affine fits: non-uniform + elif structured: + v = "nonlinear" # even affine fails AND residual is structured + else: + v = "similarity_plus_noise" # similarity is the model; rest is noise floor + L["verdict"] = v + L["structured"] = bool(structured) + L["aff_gain"] = float(aff_gain) + return L + + +# --- chaining similarities along the photo stream -------------------------- + +def compose(T2, T1): + """Return T2 . T1 (apply T1 then T2).""" + s1, R1, t1 = T1 + s2, R2, t2 = T2 + return (s2 * s1, R2 @ R1, s2 * (R2 @ t1) + t2) + + +def invert(T): + """Inverse similarity.""" + s, R, t = T + si = 1.0 / s + Ri = R.T + return (si, Ri, -si * (Ri @ t)) + + +def identity(): + return (1.0, np.eye(3), np.zeros(3)) + + +def loop_closure_error(transforms): + """Compose a closed loop of similarities and measure deviation from identity. + + transforms: list [T_{0->1}, T_{1->2}, ..., T_{k->0}]. With drift-free links + the product is identity; the returned dict quantifies residual scale, rotation + (deg) and translation drift -- a direct scale-drift / loop-closure metric. + """ + T = identity() + for Ti in transforms: + T = compose(Ti, T) + s, R, t = T + ang = np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1))) + return { + "scale_err": abs(np.log(s)), # 0 == no scale drift + "rot_deg": float(ang), + "trans": float(np.linalg.norm(t)), + } diff --git a/pose/check_cv2_parity.py b/pose/check_cv2_parity.py new file mode 100644 index 0000000..4f9e8f2 --- /dev/null +++ b/pose/check_cv2_parity.py @@ -0,0 +1,73 @@ +"""Verify the numpy DLT/RANSAC PnP against the EXACT upstream solver. + +FreeSplatter uses cv2.solvePnPRansac(reprojErr=5, SOLVEPNP_SQPNP). This compares +our dependency-free reference backend to that exact call on synthetic scenes with +known ground truth -- both must recover the true pose AND agree with each other. +Needs cv2 (nix develop, after opencv was added to the flake). + + nix develop -c python3 pose/check_cv2_parity.py +""" +import numpy as np +import pnp + +RNG = np.random.default_rng(7) +fails = [] + + +def rot_angle(R): + return np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1))) + + +def rand_small_pose(rng): + ax = rng.normal(size=3) + ax /= np.linalg.norm(ax) + ang = np.radians(rng.uniform(3, 20)) + Kx = np.array([[0, -ax[2], ax[1]], [ax[2], 0, -ax[0]], [-ax[1], ax[0], 0]]) + R = np.eye(3) + np.sin(ang) * Kx + (1 - np.cos(ang)) * Kx @ Kx + t = rng.uniform(-0.6, 0.6, size=3) + return R, t + + +def run(label, with_outliers): + f = 400.0 + K = np.array([[f, 0, 256.0], [0, f, 256.0], [0, 0, 1.0]]) + worst = {"np_R": 0, "np_t": 0, "cv_R": 0, "cv_t": 0, "agree_R": 0, "agree_t": 0} + for _ in range(10): + Pw = RNG.uniform([-2, -2, 4], [2, 2, 8], size=(500, 3)) + R, t = rand_small_pose(RNG) + Xc = (R @ Pw.T).T + t + if np.any(Xc[:, 2] <= 0): + continue + px = (K @ Xc.T).T + px = px[:, :2] / px[:, 2:3] + if with_outliers: + oi = RNG.choice(len(px), int(0.2 * len(px)), replace=False) + px[oi] += RNG.uniform(-120, 120, size=(len(oi), 2)) + + c2w_np, _ = pnp.solve_pnp_numpy(Pw, px, K, thresh_px=4.0, iters=300) + c2w_cv, _ = pnp.solve_pnp_cv2(Pw, px, K) + w_np, w_cv = np.linalg.inv(c2w_np), np.linalg.inv(c2w_cv) + worst["np_R"] = max(worst["np_R"], rot_angle(w_np[:3, :3] @ R.T)) + worst["np_t"] = max(worst["np_t"], np.linalg.norm(w_np[:3, 3] - t)) + worst["cv_R"] = max(worst["cv_R"], rot_angle(w_cv[:3, :3] @ R.T)) + worst["cv_t"] = max(worst["cv_t"], np.linalg.norm(w_cv[:3, 3] - t)) + worst["agree_R"] = max(worst["agree_R"], rot_angle(w_np[:3, :3] @ w_cv[:3, :3].T)) + worst["agree_t"] = max(worst["agree_t"], np.linalg.norm(w_np[:3, 3] - w_cv[:3, 3])) + + tolR, tolt = (0.5, 0.05) if with_outliers else (1e-3, 1e-4) + print(f"{label}: worst-of-10 " + f"numpy(R={worst['np_R']:.2e}deg t={worst['np_t']:.2e}) " + f"cv2(R={worst['cv_R']:.2e}deg t={worst['cv_t']:.2e}) " + f"agree(R={worst['agree_R']:.2e}deg t={worst['agree_t']:.2e})") + ok = (worst["np_R"] < tolR and worst["cv_R"] < tolR and + worst["agree_R"] < tolR and worst["agree_t"] < tolt) + if not ok: + fails.append(label) + return ok + + +print("PnP backend parity: numpy DLT/RANSAC vs upstream cv2.solvePnPRansac") +run("clean ", with_outliers=False) +run("outliers", with_outliers=True) +print("PARITY OK" if not fails else f"PARITY FAILED: {fails}") +raise SystemExit(1 if fails else 0) diff --git a/pose/check_upstream_parity.py b/pose/check_upstream_parity.py new file mode 100644 index 0000000..6fceb65 --- /dev/null +++ b/pose/check_upstream_parity.py @@ -0,0 +1,154 @@ +"""Verify our WHOLE pose orchestration against upstream, on REAL engine output. + +`check_cv2_parity.py` already pinned our solver wrapper to cv2.solvePnPRansac on +synthetic ground truth. This goes one level up: it drives the *entire* upstream +`estimate_poses` recipe (FreeSplatter scene path: use_first_focal, opacity mask +for PnP, no focal mask, pnp_iter=10 -- see .cache/upstream/{model,runner}.py) +using the vendored-verbatim upstream kernels in `_upstream.py`, and compares it +to our `focal.py`/`pnp.py` on the gaussians the C++ engine actually produced. + +Three levels, each isolating one stage so a divergence points at one file: + 1. focal kernel -- focal.estimate_focal vs _upstream.estimate_focal_np + 2. PnP solver -- pnp.solve_pnp_cv2 vs _upstream.fast_pnp (shared focal) + 3. full pipeline -- pnp.estimate_poses vs the upstream recipe replica +Plus the scene baseline-rescale (runner.run_freesplatter_scene) for reference. + + nix develop -c python3 pose/check_upstream_parity.py A_scn.f32 # scene (live case) + +Defaults to the scratchpad dumps if no path is given. Opacity in the engine +output is ALREADY sigmoid-activated, so mask = (opacity > thr) reproduces +upstream's sigmoid(raw) > thr exactly. +""" +import os +import sys +import numpy as np +import cv2 + +import focal +import pnp +import _upstream as up + +C, OPACITY = 23, 15 +THR = 0.05 +SEED = 0 # cv2.solvePnPRansac draws from the shared global cv::theRNG(); + # reseed identically before each call so RANSAC picks the SAME + # minimal samples -> isolates algorithmic parity from RNG state. +SCRATCH = os.environ.get("EMP_DIR", "").rstrip("/") +_fails = [] + + +def rot_angle(R): + return float(np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1)))) + + +def load(path, H=512, W=512): + a = np.fromfile(path, np.float32) + N = a.size // (H * W * C) + a = a.reshape(N, H, W, C) + pts = [a[i, ..., 0:3].astype(np.float64) for i in range(N)] # (H,W,3) each + op = [a[i, ..., OPACITY].astype(np.float64) for i in range(N)] # (H,W) each + return N, H, W, pts, op + + +def check(name, cond, detail=""): + tag = "PASS" if cond else "FAIL" + print(f" [{tag}] {name}" + (f" ({detail})" if detail else "")) + if not cond: + _fails.append(name) + + +def upstream_estimate_poses(pts, op, H, W, pnp_iter=10): + """Replicate FreeSplatter scene estimate_poses with vendored kernels. + + use_first_focal=True, masks=None: focal from view0 over ALL pixels; per-view + PnP masked by opacity>THR. Returns (c2ws list, focal).""" + pp = np.array([W / 2.0, H / 2.0]) + f = up.estimate_focal_np(pts[0], pp=pp, mask=None) # view0, all pixels + c2ws = [] + cv2.setRNGSeed(SEED) + for i in range(len(pts)): + mask = op[i] > THR # (H,W) bool + res = up.fast_pnp(pts[i], mask, focal=f, niter_PnP=pnp_iter, + k_dtype=np.float64) # matched precision (our wrapper is f64) + c2ws.append(res[1]) + return c2ws, f + + +def main(): + path = sys.argv[1] if len(sys.argv) > 1 else os.path.join(SCRATCH, "A_scn.f32") + if not os.path.isabs(path) and SCRATCH and not os.path.exists(path): + path = os.path.join(SCRATCH, path) + N, H, W, pts, op = load(path) + pp = np.array([W / 2.0, H / 2.0]) + print(f"=== upstream-parity on {os.path.basename(path)} ({N} views) ===") + for i in range(N): + print(f" view{i}: valid(opacity>{THR}) = {(op[i] > THR).mean()*100:.1f}%") + + # ---- Level 1: focal kernel (view0, all pixels) ---- + print("\n[1] focal kernel focal.estimate_focal vs upstream.estimate_focal_np") + grid = pnp.pixel_grid(H, W) # our pixel convention + f_up = up.estimate_focal_np(pts[0], pp=pp, mask=None) + f_ours = focal.estimate_focal(pts[0].reshape(-1, 3), grid, pp) + rel = abs(f_ours - f_up) / f_up + check("focal matches upstream (all pixels, view0)", rel < 1e-6, + f"ours {f_ours:.4f} vs up {f_up:.4f} rel {rel:.2e}") + + # ---- Level 2: PnP solver, shared upstream focal ---- + print("\n[2] PnP solver pnp.solve_pnp_cv2 vs upstream.fast_pnp (focal fixed)") + K = np.array([[f_up, 0, pp[0]], [0, f_up, pp[1]], [0, 0, 1.0]]) + for i in range(N): + mask = op[i] > THR + Xw = pts[i].reshape(-1, 3)[mask.ravel()] + px = grid[mask.ravel()] + cv2.setRNGSeed(SEED) + c2w_up = up.fast_pnp(pts[i], mask, focal=f_up, niter_PnP=10, + k_dtype=np.float64)[1] # matched precision + cv2.setRNGSeed(SEED) + c2w_ours, _ = pnp.solve_pnp_cv2(Xw, px, K, iters=10, reproj=5.0) + dR = rot_angle(c2w_ours[:3, :3] @ c2w_up[:3, :3].T) + dt = float(np.linalg.norm(c2w_ours[:3, 3] - c2w_up[:3, 3])) + check(f"view{i} cam2world matches upstream", dR < 1e-3 and dt < 1e-4, + f"dR {dR:.2e} deg dt {dt:.2e}") + # diagnostic: upstream's native float32 K (precision-only effect) + cv2.setRNGSeed(SEED) + c2w_f32 = up.fast_pnp(pts[i], mask, focal=f_up, niter_PnP=10)[1] + dR32 = rot_angle(c2w_ours[:3, :3] @ c2w_f32[:3, :3].T) + print(f" (upstream native float32 K: dR {dR32:.2e} deg -- " + f"RANSAC inlier-boundary precision, not orchestration)") + + # ---- Level 3: full pipeline ---- + print("\n[3] full pipeline pnp.estimate_poses vs upstream recipe") + up_c2ws, f_up2 = upstream_estimate_poses(pts, op, H, W, pnp_iter=10) + cv2.setRNGSeed(SEED) + out = pnp.estimate_poses(pts, op, opacity_threshold=THR, backend="cv2", + normalize=False, cam_frame_points=pts) + our_c2ws, f_our2 = out["cam2world"], out["focal"] + relf = abs(f_our2 - f_up2) / f_up2 + check("pipeline focal matches", relf < 1e-6, f"ours {f_our2:.4f} vs up {f_up2:.4f}") + # anchor-invariant relative pose view0->view1 (handles any re-anchor convention) + def rel_pose(c): + return np.linalg.inv(c[0]) @ c[1] + Rup, Rours = rel_pose(up_c2ws), rel_pose(our_c2ws) + dR = rot_angle(Rours[:3, :3] @ Rup[:3, :3].T) + # compare translation DIRECTION (scale/anchor independent) + tup, tour = Rup[:3, 3], Rours[:3, 3] + cos_t = float(tup @ tour / (np.linalg.norm(tup) * np.linalg.norm(tour) + 1e-12)) + check("relative-pose rotation matches", dR < 1e-2, f"dR {dR:.2e} deg") + check("relative-pose translation direction matches", cos_t > 1 - 1e-6, + f"cos {cos_t:.8f}") + + # ---- scene baseline rescale (runner), reference only ---- + baseline = float(np.linalg.norm(up_c2ws[1][:3, 3] - up_c2ws[0][:3, 3])) + 1e-2 + print(f"\n[ref] scene baseline rescale: 1/baseline = {1.0/baseline:.4f} " + f"(runner applies this to camera centers; our estimate_poses returns raw " + f"poses, normalize=False)") + + print() + if _fails: + print(f"UPSTREAM PARITY FAILED ({len(_fails)}): " + ", ".join(_fails)) + raise SystemExit(1) + print("UPSTREAM PARITY OK") + + +if __name__ == "__main__": + main() diff --git a/pose/empirical.py b/pose/empirical.py new file mode 100644 index 0000000..bd91498 --- /dev/null +++ b/pose/empirical.py @@ -0,0 +1,83 @@ +"""Empirical cross-run test on REAL engine output. + +A photo that appears in two separate inferences (an overlapping pair sharing it) +is reconstructed twice, in two different coordinate systems. Its per-pixel +gaussian centers from the two runs are the SAME physical points described two +ways -- exact correspondences. Fitting a similarity between them and reading the +residual ladder answers the open question on real data: is the cross-run mismatch +a clean uniform-scale similarity, or does it carry nonlinear warp? + +Usage: + nix develop -c python3 pose/empirical.py A.f32 viewA B.f32 viewB [label] + +A.f32/B.f32 are raw engine outputs [N,H,W,23]; viewA/viewB pick the shared photo +within each (e.g. it's view 1 of pair (000,004) and view 0 of pair (004,013)). +""" +import sys +import numpy as np +import align + +C = 23 +OPACITY = 15 + + +def load_view(path, view, H=512, W=512): + a = np.fromfile(path, np.float32) + N = a.size // (H * W * C) + a = a.reshape(N, H, W, C) + g = a[view] + return g[..., 0:3].reshape(-1, 3).astype(np.float64), g[..., OPACITY].reshape(-1) + + +def main(): + pA, vA, pB, vB = sys.argv[1], int(sys.argv[2]), sys.argv[3], int(sys.argv[4]) + label = sys.argv[5] if len(sys.argv) > 5 else "" + XA, oA = load_view(pA, vA) + XB, oB = load_view(pB, vB) + + print(f"=== {label} ===") + print(f"opacity A: [{oA.min():.3f},{oA.max():.3f}] mean {oA.mean():.3f} | " + f"B: [{oB.min():.3f},{oB.max():.3f}] mean {oB.mean():.3f}") + + thr = 0.05 + mask = (oA > thr) & (oB > thr) + A, B = XA[mask], XB[mask] + print(f"valid (opacity>{thr} in both): {mask.sum()}/{mask.size} " + f"({100*mask.mean():.1f}%)") + if mask.sum() < 100: + print("too few valid correspondences"); return + + scene = align._rms(A - A.mean(0)) + # robust fit B -> A; threshold = 2% of scene extent + s, R, t, inl = align.fit_similarity_ransac(B, A, thresh=0.02 * scene, iters=400) + print(f"scene extent (rms): {scene:.4f}") + print(f"robust similarity B->A: scale={s:.4f} inliers={inl.sum()}/{len(A)} " + f"({100*inl.mean():.1f}%)") + + # how cross-run-consistent is the WHOLE valid set under the robust fit? + res_all = np.linalg.norm(align.apply_sim((s, R, t), B) - A, axis=1) + print("cross-run consistency (all valid pixels, under robust similarity):") + for q in (0.01, 0.02, 0.05, 0.10): + print(f" within {100*q:4.0f}% of scene extent: " + f"{100*(res_all < q*scene).mean():5.1f}% of pixels") + + # residual ladder on the inlier (well-corresponded) surface + L = align.diagnose(B[inl], A[inl]) + sc = L["scene"] + print("residual ladder (inliers, RMS | % of scene extent):") + for k in ("rigid", "similarity", "affine"): + print(f" {k:11s} {L[k]:.5f} {100*L[k]/sc:6.2f}%") + print(f" recovered scale {L['scale']:.4f}") + print(f" depth_corr {L['depth_corr']:+.3f} (structured={L['structured']})") + print(f" VERDICT {L['verdict']}") + + # interpretation + drop_scale = (L["rigid"] - L["similarity"]) / sc + drop_aff = (L["similarity"] - L["affine"]) / sc + print(f"interpretation: rigid->similarity drop {100*drop_scale:.2f}% " + f"(=how much was uniform scale); similarity->affine drop " + f"{100*drop_aff:.2f}% (=non-uniform warp).") + + +if __name__ == "__main__": + main() diff --git a/pose/focal.py b/pose/focal.py new file mode 100644 index 0000000..1e2ac27 --- /dev/null +++ b/pose/focal.py @@ -0,0 +1,48 @@ +"""Shared focal-length estimation (Weiszfeld), faithful to FreeSplatter. + +Upstream (freesplatter/models/model.py::estimate_focal, via DUSt3R) assumes a +single shared focal across views, centered principal point, square pixels. Per +view it solves + + f* = argmin_f sum_ij || (i',j') - f * (X_ij0/X_ij2, X_ij1/X_ij2) || + +where (i',j') are pixel coords relative to the principal point and X is that +view's camera-frame point map (gaussian xyz). The robust minimizer is found by +the Weiszfeld algorithm (iteratively reweighted least squares, ~L1). Per-view +focals are then averaged into one shared value. + +Pure numpy -- no cv2 needed for this stage. +""" +import numpy as np + + +def estimate_focal(pts3d_cam, pixels, pp, weiszfeld_iters=10): + """Weiszfeld focal from camera-frame points and their pixels. + + pts3d_cam : (...,3) points in the view's OWN camera frame (Z>0 in front). + pixels : (...,2) pixel coordinates, same leading shape. + pp : (2,) principal point (image center). + """ + X = np.asarray(pts3d_cam, float).reshape(-1, 3) + P = np.asarray(pixels, float).reshape(-1, 2) - np.asarray(pp, float) + # ray directions; Z==0 -> 0 exactly as upstream estimate_focal (nan_to_num) + u = np.nan_to_num(X[:, :2] / X[:, 2:3], posinf=0.0, neginf=0.0) # (N,2) + + pu = (P * u).sum(1) # per-point p . u + uu = (u * u).sum(1) # per-point ||u||^2 + f = pu.sum() / uu.sum() # least-squares init + for _ in range(weiszfeld_iters): + r = np.linalg.norm(P - f * u, axis=1) + w = 1.0 / np.maximum(r, 1e-8) + f = (w * pu).sum() / (w * uu).sum() + return float(f) + + +def estimate_shared_focal(views, pp, weiszfeld_iters=10): + """Average per-view focals into one shared focal. + + views: list of (pts3d_cam, pixels) pairs. Mirrors upstream's + `focals = focals.mean().repeat(N)`. + """ + fs = [estimate_focal(p3, px, pp, weiszfeld_iters) for (p3, px) in views] + return float(np.mean(fs)), fs diff --git a/pose/pnp.py b/pose/pnp.py new file mode 100644 index 0000000..85f3dd8 --- /dev/null +++ b/pose/pnp.py @@ -0,0 +1,245 @@ +"""PnP camera recovery from the engine's per-pixel gaussians. + +Mirrors FreeSplatter's estimate_poses (freesplatter/models/model.py, which calls +the DUSt3R-derived fast_pnp in utils/recon_util.py): + + * correspondences : every pixel -> its predicted 3D gaussian center X (channels + 0:3) <-> its 2D pixel coordinate. + * validity mask : opacity > 0.05 (drops floaters before PnP). + * focal : Weiszfeld per view, averaged into one shared focal (focal.py). + * pose : cv2.solvePnPRansac(reprojectionError=5, SOLVEPNP_SQPNP, + iterationsCount=10); world2cam, then cam2world = inv(.); + OpenCV convention; view 0 comes out ~identity by construction + (gaussians are predicted in view 0's frame), no re-anchor. + * scene scale : upstream's `estimate_poses` does NOT normalize; the runner + (run_freesplatter_scene) rescales camera centers so the + view0->view1 baseline is 1.0 (s = 1/(||t1-t0||+1e-2)). See + `rescale_to_baseline`. (An earlier 1/mean||pts|| guess was + wrong -- verified against .cache/upstream/{model,runner}.py.) + +Two solver backends, selected by `backend`: + - "cv2" : the exact upstream call (needs opencv; available in the CUDA + container or after adding opencv to the nix shell). Use this for + numerical parity against the original. + - "numpy": a dependency-free DLT + RANSAC reference solver, so the pipeline and + its golden tests run anywhere numpy is present. Verified against + analytic ground truth in test_pose.py. + - "auto" : cv2 if importable, else numpy. + +Gaussian channel layout (scene, 23 ch): xyz[0:3] SH[3:15] opacity[15] scale[16:19] +rotation[19:23]; opacity is assumed already activated (sigmoid) in [0,1]. +""" +import numpy as np + +OPACITY_CHANNEL = 15 + + +# --- numpy reference PnP (known intrinsics) -------------------------------- + +def _dlt(Xw, xn): + """Direct Linear Transform for world2cam [R|t] from normalized image coords. + + Xw : (N,3) world points. xn : (N,2) normalized coords ( K^-1 [u v 1] ). + Returns the raw 3x4 matrix p (defined up to sign/scale); decode with _decode. + """ + N = len(Xw) + A = np.zeros((2 * N, 12)) + Xh = np.hstack([Xw, np.ones((N, 1))]) # (N,4) + A[0::2, 0:4] = Xh + A[0::2, 8:12] = -xn[:, 0:1] * Xh + A[1::2, 4:8] = Xh + A[1::2, 8:12] = -xn[:, 1:2] * Xh + _, _, Vt = np.linalg.svd(A) + return Vt[-1].reshape(3, 4) + + +def _decode(p, Xw, xn): + """Turn a DLT matrix into a proper (R,t) world2cam, resolving sign by + reprojection error and cheirality (points must sit in front of the camera).""" + best = None + for sgn in (1.0, -1.0): + M = sgn * p[:, :3] + tau = sgn * p[:, 3] + U, Sv, Vt = np.linalg.svd(M) + d = np.linalg.det(U @ Vt) + R = U @ np.diag([1, 1, d]) @ Vt # nearest proper rotation + lam = Sv.mean() # M ~= lam * R + if lam < 1e-12: + continue + t = tau / lam + Xc = (R @ Xw.T).T + t + if np.any(Xc[:, 2] <= 0): + err = np.inf + else: + proj = Xc[:, :2] / Xc[:, 2:3] + err = float(np.sqrt(((proj - xn) ** 2).sum(1)).mean()) + if best is None or err < best[0]: + best = (err, R, t) + if best is None: + raise np.linalg.LinAlgError("PnP decode failed (all-behind camera)") + return best[1], best[2], best[0] + + +def solve_pnp_numpy(Xw, pixels, K, thresh_px=5.0, iters=100, seed=0): + """RANSAC DLT PnP. Returns (cam2world 4x4, inlier_mask).""" + Kinv = np.linalg.inv(K) + uv1 = np.hstack([pixels, np.ones((len(pixels), 1))]) + xn = (Kinv @ uv1.T).T[:, :2] # normalized image coords + thresh = thresh_px / K[0, 0] # px tol -> normalized tol + + rng = np.random.default_rng(seed) + n = len(Xw) + best = np.zeros(n, bool) + for _ in range(iters): + idx = rng.choice(n, 6, replace=False) + try: + R, t, _ = _decode(_dlt(Xw[idx], xn[idx]), Xw[idx], xn[idx]) + except np.linalg.LinAlgError: + continue + Xc = (R @ Xw.T).T + t + good = Xc[:, 2] > 0 + proj = np.full((n, 2), 1e9) + proj[good] = Xc[good, :2] / Xc[good, 2:3] + inl = np.linalg.norm(proj - xn, axis=1) < thresh + if inl.sum() > best.sum(): + best = inl + if best.sum() < 6: + best = np.ones(n, bool) + R, t, _ = _decode(_dlt(Xw[best], xn[best]), Xw[best], xn[best]) + world2cam = np.eye(4) + world2cam[:3, :3] = R + world2cam[:3, 3] = t + return np.linalg.inv(world2cam), best + + +def solve_pnp_cv2(Xw, pixels, K, iters=20, reproj=5.0): + """Exact upstream call (needs cv2). Returns (cam2world, inlier_mask).""" + import cv2 + ok, rvec, tvec, inliers = cv2.solvePnPRansac( + Xw.astype(np.float64), pixels.astype(np.float64), K.astype(np.float64), + None, iterationsCount=iters, reprojectionError=reproj, + flags=cv2.SOLVEPNP_SQPNP) + if not ok: + raise RuntimeError("cv2.solvePnPRansac failed") + R, _ = cv2.Rodrigues(rvec) + world2cam = np.eye(4) + world2cam[:3, :3] = R + world2cam[:3, 3] = tvec[:, 0] + mask = np.zeros(len(Xw), bool) + if inliers is not None: + mask[inliers[:, 0]] = True + return np.linalg.inv(world2cam), mask + + +def _have_cv2(): + try: + import cv2 # noqa: F401 + return True + except Exception: + return False + + +def solve_pnp(Xw, pixels, K, backend="auto", **kw): + if backend == "auto": + backend = "cv2" if _have_cv2() else "numpy" + if backend == "cv2": + return solve_pnp_cv2(Xw, pixels, K, **kw) + return solve_pnp_numpy(Xw, pixels, K, **kw) + + +# --- orchestration --------------------------------------------------------- + +def pixel_grid(H, W): + """(H*W, 2) INTEGER pixel coords [x=col, y=row], row-major -- matches upstream + xy_grid / np.mgrid[:W,:H].T exactly (NO half-pixel offset). Row-major so it + lines up with a C-order (H,W,3) gaussian map flattened by reshape(-1,3).""" + xs, ys = np.meshgrid(np.arange(W), np.arange(H)) # both (H,W): xs=col, ys=row + return np.stack([xs.ravel(), ys.ravel()], axis=1).astype(float) + + +def rescale_to_baseline(cam2world): + """Scene normalization from runner.run_freesplatter_scene: rescale camera + centers so the view0->view(last) baseline is 1.0. Returns (list, scale_factor). + + Mirrors upstream exactly: baseline = ||t_last - t_0|| + 1e-2; s = 1/baseline; + every camera translation *= s (rotations untouched).""" + t0 = cam2world[0][:3, 3] + baseline = float(np.linalg.norm(cam2world[-1][:3, 3] - t0)) + 1e-2 + s = 1.0 / baseline + out = [] + for c in cam2world: + c = c.copy() + c[:3, 3] *= s + out.append(c) + return out, s + + +def estimate_poses(points, opacities=None, focal=None, pp=None, + opacity_threshold=0.05, backend="auto", cam_frame_points=None, + use_first_focal=True, focal_use_opacity_mask=False, + pnp_iter=10, normalize=False): + """Faithful port of FreeSplatter's scene estimate_poses (.cache/upstream/model.py) + plus the runner's optional baseline rescale. + + Defaults mirror the scene recipe: focal from view 0 over ALL pixels + (use_first_focal=True, masks=None), per-view PnP masked by opacity>thr, + pnp_iter=10, cam2world=inv(world2cam). estimate_poses returns the RAW poses + (view 0 comes out ~identity because the gaussians live in view 0's frame -- no + explicit re-anchor, matching upstream). Set normalize=True for the runner's + 1/baseline camera rescale (the scene viewer normalization). + + points : list of (H,W,3) gaussian-center maps (view 0 frame). + opacities : list of (H,W) activated opacities for the PnP mask (or None). + cam_frame_points: per-view points in each view's OWN frame for focal. With + use_first_focal only view 0 is used, and view 0's frame IS the + world frame, so points[0] suffices; pass None to use `points`. + focal_use_opacity_mask: scene=False (all pixels, faithful); object would pass an + alpha mask -- approximated here by the opacity mask if True. + Returns dict: cam2world (list of 4x4), focal, scale. + """ + from focal import estimate_focal + + backend_cv2 = backend == "cv2" or (backend == "auto" and _have_cv2()) + + N = len(points) + H, W = points[0].shape[:2] + if pp is None: + pp = np.array([W / 2.0, H / 2.0]) + grid = pixel_grid(H, W) + + masks = [] + for i in range(N): + if opacities is None: + masks.append(np.ones(H * W, bool)) + else: + masks.append(opacities[i].ravel() > opacity_threshold) + + # focal: mirror estimate_focals -- view 0 only (use_first_focal), all pixels + # unless an opacity mask is explicitly requested. + if focal is None: + cfp = cam_frame_points if cam_frame_points is not None else points + n_focal = 1 if use_first_focal else N + fs = [] + for i in range(n_focal): + m = masks[i] if (focal_use_opacity_mask and opacities is not None) \ + else np.ones(H * W, bool) + fs.append(estimate_focal(cfp[i].reshape(-1, 3)[m], grid[m], pp)) + focal = float(np.mean(fs)) + + K = np.array([[focal, 0, pp[0]], [0, focal, pp[1]], [0, 0, 1.0]]) + + cam2world = [] + for i in range(N): + Xw = points[i].reshape(-1, 3)[masks[i]] + px = grid[masks[i]] + if backend_cv2: + c2w, _ = solve_pnp_cv2(Xw, px, K, iters=pnp_iter, reproj=5.0) + else: + c2w, _ = solve_pnp_numpy(Xw, px, K) + cam2world.append(c2w) + + scale = 1.0 + if normalize: + cam2world, scale = rescale_to_baseline(cam2world) + + return {"cam2world": cam2world, "focal": focal, "scale": scale} diff --git a/pose/test_pose.py b/pose/test_pose.py new file mode 100644 index 0000000..aec4af7 --- /dev/null +++ b/pose/test_pose.py @@ -0,0 +1,235 @@ +"""Asset-free golden tests for the pose / coordinate-normalization prototype. + +No model, no fixtures, no cv2: synthetic geometry with KNOWN ground truth, the +way CLAUDE.md wants new ops verified ("pinned to hand-computed references... a +wrong op should fail with zero fixtures"). Run: + + nix develop -c python3 pose/test_pose.py # from repo root + cd pose && nix develop -c python3 test_pose.py + +Each test prints PASS/FAIL with the numbers it checked; exit code is nonzero on +any failure. +""" +import numpy as np + +import align +import focal +import pnp + +RNG = np.random.default_rng(12345) +_fails = [] + + +def check(name, cond, detail: object = ""): + tag = "PASS" if cond else "FAIL" + print(f" [{tag}] {name}" + (f" ({detail})" if detail else "")) + if not cond: + _fails.append(name) + + +def rand_rotation(rng): + Q, _ = np.linalg.qr(rng.normal(size=(3, 3))) + if np.linalg.det(Q) < 0: + Q[:, 0] = -Q[:, 0] + return Q + + +def rot_angle(R): + return np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1))) + + +# =========================================================================== +# Coordinate-system normalization (the scale question) +# =========================================================================== + +def test_similarity_roundtrip(): + """Recover a KNOWN (s,R,t) exactly -- the fundamental correctness golden.""" + print("test_similarity_roundtrip") + X = RNG.normal(size=(200, 3)) * np.array([3, 2, 5]) + s_t, R_t, t_t = 1.7, rand_rotation(RNG), np.array([4.0, -1.0, 2.0]) + Y = align.apply_sim((s_t, R_t, t_t), X) + s, R, t = align.fit_similarity(X, Y) + check("scale", abs(s - s_t) < 1e-9, f"{s:.12f} vs {s_t}") + check("rotation", rot_angle(R @ R_t.T) < 1e-7, f"{rot_angle(R @ R_t.T):.2e} deg") + check("translation", np.linalg.norm(t - t_t) < 1e-7) + check("residual~0", align._rms(align.apply_sim((s, R, t), X) - Y) < 1e-9) + + +def test_scale_detection(): + """B differs from A by PURE uniform scale: rigid fails, similarity nails it. + This is the detector for 'you need the scale DoF' -- the expected cross-run case.""" + print("test_scale_detection") + X = RNG.normal(size=(300, 3)) * 2.0 + s_t = 2.5 + Y = align.apply_sim((s_t, rand_rotation(RNG), np.array([1.0, 2.0, -3.0])), X) + L = align.diagnose(X, Y) + check("recovered scale", abs(L["scale"] - s_t) < 1e-8, f"{L['scale']:.10f}") + check("rigid residual large", L["rigid"] / L["scene"] > 0.1, f"{L['rigid']/L['scene']:.3f}") + check("similarity residual ~0", L["similarity"] / L["scene"] < 1e-9) + check("verdict needs_scale", L["verdict"] == "needs_scale", L["verdict"]) + + +def test_nonlinear_detection(): + """Affine and genuinely-nonlinear warps are distinguished from pure scale.""" + print("test_nonlinear_detection") + X = RNG.normal(size=(400, 3)) * 2.0 + np.array([0, 0, 6.0]) + + # (a) anisotropic (affine) warp: stretch axes unequally -> not a similarity, + # but affine fits exactly. + A = np.diag([1.0, 1.6, 0.7]) @ rand_rotation(RNG) + Yaff = (A @ X.T).T + np.array([1.0, 0.0, 2.0]) + L = align.diagnose(X, Yaff) + check("affine: similarity fails", L["similarity"] / L["scene"] > 1e-3, + f"{L['similarity']/L['scene']:.3f}") + check("affine: affine fits", L["affine"] / L["scene"] < 1e-9) + check("affine: verdict needs_affine", L["verdict"] == "needs_affine", L["verdict"]) + + # (b) nonlinear warp: depth-quadratic stretch (focal-error fingerprint) -> + # even affine cannot remove it, and the residual correlates with depth. + Z = X[:, 2] + Ynl = X.copy() + Ynl[:, 0] *= (1.0 + 0.15 * (Z - Z.mean()) ** 2 / Z.var()) + L2 = align.diagnose(X, Ynl) + check("nonlinear: affine fails", L2["affine"] / L2["scene"] > 1e-3, + f"{L2['affine']/L2['scene']:.3f}") + check("nonlinear: verdict nonlinear", L2["verdict"] == "nonlinear", L2["verdict"]) + check("nonlinear: depth-correlated residual", L2["structured"], + f"corr={L2['depth_corr']:.2f}") + + +def test_outlier_robustness(): + """RANSAC similarity survives 30% gross floaters in the overlap.""" + print("test_outlier_robustness") + n = 400 + X = RNG.normal(size=(n, 3)) * 2.0 + s_t, R_t, t_t = 1.3, rand_rotation(RNG), np.array([0.5, -2.0, 1.0]) + Y = align.apply_sim((s_t, R_t, t_t), X) + n_out = int(0.3 * n) + out_idx = RNG.choice(n, n_out, replace=False) + Y[out_idx] += RNG.normal(size=(n_out, 3)) * 10.0 # floaters + s, R, t, inl = align.fit_similarity_ransac(X, Y, thresh=0.05) + check("scale recovered", abs(s - s_t) < 1e-3, f"{s:.6f} vs {s_t}") + check("rotation recovered", rot_angle(R @ R_t.T) < 0.1) + check("translation recovered", np.linalg.norm(t - t_t) < 1e-2) + check("outliers excluded", (~inl[out_idx]).mean() > 0.9, + f"{(~inl[out_idx]).mean()*100:.0f}% of outliers rejected") + + +def test_loop_closure(): + """A closed loop of consistent links ~= identity; injected scale drift shows.""" + print("test_loop_closure") + links = [] + for _ in range(4): + links.append((RNG.uniform(0.8, 1.2), rand_rotation(RNG), RNG.normal(size=3))) + # close the loop exactly: last link = inverse of the composed first three + T = align.identity() + for Ti in links[:3]: + T = align.compose(Ti, T) + links_closed = links[:3] + [align.invert(T)] + e = align.loop_closure_error(links_closed) + check("closed loop ~ identity", e["scale_err"] < 1e-9 and e["rot_deg"] < 1e-6 + and e["trans"] < 1e-9, f"scale_err={e['scale_err']:.1e}") + # now inject a 12% scale error into one link + bad = links_closed[:] + bad[0] = (bad[0][0] * 1.12, bad[0][1], bad[0][2]) + e2 = align.loop_closure_error(bad) + check("scale drift detected", abs(e2["scale_err"] - np.log(1.12)) < 1e-9, + f"scale_err={e2['scale_err']:.4f} (expected {np.log(1.12):.4f})") + + +# =========================================================================== +# PnP (verified against analytic ground truth, numpy backend) +# =========================================================================== + +def make_intrinsics(f=400.0, H=512, W=512): + pp = np.array([W / 2.0, H / 2.0]) + K = np.array([[f, 0, pp[0]], [0, f, pp[1]], [0, 0, 1.0]]) + return K, pp, f, H, W + + +def project(Pw, R, t, K): + Xc = (R @ Pw.T).T + t + proj = (K @ Xc.T).T + return proj[:, :2] / proj[:, 2:3], Xc[:, 2] + + +def test_focal_recovery(): + """Weiszfeld recovers a known focal from exact (point, pixel) pairs.""" + print("test_focal_recovery") + K, pp, f, H, W = make_intrinsics(f=437.0) + Xc = RNG.uniform([-2, -2, 4], [2, 2, 9], size=(500, 3)) # camera-frame, Z>0 + px = f * Xc[:, :2] / Xc[:, 2:3] + pp + f_est = focal.estimate_focal(Xc, px, pp) + check("exact focal", abs(f_est - f) < 1e-6, f"{f_est:.9f} vs {f}") + px_noisy = px + RNG.normal(size=px.shape) * 0.5 + f_n = focal.estimate_focal(Xc, px_noisy, pp) + check("focal under noise", abs(f_n - f) / f < 0.02, f"{f_n:.2f} vs {f}") + + +def test_pnp_recovery(): + """numpy DLT+RANSAC PnP recovers known per-view extrinsics (cheirality ok).""" + print("test_pnp_recovery") + K, pp, f, H, W = make_intrinsics() + Pw = RNG.uniform([-2, -2, 4], [2, 2, 8], size=(500, 3)) # world == view-0 cam frame + worst_R, worst_t = 0.0, 0.0 + for _ in range(8): + # small motion so all points stay in front of the moved camera + ax = RNG.normal(size=3) + ax /= np.linalg.norm(ax) + ang = np.radians(RNG.uniform(3, 20)) + K_ = np.array([[0, -ax[2], ax[1]], [ax[2], 0, -ax[0]], [-ax[1], ax[0], 0]]) + R = np.eye(3) + np.sin(ang) * K_ + (1 - np.cos(ang)) * K_ @ K_ + t = RNG.uniform(-0.6, 0.6, size=3) + px, z = project(Pw, R, t, K) + if np.any(z <= 0): + continue + c2w, inl = pnp.solve_pnp_numpy(Pw, px, K) + w2c = np.linalg.inv(c2w) + worst_R = max(worst_R, rot_angle(w2c[:3, :3] @ R.T)) + worst_t = max(worst_t, np.linalg.norm(w2c[:3, 3] - t)) + check("rotation recovered", worst_R < 1e-4, f"worst {worst_R:.2e} deg") + check("translation recovered", worst_t < 1e-5, f"worst {worst_t:.2e}") + + +def test_pnp_outliers(): + """PnP RANSAC tolerates corrupted correspondences (floaters).""" + print("test_pnp_outliers") + K, pp, f, H, W = make_intrinsics() + Pw = RNG.uniform([-2, -2, 4], [2, 2, 8], size=(600, 3)) + R = rand_rotation(RNG) + # keep rotation modest so points stay in front + R = np.eye(3) + 0.15 * (R - R.T) + R, _ = np.linalg.qr(R) + t = np.array([0.3, -0.2, 0.4]) + px, z = project(Pw, R, t, K) + keep = z > 0 + Pw, px = Pw[keep], px[keep] + n_out = int(0.25 * len(px)) + oi = RNG.choice(len(px), n_out, replace=False) + px[oi] += RNG.uniform(-150, 150, size=(n_out, 2)) # corrupt pixels + c2w, inl = pnp.solve_pnp_numpy(Pw, px, K, thresh_px=2.0, iters=300) + w2c = np.linalg.inv(c2w) + check("rotation under outliers", rot_angle(w2c[:3, :3] @ R.T) < 0.2, + f"{rot_angle(w2c[:3, :3] @ R.T):.3f} deg") + check("translation under outliers", np.linalg.norm(w2c[:3, 3] - t) < 0.02) + check("outliers rejected", (~inl[oi]).mean() > 0.85, + f"{(~inl[oi]).mean()*100:.0f}% rejected") + + +def main(): + tests = [ + test_similarity_roundtrip, test_scale_detection, test_nonlinear_detection, + test_outlier_robustness, test_loop_closure, + test_focal_recovery, test_pnp_recovery, test_pnp_outliers, + ] + for t in tests: + t() + print() + if _fails: + print(f"FAILED ({len(_fails)}): " + ", ".join(_fails)) + raise SystemExit(1) + print("ALL POSE TESTS OK") + + +if __name__ == "__main__": + main() From 2dfe8f38fee35a63ad1571981c941bcefe89ef3e Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 26 Jun 2026 16:10:49 +0100 Subject: [PATCH 02/33] pose/: dense GT-posed control (Tanks&Temples OOD, RealEstate10K validates pipeline) A known-good, ground-truth-posed control so a live-path failure can be attributed to the data/model vs our code -- and a reusable engine-vs-GT harness. - tt_control.py / tt_experiment.py: Tanks-and-Temples (NSVF) loader (OpenGL->OpenCV poses, baseline-vs-stride report) + engine-vs-GT check. Verdict: T&T is OUT OF DISTRIBUTION for FreeSplatter-scene (narrow-FOV object orbits) -- opacity confident on only 8-17% of pixels, pose error 28-145deg. Kept as harness + negative result. - re10k_control.py / re10k_fetch.py / re10k_experiment.py: RealEstate10K loader (parser + GT geometry, GT focal_512 = fy*512), yt-dlp/ffmpeg frame fetch with a dead-video skip, and the engine-vs-GT check. IN DISTRIBUTION -> the control works: relative pose recovered to 0.4-1.5deg vs INDEPENDENT GT, opacity confident on 68-75% of pixels. Validates our PnP beyond the upstream parity, and confirms the re10k camera convention. Findings: (1) the model has a CONSTANT wide-FOV focal bias (recovers ~274 vs GT ~439, ~37%) -- benign for relative accumulation (consistent across runs), off for metric scale; (2) pose error scales ~linearly (~25% of baseline rotation), no degeneracy even at ~1.4deg baselines on texture-rich interiors. Data (T&T ~1.5GB, re10k poses ~720MB, fetched frames) stays under .cache/ and the scratchpad -- gitignored, never committed. Co-Authored-By: Claude Opus 4.8 (1M context) --- pose/README.md | 25 +++++++- pose/re10k_control.py | 105 +++++++++++++++++++++++++++++++++ pose/re10k_experiment.py | 76 ++++++++++++++++++++++++ pose/re10k_fetch.py | 84 ++++++++++++++++++++++++++ pose/tt_control.py | 124 +++++++++++++++++++++++++++++++++++++++ pose/tt_experiment.py | 79 +++++++++++++++++++++++++ 6 files changed, 491 insertions(+), 2 deletions(-) create mode 100644 pose/re10k_control.py create mode 100644 pose/re10k_experiment.py create mode 100644 pose/re10k_fetch.py create mode 100644 pose/tt_control.py create mode 100644 pose/tt_experiment.py diff --git a/pose/README.md b/pose/README.md index aa048d9..431fe59 100644 --- a/pose/README.md +++ b/pose/README.md @@ -68,6 +68,17 @@ anywhere numpy is. REAL engine output (needs cv2 + a dumped `[N,H,W,23]` `.f32`). - `empirical.py` — cross-run residual-ladder on real engine output (the scale test). +### Dense GT-posed control (is it us, or the data/model?) + +- `tt_control.py` / `tt_experiment.py` — Tanks-and-Temples (NSVF) loader + engine + vs GT-pose check. **Verdict: T&T is OUT OF DISTRIBUTION** for FreeSplatter-scene + (narrow-FOV object orbits): opacity confident on only 8–17% of pixels, pose error + 28–145°. Kept as the harness + the negative result. +- `re10k_control.py` / `re10k_fetch.py` / `re10k_experiment.py` — RealEstate10K + loader (parser + GT geometry), yt-dlp/ffmpeg frame fetch, and the engine vs + GT-pose check. **In distribution → the control works**: relative pose recovered + to **0.4–1.5°** vs GT, opacity confident on 68–75% of pixels. + ## Run the tests ```sh @@ -75,6 +86,10 @@ nix develop -c python3 pose/test_pose.py # golden, numpy on nix develop -c python3 pose/check_cv2_parity.py # solver vs cv2 (needs cv2) EMP_DIR=/path/to/dumps nix develop -c python3 \ pose/check_upstream_parity.py A_scn.f32 # orchestration vs upstream +# dense control (needs the engine + a GGUF + network for re10k frames): +nix develop -c python3 pose/re10k_fetch.py CLIP.txt OUTDIR 10 +FS_DEVICE=cpu nix develop -c python3 \ + pose/re10k_experiment.py CLIP.txt OUTDIR 40 20,40,80,160 ``` ## Status @@ -86,10 +101,16 @@ EMP_DIR=/path/to/dumps nix develop -c python3 \ inlier-boundary effect of ≤0.5° on near-degenerate object data, 0° on scene). - ✅ **Empirical scale test** — cross-run mismatch is a uniform-scale similarity (scene ~11% scale drift), no nonlinear warp; `diagnose` classifies it. +- ✅ **Dense GT-posed control** — pipeline validated against INDEPENDENT GT on + in-distribution re10k (relative pose to 0.4–1.5°); T&T confirmed OOD. Finding: + the model has a constant wide-FOV focal bias (recovers ~274 vs GT ~439) — benign + for relative accumulation, off for metric scale. ## Not done yet (honest) -- A **dense/known-good control** sequence (other system or GT poses) before - trusting the live path — the baseline-sweet-spot experiment. +- **Cross-run consistency / accumulation test** on in-distribution data (the live + question: clean accumulation vs needing consensus fusion). +- A **higher-motion clip** for the wide-baseline sweet-spot sweep (the sample clip + is a slow pan, max ~6° over 160 frames). - The live pipeline itself (sliding window, sim3 pose-graph to bound drift) and consensus fusion (the floater-removal step) — separate, later. diff --git a/pose/re10k_control.py b/pose/re10k_control.py new file mode 100644 index 0000000..fe389ea --- /dev/null +++ b/pose/re10k_control.py @@ -0,0 +1,105 @@ +"""RealEstate10K loader -- the dense, GT-posed, IN-DISTRIBUTION control. + +FreeSplatter-scene is trained on RealEstate10K-style wide-FOV walkthroughs, so +(unlike Tanks-and-Temples, which is out of distribution) the model is known-good +here. Each clip .txt is: + + line 1 : YouTube URL + line k : timestamp(us) fx fy cx cy 0 0 m00 m01 m02 m03 m10 ... m23 + ^int ^normalized intrinsics ^3x4 world->camera, row-major + +Intrinsics are resolution-independent (fx normalized by width, fy by height). +Frames come from the YouTube video at the given timestamps (yt-dlp + ffmpeg). + +Camera convention: the pose is world->camera extrinsic P=[R|t] (KP maps a world +point to the image), OpenCV-style. c2w = inv([[R,t],[0,0,0,1]]). We VALIDATE this +convention empirically by checking that GT relative rotation agrees with the pose +FreeSplatter recovers on the same pair (the model is accurate in-distribution). + +GT focal after our 512x512 preprocessing (center-crop to height, resize): because +fy is normalized by height and we crop to the full height then scale to 512, + focal_512 = fy * 512 (independent of source resolution). + +Pure numpy for parsing; frame extraction shells out to yt-dlp/ffmpeg. +""" +import os +import subprocess +import numpy as np + +HERE = os.path.dirname(os.path.abspath(__file__)) +RE10K = os.environ.get("RE10K_ROOT", os.path.join(HERE, "..", ".cache", "re10k", "RealEstate10K")) + + +def parse_clip(path): + """Return (url, frames). frames: list of dicts {ts, fx,fy,cx,cy, w2c(3x4), c2w(4x4)}.""" + with open(path) as f: + lines = [ln.strip() for ln in f if ln.strip()] + url = lines[0] + frames = [] + for ln in lines[1:]: + v = ln.split() + ts = int(v[0]) + fx, fy, cx, cy = (float(x) for x in v[1:5]) + # v[5], v[6] are zeros (distortion); v[7:19] are the 3x4 world->cam + P = np.array([float(x) for x in v[7:19]], float).reshape(3, 4) + w2c = np.eye(4) + w2c[:3, :4] = P + c2w = np.linalg.inv(w2c) + frames.append({"ts": ts, "fx": fx, "fy": fy, "cx": cx, "cy": cy, + "w2c": w2c, "c2w": c2w}) + return url, frames + + +def gt_focal_512(frame): + """GT focal in the engine's 512x512 input (center-crop to height + resize).""" + return frame["fy"] * 512.0 + + +def rel_pose(c2w_a, c2w_b): + return np.linalg.inv(c2w_a) @ c2w_b + + +def rot_deg(R): + return float(np.degrees(np.arccos(np.clip((np.trace(R[:3, :3]) - 1) / 2, -1, 1)))) + + +def list_clips(split="test"): + d = os.path.join(RE10K, split) + return sorted(os.path.join(d, f) for f in os.listdir(d) if f.endswith(".txt")) + + +def clip_video_id(url): + """YouTube id from the clip header URL.""" + for key in ("watch?v=", "youtu.be/", "/embed/"): + if key in url: + return url.split(key)[1][:11] + return url.rsplit("/", 1)[-1][:11] + + +def baseline_report(clip_path): + url, fr = parse_clip(clip_path) + n = len(fr) + cen = np.array([f["c2w"][:3, 3] for f in fr]) + span = float(np.linalg.norm(cen - cen.mean(0), axis=1).mean()) + print(f"=== {os.path.basename(clip_path)} ({n} frames) {url} ===") + print(f"video id: {clip_video_id(url)} duration: " + f"{(fr[-1]['ts']-fr[0]['ts'])/1e6:.1f}s GT focal(512)~{gt_focal_512(fr[0]):.1f}") + print(f"camera-center spread (mean dist to centroid): {span:.4f}") + print(f" {'stride':>6} {'pairs':>6} {'rot(deg) med':>13} {'p10..p90':>16} " + f"{'baseline med':>13}") + for s in (1, 2, 5, 10, 20, 40): + rots, bl = [], [] + for i in range(0, n - s): + T = rel_pose(fr[i]["c2w"], fr[i + s]["c2w"]) + rots.append(rot_deg(T)); bl.append(np.linalg.norm(T[:3, 3])) + if not rots: + continue + rots, bl = np.array(rots), np.array(bl) + print(f" {s:>6} {len(rots):>6} {np.median(rots):>13.2f} " + f"{np.percentile(rots,10):>7.2f}..{np.percentile(rots,90):<7.2f} " + f"{np.median(bl):>13.4f}") + + +if __name__ == "__main__": + import sys + baseline_report(sys.argv[1]) diff --git a/pose/re10k_experiment.py b/pose/re10k_experiment.py new file mode 100644 index 0000000..a2712d3 --- /dev/null +++ b/pose/re10k_experiment.py @@ -0,0 +1,76 @@ +"""Run FreeSplatter-scene on RealEstate10K pairs (IN-distribution) and check the +recovered relative pose + focal against ground truth, vs frame stride (baseline). + +Unlike Tanks-and-Temples (out of distribution -> garbage), the model is known-good +here, so this is a real control: low poseErr confirms our PnP against an INDEPENDENT +GT (re10k poses), and agreement validates the re10k camera convention. + +Frames must already be fetched into frame_dir as fNNNN.png (re10k_fetch.py); both +anchor and anchor+stride must be among the fetched indices. + + FS_DEVICE=cpu nix develop -c python3 pose/re10k_experiment.py CLIP.txt FRAME_DIR 40 20,40,80,160 +""" +import os +import sys +import subprocess +import numpy as np + +import re10k_control as rc +import pnp + +HERE = os.path.dirname(os.path.abspath(__file__)) +GGUF = os.environ.get("FS_GGUF", os.path.join(HERE, "..", ".cache", "freesplatter-scene-f16.gguf")) +CLI = os.environ.get("FS_CLI", os.path.join(HERE, "..", "build", "release", "bin", "free_splatter-cli")) +DEVICE = os.environ.get("FS_DEVICE", "cpu") +SCRATCH = os.environ.get("SCRATCH", "/tmp") + + +def run_engine(img0, img1, out): + r = subprocess.run([CLI, "--device", DEVICE, "--out", out, GGUF, img0, img1], + capture_output=True, text=True) + if r.returncode != 0: + raise RuntimeError(f"CLI failed: {r.stderr[-500:]}") + a = np.fromfile(out, np.float32).reshape(2, 512, 512, 23) + pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)] + op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)] + return pts, op + + +def cos(a, b): + return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12)) + + +def experiment(clip, frame_dir, anchor, strides): + url, frames = rc.parse_clip(clip) + f_gt = rc.gt_focal_512(frames[0]) + print(f"=== {os.path.basename(clip)} anchor #{anchor} GT focal(512)={f_gt:.1f} " + f"device={DEVICE} ===\n{url}") + print(f" {'stride':>6} {'GTrot':>6} {'valid%':>6} {'poseErr':>8} " + f"{'transDir':>8} {'scaleRat':>8} {'focal':>7} {'fErr%':>6}") + for s in strides: + j = anchor + s + img0 = os.path.join(frame_dir, f"f{anchor:04d}.png") + img1 = os.path.join(frame_dir, f"f{j:04d}.png") + if not (os.path.exists(img0) and os.path.exists(img1)): + print(f" {s:>6} (missing frame {anchor} or {j})"); continue + out = os.path.join(SCRATCH, f"re_{anchor}_{s}.f32") + pts, op = run_engine(img0, img1, out) + res = pnp.estimate_poses(pts, op, normalize=False) + c2w, f_est = res["cam2world"], res["focal"] + rel = np.linalg.inv(c2w[0]) @ c2w[1] + G = rc.rel_pose(frames[anchor]["c2w"], frames[j]["c2w"]) + poseErr = rc.rot_deg(rel[:3, :3] @ G[:3, :3].T) + tdir = cos(rel[:3, 3], G[:3, 3]) + srat = np.linalg.norm(rel[:3, 3]) / (np.linalg.norm(G[:3, 3]) + 1e-12) + validp = min((op[0] > 0.05).mean(), (op[1] > 0.05).mean()) * 100 + ferr = abs(f_est - f_gt) / f_gt * 100 + print(f" {s:>6} {rc.rot_deg(G):>6.2f} {validp:>6.1f} {poseErr:>8.2f} " + f"{tdir:>8.4f} {srat:>8.3f} {f_est:>7.1f} {ferr:>6.1f}") + + +if __name__ == "__main__": + clip, frame_dir = sys.argv[1], sys.argv[2] + anchor = int(sys.argv[3]) if len(sys.argv) > 3 else 40 + strides = tuple(int(x) for x in sys.argv[4].split(",")) if len(sys.argv) > 4 \ + else (20, 40, 80, 160) + experiment(clip, frame_dir, anchor, strides) diff --git a/pose/re10k_fetch.py b/pose/re10k_fetch.py new file mode 100644 index 0000000..94b1770 --- /dev/null +++ b/pose/re10k_fetch.py @@ -0,0 +1,84 @@ +"""Fetch a RealEstate10K clip's frames via yt-dlp + ffmpeg -- the standard re10k +workflow, since the dataset ships only poses + YouTube URLs (frames can't be +redistributed). Gives us a dense, in-distribution, GT-posed sequence to drive the +control. Frames are written as fNNNN.png (NNNN = pose-line index) so they pair 1:1 +with re10k_control.parse_clip()'s frames. + + nix develop -c python3 pose/re10k_fetch.py CLIP.txt OUT_DIR [step] + +yt-dlp comes from `nix run nixpkgs#yt-dlp`; ffmpeg is on PATH. Dead/blocked videos +raise -- use find_live_clip to skip them. +""" +import os +import subprocess +import re10k_control as rc + + +def _ytdlp(*a): + return ["nix", "run", "nixpkgs#yt-dlp", "--", *a] + + +def is_live(url, timeout=60): + r = subprocess.run(_ytdlp("--no-warnings", "--simulate", "--get-duration", url), + capture_output=True, text=True, timeout=timeout) + return r.returncode == 0 + + +def download_video(url, out_dir, height=720, timeout=900): + os.makedirs(out_dir, exist_ok=True) + vid = os.path.join(out_dir, "video.mp4") + if os.path.exists(vid) and os.path.getsize(vid) > 0: + return vid + fmt = (f"bv*[height<={height}][ext=mp4]/bv*[height<={height}]/" + f"b[height<={height}]/best") + r = subprocess.run(_ytdlp("-f", fmt, "--no-warnings", "--no-playlist", + "-o", vid, url), + capture_output=True, text=True, timeout=timeout) + if not (os.path.exists(vid) and os.path.getsize(vid) > 0): + raise RuntimeError("yt-dlp failed:\n" + r.stderr[-600:]) + return vid + + +def extract_frames(vid, frames, indices, out_dir): + imgs = {} + for i in indices: + ts = frames[i]["ts"] / 1e6 # absolute video time (s) + p = os.path.join(out_dir, f"f{i:04d}.png") + if not os.path.exists(p): + subprocess.run(["ffmpeg", "-nostdin", "-loglevel", "error", + "-ss", f"{ts:.6f}", "-i", vid, "-frames:v", "1", + "-q:v", "2", "-y", p], check=True, timeout=120) + imgs[i] = p + return imgs + + +def fetch(clip_path, out_dir, indices=None, step=10, height=720): + url, frames = rc.parse_clip(clip_path) + if indices is None: + indices = list(range(0, len(frames), step)) + vid = download_video(url, out_dir, height) + imgs = extract_frames(vid, frames, indices, out_dir) + return url, frames, imgs + + +def find_live_clip(split="test", start=0, limit=300, min_frames=120): + """First clip in [start, start+limit) that has >= min_frames AND a live video.""" + for cp in rc.list_clips(split)[start:start + limit]: + _, fr = rc.parse_clip(cp) + if len(fr) < min_frames: + continue + try: + if is_live(rc.parse_clip(cp)[0]): + return cp + except subprocess.TimeoutExpired: + continue + return None + + +if __name__ == "__main__": + import sys + clip, out_dir = sys.argv[1], sys.argv[2] + step = int(sys.argv[3]) if len(sys.argv) > 3 else 10 + url, frames, imgs = fetch(clip, out_dir, step=step) + print(f"got {len(imgs)} frames from {url}") + print("indices:", sorted(imgs)[:6], "...", sorted(imgs)[-3:]) diff --git a/pose/tt_control.py b/pose/tt_control.py new file mode 100644 index 0000000..fdd402f --- /dev/null +++ b/pose/tt_control.py @@ -0,0 +1,124 @@ +"""Tanks-and-Temples (NSVF release) loader -- the dense, GT-posed control. + +A known-good, ground-truth-posed sequence so that when the live FreeSplatter path +misbehaves we can tell whether it's the data/model or our code. NSVF layout per +scene (.cache/tt/TanksAndTemple//): + rgb/{split}_{idx}_{origframe}.png 1920x1080 + pose/{split}_{idx}_{origframe}.txt 4x4 camera-to-world, OpenGL convention + intrinsics.txt 4x4 (fx 0 cx 0 / 0 fy cy 0 / ...) +split 0 = train, 1 = test (held-out orbit views). Frames sorted by origframe give +true capture order. + +What this gives the control: + * GT relative pose between any two frames -> check our PnP relative pose. + * GT focal after 512x512 preprocessing -> check FreeSplatter's recovered focal. + * GT baseline / parallax per frame-stride -> drive the baseline sweep. + +OpenGL->OpenCV: negate the camera y,z axes (columns 1,2 of the c2w rotation), +matching the runner's `c2ws[:, :, 1:3] *= -1`. cam2world, OpenCV (x right, y down, +z forward) -- the same convention our pnp.estimate_poses returns. + +Usage: + nix develop -c python3 pose/tt_control.py [Scene] # default Truck +""" +import os +import re +import sys +import numpy as np + +TT_ROOT = os.environ.get( + "TT_ROOT", + os.path.join(os.path.dirname(__file__), "..", ".cache", "tt", "TanksAndTemple")) + +_GL2CV = np.diag([1.0, -1.0, -1.0, 1.0]) # negate camera y,z axes (right-mul) + + +def gl_to_cv(c2w): + """OpenGL camera-to-world -> OpenCV camera-to-world.""" + return c2w @ _GL2CV + + +def load_intrinsics(scene_dir): + M = np.loadtxt(os.path.join(scene_dir, "intrinsics.txt")) + return {"fx": float(M[0, 0]), "fy": float(M[1, 1]), + "cx": float(M[0, 2]), "cy": float(M[1, 2])} + + +def _origframe(name): + # {split}_{idx}_{origframe}.ext -> int(origframe) + return int(re.split(r"[_.]", name)[2]) + + +def load_scene(scene, split=None): + """Return (frames, intr). frames: list of dicts {name, idx, orig, img, c2w(OpenCV)} + sorted by capture order. split=0/1 to filter train/test, None for all.""" + scene_dir = scene if os.path.isdir(scene) else os.path.join(TT_ROOT, scene) + intr = load_intrinsics(scene_dir) + pose_dir, rgb_dir = os.path.join(scene_dir, "pose"), os.path.join(scene_dir, "rgb") + frames = [] + for pf in os.listdir(pose_dir): + if not pf.endswith(".txt"): + continue + sp = int(pf.split("_")[0]) + if split is not None and sp != split: + continue + c2w_gl = np.loadtxt(os.path.join(pose_dir, pf)) + img = os.path.join(rgb_dir, pf[:-4] + ".png") + frames.append({"name": pf[:-4], "split": sp, "orig": _origframe(pf), + "img": img, "c2w": gl_to_cv(c2w_gl)}) + frames.sort(key=lambda f: f["orig"]) + return frames, intr + + +def rel_pose(c2w_a, c2w_b): + """Camera b expressed in camera a's frame: inv(c2w_a) @ c2w_b (OpenCV).""" + return np.linalg.inv(c2w_a) @ c2w_b + + +def rot_deg(R): + return float(np.degrees(np.arccos(np.clip((np.trace(R[:3, :3]) - 1) / 2, -1, 1)))) + + +def scene_radius(frames): + """Mean camera-center distance to the centroid of all camera centers.""" + cen = np.array([f["c2w"][:3, 3] for f in frames]) + return float(np.linalg.norm(cen - cen.mean(0), axis=1).mean()), cen.mean(0) + + +def baseline_report(scene="Truck"): + frames, intr = load_scene(scene) + n = len(frames) + radius, _ = scene_radius(frames) + f512 = intr["fx"] * 512.0 / 1080.0 # after center-crop 1080 + resize 512 + print(f"=== {scene}: {n} frames " + f"(train {sum(f['split']==0 for f in frames)}, " + f"test {sum(f['split']==1 for f in frames)}) ===") + print(f"intrinsics 1920x1080: fx={intr['fx']:.1f} fy={intr['fy']:.1f} " + f"cx={intr['cx']:.1f} cy={intr['cy']:.1f}") + print(f"GT focal after 512x512 preprocess (center-crop 1080 -> resize): " + f"{f512:.1f} px (pp=256)") + print(f"orbit radius (mean cam dist to center): {radius:.4f}") + print("\nframe-stride sweep -- viewpoint change between frame i and i+stride:") + print(f" {'stride':>6} {'pairs':>6} {'rot(deg) med':>13} {'p10..p90':>16} " + f"{'baseline/r med':>15} {'parallax(deg)':>14}") + for s in (1, 2, 3, 5, 10, 20, 40): + rots, bl = [], [] + for i in range(0, n - s): + T = rel_pose(frames[i]["c2w"], frames[i + s]["c2w"]) + rots.append(rot_deg(T)) + bl.append(np.linalg.norm(T[:3, 3])) + if not rots: + continue + rots, bl = np.array(rots), np.array(bl) + blr = bl / radius + # small-angle parallax ~ baseline / radius (rad) -> deg + par = np.degrees(np.median(blr)) + print(f" {s:>6} {len(rots):>6} {np.median(rots):>13.2f} " + f"{np.percentile(rots,10):>7.2f}..{np.percentile(rots,90):<7.2f} " + f"{np.median(blr):>15.3f} {par:>14.2f}") + print("\nUse this to pick pair strides for the sweet-spot sweep: too small a " + "stride -> ~0 parallax (degenerate depth); too large -> low overlap.") + + +if __name__ == "__main__": + baseline_report(sys.argv[1] if len(sys.argv) > 1 else "Truck") diff --git a/pose/tt_experiment.py b/pose/tt_experiment.py new file mode 100644 index 0000000..8f9449a --- /dev/null +++ b/pose/tt_experiment.py @@ -0,0 +1,79 @@ +"""Run FreeSplatter-scene on Tanks-and-Temples pairs; check recovered pose + focal +against GROUND TRUTH as a function of frame stride (baseline). The control: GT +poses are known-good, so error here is OUR pipeline or the model being out of +distribution -- not unknown data. + +Metrics per pair (view0 -> view1): + GTrot : GT relative rotation (deg) = the viewpoint change we asked for. + valid% : min over views of opacity>0.05 fraction (PnP support). + poseErr : angle between our recovered relative rotation and GT (deg) -- the + scale-free "is the pose right" check. + transDir : cos angle between our relative-translation direction and GT (1=perfect; + FreeSplatter's metric scale is arbitrary, so only direction is checked). + scaleRat : ||our t|| / ||GT t|| -- the monocular scale FreeSplatter chose vs metric. + focalErr%: |recovered focal - GT focal(512)| / GT. + + FS_DEVICE=cpu nix develop -c python3 pose/tt_experiment.py Truck 120 2,5,10,20 +""" +import os +import sys +import subprocess +import numpy as np + +import tt_control as tt +import pnp + +HERE = os.path.dirname(os.path.abspath(__file__)) +GGUF = os.environ.get("FS_GGUF", os.path.join(HERE, "..", ".cache", "freesplatter-scene-f16.gguf")) +CLI = os.environ.get("FS_CLI", os.path.join(HERE, "..", "build", "release", "bin", "free_splatter-cli")) +DEVICE = os.environ.get("FS_DEVICE", "cpu") +SCRATCH = os.environ.get("SCRATCH", "/tmp") + + +def run_engine(img0, img1, out): + r = subprocess.run([CLI, "--device", DEVICE, "--out", out, GGUF, img0, img1], + capture_output=True, text=True) + if r.returncode != 0: + raise RuntimeError(f"CLI failed: {r.stderr[-500:]}") + a = np.fromfile(out, np.float32).reshape(2, 512, 512, 23) + pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)] + op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)] + return pts, op + + +def cos(a, b): + return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12)) + + +def experiment(scene="Truck", anchor=120, strides=(2, 5, 10, 20)): + frames, intr = tt.load_scene(scene) + f_gt = intr["fx"] * 512.0 / 1080.0 + print(f"=== {scene} anchor #{anchor} ({frames[anchor]['name']}) " + f"GT focal(512)={f_gt:.1f} device={DEVICE} ===") + print(f" {'stride':>6} {'GTrot':>6} {'valid%':>6} {'poseErr':>8} " + f"{'transDir':>8} {'scaleRat':>8} {'focal':>7} {'fErr%':>6}") + for s in strides: + j = anchor + s + if j >= len(frames): + continue + out = os.path.join(SCRATCH, f"tt_{scene}_{anchor}_{s}.f32") + pts, op = run_engine(frames[anchor]["img"], frames[j]["img"], out) + res = pnp.estimate_poses(pts, op, normalize=False) # scene recipe + c2w, f_est = res["cam2world"], res["focal"] + rel = np.linalg.inv(c2w[0]) @ c2w[1] + G = tt.rel_pose(frames[anchor]["c2w"], frames[j]["c2w"]) + poseErr = tt.rot_deg(rel[:3, :3] @ G[:3, :3].T) + tdir = cos(rel[:3, 3], G[:3, 3]) + srat = np.linalg.norm(rel[:3, 3]) / (np.linalg.norm(G[:3, 3]) + 1e-12) + validp = min((op[0] > 0.05).mean(), (op[1] > 0.05).mean()) * 100 + ferr = abs(f_est - f_gt) / f_gt * 100 + print(f" {s:>6} {tt.rot_deg(G):>6.1f} {validp:>6.1f} {poseErr:>8.2f} " + f"{tdir:>8.4f} {srat:>8.3f} {f_est:>7.1f} {ferr:>6.1f}") + + +if __name__ == "__main__": + scene = sys.argv[1] if len(sys.argv) > 1 else "Truck" + anchor = int(sys.argv[2]) if len(sys.argv) > 2 else 120 + strides = tuple(int(x) for x in sys.argv[3].split(",")) if len(sys.argv) > 3 \ + else (2, 5, 10, 20) + experiment(scene, anchor, strides) From 59aaea8b61cadd9e267626cb2ed8913637e24153 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 26 Jun 2026 16:17:19 +0100 Subject: [PATCH 03/33] pose/: cross-run consistency on in-distribution re10k (accumulation is clean) re10k_crossrun.py: a frame shared by two overlapping pairs (m-s,m) and (m,m+s) is reconstructed twice in two coordinate systems; fit a robust similarity between the two and read consistency + the residual ladder, swept over baseline stride. Mirrors empirical.py but on in-distribution data where geometry is good. Result (shared frame 120): cross-run consistency is HIGH and best at small baseline -- stride 20 (0.41deg): 65% of pixels agree within 2% of scene extent, 98% within 10%; stride 80 (6deg): 46% / 95%. Versus the OOD doll's 7% / 21%. The mismatch is a clean uniform-scale similarity everywhere (sim->affine buys ~0.1%, verdict similarity_plus_noise); per-step scale drift is small and grows with baseline (1.7% at 0.4deg, 11% at 6deg). Takeaways for the live path: (1) the small-baseline hypothesis holds -- the sliding window's small steps land in the high-consistency regime, so accumulation is mostly clean; (2) consensus fusion is a polish for the residual ~2% floaters, not a prerequisite; (3) registration is a 7-DoF similarity (no nonlinear warp), with slow scale drift a sim3 pose-graph can bound. Co-Authored-By: Claude Opus 4.8 (1M context) --- pose/re10k_crossrun.py | 69 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 pose/re10k_crossrun.py diff --git a/pose/re10k_crossrun.py b/pose/re10k_crossrun.py new file mode 100644 index 0000000..422c1d5 --- /dev/null +++ b/pose/re10k_crossrun.py @@ -0,0 +1,69 @@ +"""Cross-run consistency on IN-DISTRIBUTION re10k -- the accumulation question. + +A shared frame m is reconstructed by two overlapping pairs: + run A = (m-s, m) -> frame m is view 1 (gaussians in frame (m-s)'s world) + run B = (m, m+s) -> frame m is view 0 (gaussians in frame m's world) +Frame m's per-pixel gaussian centers from A and B are the SAME physical points in +two coordinate systems -- exact correspondences. Fit a robust similarity B->A and +read the residual ladder + how many pixels actually agree. High agreement => clean +accumulation; low => floaters dominate and we need consensus fusion. + +This mirrors empirical.py but on data the model handles well (re10k), and sweeps +the baseline stride s so we can see whether a sweet spot maximizes consistency. + + FS_DEVICE=cpu nix develop -c python3 pose/re10k_crossrun.py CLIP.txt FRAME_DIR 120 20,40,80 +""" +import os +import sys +import numpy as np + +import re10k_control as rc +import re10k_experiment as ex # run_engine +import align + +SCRATCH = os.environ.get("SCRATCH", "/tmp") +THR = 0.05 + + +def crossrun(clip, frame_dir, m, strides): + _, frames = rc.parse_clip(clip) + print(f"=== cross-run consistency: {os.path.basename(clip)} shared frame #{m} ===") + print(f" {'stride':>6} {'GTrot(i-j)':>10} {'valid%':>6} {'scale':>6} " + f"{'<1%':>5} {'<2%':>5} {'<5%':>5} {'<10%':>6} {'rigid%':>7} {'sim%':>6} " + f"{'aff%':>6} {'verdict':>20}") + fp = lambda i: os.path.join(frame_dir, f"f{i:04d}.png") + for s in strides: + i, j = m - s, m + s + if not all(os.path.exists(fp(k)) for k in (i, m, j)): + print(f" {s:>6} (missing a frame among {i},{m},{j})"); continue + ptsA, opA = ex.run_engine(fp(i), fp(m), os.path.join(SCRATCH, f"xrA_{m}_{s}.f32")) + ptsB, opB = ex.run_engine(fp(m), fp(j), os.path.join(SCRATCH, f"xrB_{m}_{s}.f32")) + XA, oA = ptsA[1].reshape(-1, 3), opA[1].reshape(-1) # m as view1 of A + XB, oB = ptsB[0].reshape(-1, 3), opB[0].reshape(-1) # m as view0 of B + mask = (oA > THR) & (oB > THR) + A, B = XA[mask], XB[mask] + if mask.sum() < 200: + print(f" {s:>6} (too few valid: {mask.sum()})"); continue + scene = align._rms(A - A.mean(0)) + sc, R, t, inl = align.fit_similarity_ransac(B, A, thresh=0.02 * scene, iters=400) + res = np.linalg.norm(align.apply_sim((sc, R, t), B) - A, axis=1) + within = {q: 100 * (res < q * scene).mean() for q in (0.01, 0.02, 0.05, 0.10)} + L = align.diagnose(B[inl], A[inl]) + gtrot = rc.rot_deg(rc.rel_pose(frames[i]["c2w"], frames[j]["c2w"])) + print(f" {s:>6} {gtrot:>10.2f} {100*mask.mean():>6.1f} {sc:>6.3f} " + f"{within[0.01]:>5.0f} {within[0.02]:>5.0f} {within[0.05]:>5.0f} " + f"{within[0.10]:>6.0f} {100*L['rigid']/L['scene']:>7.1f} " + f"{100*L['similarity']/L['scene']:>6.1f} {100*L['affine']/L['scene']:>6.1f} " + f"{L['verdict']:>20}") + print("\n accumulation\n" + "is clean; low => most per-pixel gaussians are partner-dependent floaters " + "(need consensus fusion).") + + +if __name__ == "__main__": + clip, frame_dir = sys.argv[1], sys.argv[2] + m = int(sys.argv[3]) if len(sys.argv) > 3 else 120 + strides = tuple(int(x) for x in sys.argv[4].split(",")) if len(sys.argv) > 4 \ + else (20, 40, 80) + crossrun(clip, frame_dir, m, strides) From a3a6641ee883da367cf8ed6c281189248be6ed57 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 26 Jun 2026 21:33:56 +0100 Subject: [PATCH 04/33] CLAUDE.md: PnP now in scope; C++/Go-only policy, no-Python runtime Policy update directed by the project owner: - PnP pose recovery and cross-run Sim(3) alignment / accumulation are now IN scope (previously out of scope), alongside the engine. - Everything ships in C++; Go only for the demo web server (purego -> C API -> Vulkan + WebGL). The CLI and C API must have NO Python dependency at runtime. - Python is confined to (1) dev-time reference/conversion/validation in the CUDA docker (hf_dump/convert/compare_taps), never a runtime dep, and (2) the pose/ research prototype TEMPORARILY -- continued in Python only until the approach is proven, then rewritten in C++ and the Python deleted. - Per-component discipline now lists the C++ `pose` component, inheriting the parity discipline the Python prototype established (bit-exact to upstream estimate_poses; validated vs independent GT poses). pose/README.md updated to match (temporary prototype, C++ port pending). Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 54 ++++++++++++++++++++++++++++++++++++++------------ pose/README.md | 18 ++++++++++------- 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 1d1ec3e..d4e3a71 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,19 +2,43 @@ ## What this project is -A GGML/C++ inference engine for the **neural-network front half** of -TencentARC/FreeSplatter: image patch tokenizer → multi-view self-attention -transformer → per-pixel 3D-Gaussian parameter head. Given N uncalibrated views -it returns, per input pixel, the activated Gaussian parameters. - -**Scope:** pieces 1–3 only. The **rasterizer and the PnP pose solver are OUT OF -SCOPE** — they are a downstream consumer of the `[N, H, W, gaussian_channels]` -tensor this engine produces. Do not add them here. +A GGML/C++ engine for the **neural-network front half** of +TencentARC/FreeSplatter (image patch tokenizer → multi-view self-attention +transformer → per-pixel 3D-Gaussian parameter head) **plus the downstream +camera-pose recovery and cross-run registration that consume its output**. Given +N uncalibrated views it returns, per input pixel, the activated Gaussian +parameters; **PnP** then recovers each view's camera, and successive runs are +aligned into one accumulating world — the path toward live reconstruction from a +moving camera. + +**Scope (updated):** the engine (pieces 1–3), **PnP pose recovery (now IN +scope)**, and the cross-run Sim(3) alignment / accumulation. Keep the seam at the +`[N, H, W, gaussian_channels]` tensor clean — it is the contract between the +engine and the pose consumer. Rendering itself stays in the demo viewer +(Vulkan/WebGL), not the engine. Target checkpoint first: **`freesplatter-scene`** (`gaussian_channels=23`, `sh_residual=true`, black background). The transformer backbone is identical across all three variants, so object/object-2dgs are cheap follow-ons. +## Language & dependency policy + +- **Everything ships in C++** (the engine, PnP, and the alignment/accumulation + once proven). **Go only for the demo web server** — the purego layer that drives + the C API → Vulkan + WebGL viewer. +- **The CLI and the C API must have NO Python dependency at runtime.** Every piece + of current and future functionality is reachable from `free_splatter-cli` and + `include/free_splatter.h` without invoking Python. +- **Python is confined to two places, neither shipped:** + 1. **Dev-time reference / conversion / validation** that runs in + `docker/Dockerfile.cuda` (`scripts/hf_dump.py`, `convert.py`, + `compare_taps.py`, …) — the only place torch runs; never a runtime dependency. + 2. **The `pose/` research prototype, TEMPORARILY.** It may continue in Python + (numpy + cv2) **only until the accumulating-reconstruction approach is + proven**. Once proven it is **rewritten in C++**, exposed via the CLI + C API, + and **the Python is deleted.** `pose/` is not wired into CMake/ctest and is a + throwaway prototype, not a parallel implementation to maintain. + ## Validation is the backbone (non-negotiable) This is a numerical port. Correctness means matching the PyTorch reference @@ -47,11 +71,15 @@ This is a numerical port. Correctness means matching the PyTorch reference ## Per-component discipline -Each component (`image`, `gguf_loader`, `backend`, `model`, the head) has its own -unit test and is made independently green **before** cross-component parity. -Component boundaries match the file layout. Keep the seam at the -`[N,H,W,gaussian_channels]` tensor clean — that is the contract with any -rasterizer. +Each component (`image`, `gguf_loader`, `backend`, `model`, the head, and — once +ported to C++ — **`pose` = PnP + focal + Sim(3) alignment**) has its own unit test +and is made independently green **before** cross-component parity. Component +boundaries match the file layout. Keep the seam at the +`[N,H,W,gaussian_channels]` tensor clean — that is the contract between the engine +and the pose/rendering consumers. The C++ `pose` component inherits the parity +discipline the Python prototype already established: **bit-exact to upstream +`estimate_poses`** (see `pose/check_upstream_parity.py`) and **validated against +independent ground-truth poses** (`pose/re10k_experiment.py`). ## Debugging philosophy (mandatory sequence) diff --git a/pose/README.md b/pose/README.md index 431fe59..05f66f5 100644 --- a/pose/README.md +++ b/pose/README.md @@ -1,12 +1,16 @@ # pose/ — camera recovery + cross-run alignment (downstream prototype) -**Status: research prototype, deliberately OUTSIDE the validated engine.** - -The free-splatter.cpp engine is pieces 1–3 only; CLAUDE.md keeps the PnP pose -solver out of `src/` on purpose. This directory is the *downstream consumer* of -the engine's `[N, H, W, 23]` output — a self-contained, pure-Python prototype, -**not wired into the CMake build or `ctest`**. It exists to prototype live, -accumulating reconstruction from a moving-camera feed. +**Status: TEMPORARY Python prototype — to be rewritten in C++ once proven, then +deleted.** + +PnP pose recovery and cross-run alignment are now **in scope** (see CLAUDE.md). +The shipped implementation will be **C++**, reachable from `free_splatter-cli` and +the C API with **no Python dependency**. This directory is the throwaway research +prototype that proves the accumulating-reconstruction approach first: a +self-contained numpy + cv2 consumer of the engine's `[N, H, W, 23]` output, +**not wired into the CMake build or `ctest`**. Once the approach is proven it is +ported to C++ and this Python is removed — it is not a parallel implementation to +maintain. ## Why it's needed From 0ecadeea7afb1bd400b2ad028dddf34bdd84f0f3 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 26 Jun 2026 21:46:30 +0100 Subject: [PATCH 05/33] pose/: sliding-window accumulation prototype (the live idea, proven end-to-end) accumulate.py assembles the validated pieces into the live pipeline (minus realtime): slide a window over a re10k clip, recover each pair's camera (PnP), fit a Sim(3) between consecutive runs from their SHARED-frame per-pixel correspondences and compose into a per-run global transform (align.compose), drop every frame's gaussians into one world (frame f_0's), and measure the recovered camera trajectory vs ground truth. Engine dumps are cached so analysis re-runs skip inference. render_ply.py projects the colored cloud through a pinhole camera to a PNG. Result (13 pairs, stride 20, frames 0..260): - per-link Sim(3) registration clean: residual ~1.0-1.4% of scene extent. - scale drift accumulates monotonically (forward pan, no revisit): 0.755 over 12 links, ~2.3%/link -- the monocular 1/d drift compounding. - recovered camera trajectory tracks GT to ATE ~11% of extent (single global Sim(3) align); drift grows 7%->13% first->second half, worst at the endpoint -- exactly what a Sim(3) pose-graph + loop closure bounds. - the 2.6M-point accumulated cloud renders from camera 0 as a COHERENT room that matches the input frame (wider FOV due to the model's ~274-vs-439 focal bias). The accumulating-reconstruction idea is proven; per CLAUDE.md the next implementation step is the C++ port (CLI + C API), after which this Python goes. Co-Authored-By: Claude Opus 4.8 (1M context) --- pose/README.md | 27 ++++++-- pose/accumulate.py | 157 +++++++++++++++++++++++++++++++++++++++++++++ pose/render_ply.py | 54 ++++++++++++++++ 3 files changed, 234 insertions(+), 4 deletions(-) create mode 100644 pose/accumulate.py create mode 100644 pose/render_ply.py diff --git a/pose/README.md b/pose/README.md index 05f66f5..d0322e6 100644 --- a/pose/README.md +++ b/pose/README.md @@ -82,6 +82,21 @@ anywhere numpy is. loader (parser + GT geometry), yt-dlp/ffmpeg frame fetch, and the engine vs GT-pose check. **In distribution → the control works**: relative pose recovered to **0.4–1.5°** vs GT, opacity confident on 68–75% of pixels. +- `re10k_crossrun.py` — cross-run consistency vs baseline (the accumulation + question): a shared frame's two reconstructions agree on **98% of pixels within + 10%** of scene extent at small baseline. + +### Accumulation prototype (the live idea, assembled) + +- `accumulate.py` — slide a window over a clip, recover each pair (PnP), chain a + Sim(3) between consecutive runs via their shared-frame correspondences + (`align.compose`), drop every frame's gaussians into one world, and measure the + recovered camera trajectory vs GT. `render_ply.py` — pinhole projection of the + colored cloud to a PNG (visual coherence check). +- **Result (13 pairs, stride 20):** per-link Sim(3) residual ~1.0–1.4%; trajectory + ATE **11%** of extent with drift growing 7%→13% (monotone monocular scale drift, + 0.755 over 12 links); the 2.6M-point cloud renders as a **coherent room** matching + the input. The accumulating-reconstruction idea is **proven** end-to-end. ## Run the tests @@ -109,12 +124,16 @@ FS_DEVICE=cpu nix develop -c python3 \ in-distribution re10k (relative pose to 0.4–1.5°); T&T confirmed OOD. Finding: the model has a constant wide-FOV focal bias (recovers ~274 vs GT ~439) — benign for relative accumulation, off for metric scale. +- ✅ **Accumulation prototype** — sliding-window Sim(3) chaining over a clip builds + one coherent world; trajectory tracks GT to ~11% ATE with bounded-able drift. The + idea is proven; per CLAUDE.md the next implementation step is the **C++ port**. ## Not done yet (honest) -- **Cross-run consistency / accumulation test** on in-distribution data (the live - question: clean accumulation vs needing consensus fusion). +- **Loop closure / Sim(3) pose-graph** to bound the monotone scale + pose drift + (the prototype chains open-loop; a forward pan never revisits). +- **Consensus fusion** to remove the residual ~2% floaters during accumulation. - A **higher-motion clip** for the wide-baseline sweet-spot sweep (the sample clip is a slow pan, max ~6° over 160 frames). -- The live pipeline itself (sliding window, sim3 pose-graph to bound drift) and - consensus fusion (the floater-removal step) — separate, later. +- The **C++ port** (CLI + C API, no Python) once the design is locked — then this + prototype is deleted. diff --git a/pose/accumulate.py b/pose/accumulate.py new file mode 100644 index 0000000..decab3a --- /dev/null +++ b/pose/accumulate.py @@ -0,0 +1,157 @@ +"""Sliding-window accumulation prototype -- assemble the validated pieces. + +Slide a window of overlapping pairs across a re10k clip: + pair k = (f_k, f_{k+1}) -> gaussians in f_k's camera frame (run k's world) +Consecutive runs share a frame (f_{k+1} is view1 of pair k and view0 of pair k+1), +so their per-pixel gaussians for that frame are exact correspondences. Fit a Sim(3) +S_k mapping run k's frame into run k-1's frame, and COMPOSE into a global transform + T_0 = I , T_k = T_{k-1} . S_k +that drops every run's gaussians into ONE world (frame f_0's). Accumulate them, +and measure the recovered camera trajectory against ground truth (drift). + +This is the live pipeline minus realtime: PnP (pnp.py) + Sim(3) align (align.py) + +chaining (align.compose) over a real sequence, justified by re10k_crossrun.py. + + FS_DEVICE=cpu nix develop -c python3 pose/accumulate.py CLIP.txt FRAME_DIR \\ + --start 0 --stride 20 --count 13 --cache DIR --ply out.ply + +Engine dumps are cached (pair_*.f32); re-runs skip inference. +""" +import os +import argparse +import numpy as np + +import re10k_control as rc +import re10k_fetch as rf +import re10k_experiment as ex # run_engine +import pnp +import align + +C0 = 0.28209479177387814 +THR = 0.05 + + +def load_dump(path): + a = np.fromfile(path, np.float32).reshape(2, 512, 512, 23) + pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)] + op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)] + rgb = [np.clip(a[v, ..., 3:6] * C0 + 0.5, 0, 1) for v in (0, 1)] + return pts, op, rgb + + +def write_ply(path, xyz, rgb): + n = len(xyz) + hdr = ("ply\nformat binary_little_endian 1.0\n" + f"element vertex {n}\n" + "property float x\nproperty float y\nproperty float z\n" + "property uchar red\nproperty uchar green\nproperty uchar blue\n" + "end_header\n") + rec = np.zeros(n, dtype=[("x", " global (frame f_0) --- + T = [align.identity()] + chain_info = [] + for k in range(1, a.count): + XA, oA = runs[k]["pts"][0].reshape(-1, 3), runs[k]["op"][0].reshape(-1) # f_k as view0 of run k + XB, oB = runs[k-1]["pts"][1].reshape(-1, 3), runs[k-1]["op"][1].reshape(-1) # f_k as view1 of run k-1 + m = (oA > THR) & (oB > THR) + scene = align._rms(XB[m] - XB[m].mean(0)) + s, R, t, inl = align.fit_similarity_ransac(XA[m], XB[m], thresh=0.02 * scene, iters=300) + S_k = (s, R, t) + T.append(align.compose(T[k-1], S_k)) + resid = align._rms(align.apply_sim(S_k, XA[m][inl]) - XB[m][inl]) / scene + chain_info.append((k, s, 100*inl.mean(), 100*m.mean(), 100*resid)) + + # --- accumulate gaussians into the global frame (each frame once) --- + XYZ, RGB = [], [] + def add(run_idx, view, Tk): + pts = runs[run_idx]["pts"][view].reshape(-1, 3) + op = runs[run_idx]["op"][view].reshape(-1) + rgb = runs[run_idx]["rgb"][view].reshape(-1, 3) + keep = op > THR + XYZ.append(align.apply_sim(Tk, pts[keep])); RGB.append(rgb[keep]) + add(0, 0, T[0]) # f_0 (view0 of pair 0) + for k in range(a.count): # f_{k+1} = view1 of pair k + add(k, 1, T[k]) + XYZ = np.concatenate(XYZ); RGB = np.concatenate(RGB) + + # --- recovered camera trajectory (global) vs GT --- + rec = [align.apply_sim(T[k], runs[k]["c2w"][0][:3, 3]) for k in range(a.count)] + rec.append(align.apply_sim(T[a.count-1], runs[a.count-1]["c2w"][1][:3, 3])) + rec = np.array(rec) + gt = np.array([frames[i]["c2w"][:3, 3] for i in idx]) + sA, RA, tA = align.fit_similarity(rec, gt) # align recovered traj -> GT + rec_al = align.apply_sim((sA, RA, tA), rec) + perr = np.linalg.norm(rec_al - gt, axis=1) + gt_ext = align._rms(gt - gt.mean(0)) + + # --- report --- + print("\nchain (per shared frame): step scale inlier% valid% resid%scene") + for k, s, inlp, vp, rp in chain_info: + print(f" {k:>3} {s:>6.3f} {inlp:>6.1f} {vp:>6.1f} {rp:>7.2f}") + total_scale = np.prod([ci[1] for ci in chain_info]) if chain_info else 1.0 + print(f"cumulative scale drift over {a.count-1} links: {total_scale:.3f} " + f"(per-link mean {total_scale**(1/max(a.count-1,1)):.4f})") + + print(f"\ntrajectory vs GT ({len(idx)} frames, after global Sim(3) align):") + print(f" ATE rms = {align._rms(rec_al-gt):.4f} ({100*align._rms(rec_al-gt)/gt_ext:.1f}% " + f"of GT extent {gt_ext:.4f})") + print(f" per-frame error % of extent: " + + " ".join(f"{100*e/gt_ext:.0f}" for e in perr)) + print(f" drift: first-half {100*perr[:len(perr)//2].mean()/gt_ext:.1f}% " + f"second-half {100*perr[len(perr)//2:].mean()/gt_ext:.1f}%") + + print(f"\naccumulated cloud: {len(XYZ):,} points (opacity>{THR})") + if a.ply: + if len(XYZ) > a.max_points: + sel = np.random.default_rng(0).choice(len(XYZ), a.max_points, replace=False) + XYZ, RGB = XYZ[sel], RGB[sel] + write_ply(a.ply, XYZ, RGB) + print(f" wrote {a.ply} ({len(XYZ):,} points)") + + +if __name__ == "__main__": + main() diff --git a/pose/render_ply.py b/pose/render_ply.py new file mode 100644 index 0000000..1cde6c7 --- /dev/null +++ b/pose/render_ply.py @@ -0,0 +1,54 @@ +"""Quick pinhole projection of a colored PLY to a PNG (z-buffered point splat) -- +a visual coherence check for the accumulated cloud, no 3D viewer needed. + + nix develop -c python3 pose/render_ply.py world.ply out.png [focal] [radius] +""" +import sys +import numpy as np +import cv2 + + +def read_ply(path): + with open(path, "rb") as f: + n = 0 + while True: + l = f.readline() + if l.startswith(b"element vertex"): + n = int(l.split()[-1]) + if l.strip() == b"end_header": + break + rec = np.frombuffer(f.read(n * 15), + dtype=[("x", " 1e-3 + Xc, col = Xc[good], rgb[good] + u = focal * Xc[:, 0] / Xc[:, 2] + W / 2 + v = focal * Xc[:, 1] / Xc[:, 2] + H / 2 + ui, vi = np.round(u).astype(int), np.round(v).astype(int) + img = np.zeros((H, W, 3), np.uint8) + order = np.argsort(-Xc[:, 2]) # far first; near overwrites + ui, vi, col = ui[order], vi[order], col[order] + for dy in range(-radius, radius + 1): + for dx in range(-radius, radius + 1): + uu, vv = ui + dx, vi + dy + inb = (uu >= 0) & (uu < W) & (vv >= 0) & (vv < H) + img[vv[inb], uu[inb]] = col[inb] + return img + + +if __name__ == "__main__": + ply, out = sys.argv[1], sys.argv[2] + focal = float(sys.argv[3]) if len(sys.argv) > 3 else 274.0 + radius = int(sys.argv[4]) if len(sys.argv) > 4 else 1 + xyz, rgb = read_ply(ply) + img = render(xyz, rgb, focal, radius=radius) + cv2.imwrite(out, cv2.cvtColor(img, cv2.COLOR_RGB2BGR)) + print(f"rendered {len(xyz):,} pts -> {out} ({(img.any(2)).mean()*100:.0f}% filled)") From 17105fdc00a9aecc3635389f715a53cccbc93be8 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 26 Jun 2026 22:30:08 +0100 Subject: [PATCH 06/33] pose/: loop closure -- machinery verified; real loop reveals it's not the lever find_loop.py searches re10k poses for a clip that revisits its start; loop_closure.py chains open-loop, measures the loop error via a closing pair (f_0,f_n), distributes it by even Sim(3) relaxation (D^(k/n)), and reports ATE before/after. Sim(3) 4x4 helpers (sim_matrix, sim_frac_power) promoted into align.py; golden test added. Honest result on a real loop clip (camera out to 2.29 and back to 0.23): - the open-loop chain ALREADY closes the loop (loop error 4.4deg / scale 1.12 / 8% trans) -- there is almost no accumulated drift to distribute. - the dominant ~34% ATE is per-link ODOMETRY NOISE (Sim(3) inlier% as low as 17-24% on the fast outbound leg) plus the model's focal-bias warp: self-consistent (the loop closes) but distorted vs GT. Loop closure can't fix that; naive distribution slightly hurts. - diagnosis confirmed not a bug: the correction recovers SYNTHETIC uniform accumulated drift to ~0 (test_pose.py::test_loop_correction, ATE 1e-15). Lesson: loop closure pays off on LONG trajectories with consistent accumulated drift (cf. the forward clip's monotone 7%->13%); for short loops the lever is better odometry -- smaller baselines, consensus fusion, and the focal bias. Golden suite green (test_loop_correction added). Co-Authored-By: Claude Opus 4.8 (1M context) --- pose/README.md | 25 ++++++-- pose/align.py | 20 ++++++ pose/find_loop.py | 70 ++++++++++++++++++++ pose/loop_closure.py | 150 +++++++++++++++++++++++++++++++++++++++++++ pose/test_pose.py | 25 +++++++- 5 files changed, 284 insertions(+), 6 deletions(-) create mode 100644 pose/find_loop.py create mode 100644 pose/loop_closure.py diff --git a/pose/README.md b/pose/README.md index d0322e6..ded0297 100644 --- a/pose/README.md +++ b/pose/README.md @@ -97,6 +97,16 @@ anywhere numpy is. ATE **11%** of extent with drift growing 7%→13% (monotone monocular scale drift, 0.755 over 12 links); the 2.6M-point cloud renders as a **coherent room** matching the input. The accumulating-reconstruction idea is **proven** end-to-end. +- `find_loop.py` — search re10k poses for a clip that revisits its start (the loop + substrate). `loop_closure.py` — chain open-loop, measure the loop error via a + closing pair, then distribute it (even Sim(3) relaxation, `align.sim_frac_power`) + and report ATE before/after. **Finding:** loop closure helps only when drift + *accumulates* into a large loop error. On the real loop clip tested it did NOT + help — the open-loop chain already closes (loop error 4.4° / scale 1.12 / 8% + trans), so the ~34% ATE is per-link **odometry noise** (inlier% as low as 17% + on fast legs) + the focal-bias warp, *self-consistent but distorted vs GT*, which + loop closure can't fix. The correction machinery itself is verified to recover + synthetic accumulated drift to ~0 (`test_pose.py::test_loop_correction`). ## Run the tests @@ -127,13 +137,18 @@ FS_DEVICE=cpu nix develop -c python3 \ - ✅ **Accumulation prototype** — sliding-window Sim(3) chaining over a clip builds one coherent world; trajectory tracks GT to ~11% ATE with bounded-able drift. The idea is proven; per CLAUDE.md the next implementation step is the **C++ port**. +- ✅ **Loop closure** — implemented + machinery verified on synthetic drift (recovers + to ~0). On a real loop clip it did NOT help: the chain already closes, so the error + is odometry noise + focal warp, not accumulated drift. Lesson: better odometry + (smaller baselines / fusion / focal) is the lever for short loops; loop closure + pays off on long trajectories with consistent accumulated drift. ## Not done yet (honest) -- **Loop closure / Sim(3) pose-graph** to bound the monotone scale + pose drift - (the prototype chains open-loop; a forward pan never revisits). -- **Consensus fusion** to remove the residual ~2% floaters during accumulation. -- A **higher-motion clip** for the wide-baseline sweet-spot sweep (the sample clip - is a slow pan, max ~6° over 160 frames). +- **Better odometry** is the bigger lever than loop closure for short clips: + smaller per-pair baselines, **consensus fusion** (remove the ~2% floaters / + lift the low-inlier links), and the **focal bias** (recovered ~274 vs GT ~440+). +- A **higher-motion clip** wide-baseline sweep, and a **long** trajectory where + loop closure demonstrably pays off. - The **C++ port** (CLI + C API, no Python) once the design is locked — then this prototype is deleted. diff --git a/pose/align.py b/pose/align.py index 55d8652..0beec07 100644 --- a/pose/align.py +++ b/pose/align.py @@ -195,6 +195,26 @@ def identity(): return (1.0, np.eye(3), np.zeros(3)) +# --- Sim(3) as 4x4 matrices (for pose-graph relaxation) -------------------- + +def sim_matrix(s, R, t): + """Sim(3) as a 4x4 homogeneous matrix [[sR, t],[0,1]] (compose == matmul).""" + M = np.eye(4) + M[:3, :3] = s * R + M[:3, 3] = t + return M + + +def sim_frac_power(M, f): + """Fractional power M^f of a Sim(3) 4x4, via eigendecomposition (real part). + + The even loop-closure relaxation: distribute an accumulated drift D over n + nodes by applying D^(k/n) at node k. M^0=I, M^1=M, (M^0.5)^2=M; valid while the + rotation is < 180deg (principal branch). Tested in test_pose.py.""" + w, V = np.linalg.eig(M) + return (V @ np.diag(w ** f) @ np.linalg.inv(V)).real + + def loop_closure_error(transforms): """Compose a closed loop of similarities and measure deviation from identity. diff --git a/pose/find_loop.py b/pose/find_loop.py new file mode 100644 index 0000000..ab732cf --- /dev/null +++ b/pose/find_loop.py @@ -0,0 +1,70 @@ +"""Search RealEstate10K poses for a LOOPING clip -- camera wanders away from its +start then returns near it (with a similar viewing direction, so the first and last +frames overlap). That overlap is the loop-closure constraint we need to test drift +correction. Pose-only: no video downloads until a candidate is chosen. + +A good loop: large max distance from the start (real motion) but small end-to-start +distance (returns), and small first<->last rotation (same view => the closing pair +(f_0, f_n) is a valid stereo pair the engine can reconstruct). + + nix develop -c python3 pose/find_loop.py [split] [limit] [--live N] +""" +import sys +import numpy as np + +import re10k_control as rc +import re10k_fetch as rf + + +def score(fr): + C = np.array([f["c2w"][:3, 3] for f in fr]) + d = np.linalg.norm(C - C[0], axis=1) + maxd = d.max() + if maxd < 1e-2: + return None + rotend = rc.rot_deg(rc.rel_pose(fr[0]["c2w"], fr[-1]["c2w"])) + return {"n": len(fr), "maxd": float(maxd), "endd": float(d[-1]), + "return_ratio": float(d[-1] / maxd), "rotend": rotend, + "loopiness": float(maxd / (d[-1] + 0.05 * maxd))} + + +def search(split="test", limit=None, min_frames=80, + max_return_ratio=0.25, max_rotend=45.0): + rows = [] + clips = rc.list_clips(split) + if limit: + clips = clips[:limit] + for cp in clips: + try: + url, fr = rc.parse_clip(cp) + except Exception: + continue + if len(fr) < min_frames: + continue + s = score(fr) + if not s or s["return_ratio"] > max_return_ratio or s["rotend"] > max_rotend: + continue + s["clip"], s["id"] = cp, rc.clip_video_id(url) + rows.append(s) + rows.sort(key=lambda r: r["loopiness"], reverse=True) + return rows + + +if __name__ == "__main__": + split = sys.argv[1] if len(sys.argv) > 1 and not sys.argv[1].startswith("-") else "test" + limit = int(sys.argv[2]) if len(sys.argv) > 2 and not sys.argv[2].startswith("-") else None + n_live = int(sys.argv[sys.argv.index("--live") + 1]) if "--live" in sys.argv else 0 + rows = search(split, limit) + print(f"{len(rows)} loop candidates (split={split}, limit={limit})") + print(f" {'loopiness':>9} {'n':>4} {'maxd':>6} {'ret%':>5} {'rotEnd':>6} id") + shown = 0 + for r in rows[:30]: + live = "" + if shown < n_live: + try: + live = " LIVE" if rf.is_live(rc.parse_clip(r["clip"])[0]) else " dead" + except Exception: + live = " ?" + shown += 1 + print(f" {r['loopiness']:>9.1f} {r['n']:>4} {r['maxd']:>6.2f} " + f"{100*r['return_ratio']:>5.0f} {r['rotend']:>6.1f} {r['id']}{live}") diff --git a/pose/loop_closure.py b/pose/loop_closure.py new file mode 100644 index 0000000..961af61 --- /dev/null +++ b/pose/loop_closure.py @@ -0,0 +1,150 @@ +"""Loop-closure drift correction on a LOOPING re10k clip (see find_loop.py). + +accumulate.py chains Sim(3) open-loop and the drift grows to the endpoint (~11% +ATE). When the camera RETURNS near its start, frame 0 and frame n overlap -- a +loop-closure constraint. Here we: + 1. chain open-loop -> per-frame global poses P_k, recovered trajectory, ATE vs GT. + 2. measure the loop -> run the closing pair (f_0, f_n), scale-aligned to run 0, + giving f_n's pose via the loop (P_n_loop), INDEPENDENT of the chain. + 3. the discrepancy D (P_n_loop = D . P_n) IS the accumulated drift (scale/rot/trans). + 4. distribute D over the chain (even Sim(3) relaxation, D^(k/n)) and show the + trajectory ATE and endpoint error collapse. + +Closes the drift story accumulate.py left open; exercises align's chaining on a +real loop. Poses are 4x4 similarity matrices [[sR,t],[0,1]] (compose = matmul). + + FS_DEVICE=cpu nix develop -c python3 pose/loop_closure.py CLIP FRAME_DIR \\ + --stride 20 --count 13 --cache DIR +""" +import os +import argparse +import numpy as np + +import re10k_control as rc +import re10k_fetch as rf +import re10k_experiment as ex +import pnp +import align + +THR = 0.05 +simmat = align.sim_matrix # Sim(3) 4x4 +matfrac = align.sim_frac_power # M^f for even loop-closure relaxation + + +def fit_sim_M(X, Y): + scene = align._rms(Y - Y.mean(0)) + s, R, t, inl = align.fit_similarity_ransac(X, Y, thresh=0.02 * scene, iters=300) + return simmat(s, R, t), inl + + +def decompose(M): + A = M[:3, :3] + s = float(np.linalg.det(A) ** (1.0 / 3.0)) + R = A / s + ang = float(np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1)))) + return s, ang, M[:3, 3] + + +def load(dump): + a = np.fromfile(dump, np.float32).reshape(2, 512, 512, 23) + pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)] + op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)] + return pts, op + + +def ate(C, gt): + sA, RA, tA = align.fit_similarity(C, gt) + Cal = align.apply_sim((sA, RA, tA), C) + return align._rms(Cal - gt), np.linalg.norm(Cal - gt, axis=1) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("clip"); ap.add_argument("frame_dir") + ap.add_argument("--start", type=int, default=0) + ap.add_argument("--stride", type=int, default=20) + ap.add_argument("--count", type=int, default=13) + ap.add_argument("--cache", default=os.environ.get("SCRATCH", "/tmp")) + a = ap.parse_args() + + url, frames = rc.parse_clip(a.clip) + idx = [a.start + k * a.stride for k in range(a.count + 1)] + os.makedirs(a.cache, exist_ok=True) + fp = lambda i: os.path.join(a.frame_dir, f"f{i:04d}.png") + if any(not os.path.exists(fp(i)) for i in idx): + rf.fetch(a.clip, a.frame_dir, indices=idx) + print(f"=== loop closure: {os.path.basename(a.clip)} frames {idx[0]}..{idx[-1]} " + f"stride {a.stride} device={ex.DEVICE} ===\n{url}") + + # --- run / load chain pairs --- + runs = [] + for k in range(a.count): + dump = os.path.join(a.cache, f"pair_{idx[k]}_{idx[k+1]}.f32") + if not os.path.exists(dump): + ex.run_engine(fp(idx[k]), fp(idx[k + 1]), dump) + pts, op = load(dump) + c2w = pnp.estimate_poses(pts, op, normalize=False)["cam2world"] + runs.append({"pts": pts, "op": op, "c2w": c2w}) + + # --- open-loop chain: T[k] maps run k -> global --- + T = [np.eye(4)]; chain_diag = [] + for k in range(1, a.count): + XA = runs[k]["pts"][0].reshape(-1, 3); oA = runs[k]["op"][0].reshape(-1) + XB = runs[k-1]["pts"][1].reshape(-1, 3); oB = runs[k-1]["op"][1].reshape(-1) + m = (oA > THR) & (oB > THR) + S, inlk = fit_sim_M(XA[m], XB[m]); T.append(T[k-1] @ S) + sk, _, _ = decompose(S) + chain_diag.append((sk, 100 * inlk.mean(), 100 * m.mean())) + P = [T[k] @ runs[k]["c2w"][0] for k in range(a.count)] + P.append(T[a.count-1] @ runs[a.count-1]["c2w"][1]) # f_n from last run's view1 + centers = np.array([p[:3, 3] for p in P]) + gt = np.array([frames[i]["c2w"][:3, 3] for i in idx]) + ext = align._rms(gt - gt.mean(0)) + + # --- loop measurement: closing pair (f_0, f_n) --- + cdump = os.path.join(a.cache, f"close_{idx[0]}_{idx[-1]}.f32") + if not os.path.exists(cdump): + ex.run_engine(fp(idx[0]), fp(idx[-1]), cdump) + cpts, cop = load(cdump) + cc2w = pnp.estimate_poses(cpts, cop, normalize=False)["cam2world"] + # scale/pose-align the closing run's f_0 (view0) to run 0's f_0 (view0, == global) + XA = cpts[0].reshape(-1, 3); oA = cop[0].reshape(-1) + XB = runs[0]["pts"][0].reshape(-1, 3); oB = runs[0]["op"][0].reshape(-1) + m = (oA > THR) & (oB > THR) + close_valid = 100 * m.mean() + G, inl = fit_sim_M(XA[m], XB[m]) + P_n_loop = G @ cc2w[1] # f_n via the loop, in global + + # --- drift D (P_n_loop = D . P_n) and even distribution --- + D = P_n_loop @ np.linalg.inv(P[-1]) + s, ang, t = decompose(D) + n = a.count + centers_c = np.array([(matfrac(D, k / n) @ np.append(centers[k], 1.0))[:3] + for k in range(len(centers))]) + + # --- report --- + ate0, perr0 = ate(centers, gt) + ate1, perr1 = ate(centers_c, gt) + # diagnostics: is the open-loop chain a clean drift, and does it even return? + sA, RA, tA = align.fit_similarity(centers, gt) + pnl_al = align.apply_sim((sA, RA, tA), P_n_loop[:3, 3]) + print("\nchain per-link scale/inlier%/valid%: " + + " ".join(f"{c[0]:.2f}/{c[1]:.0f}/{c[2]:.0f}" for c in chain_diag)) + print(f"return-to-start dist: recovered {np.linalg.norm(centers[-1]-centers[0]):.3f} " + f"GT {np.linalg.norm(gt[-1]-gt[0]):.3f} (GT extent {ext:.3f})") + print(f"loop measurement P_n_loop endpoint err vs GT: " + f"{100*np.linalg.norm(pnl_al-gt[-1])/ext:.0f}% of extent") + print(f"\nclosing pair (f_{idx[0]}, f_{idx[-1]}): valid%={close_valid:.1f} " + f"inliers={100*inl.mean():.0f}%") + print(f"LOOP-CLOSURE ERROR (open-loop drift at the loop point):") + print(f" scale {s:.3f} ({100*abs(np.log(s)):.1f}% log) rotation {ang:.1f} deg " + f"translation {np.linalg.norm(t):.3f} ({100*np.linalg.norm(t)/ext:.0f}% of extent)") + print(f"\ntrajectory ATE vs GT (after global Sim(3) align):") + print(f" open-loop : {100*ate0/ext:5.1f}% of extent endpoint {100*perr0[-1]/ext:.0f}%") + print(f" closed : {100*ate1/ext:5.1f}% of extent endpoint {100*perr1[-1]/ext:.0f}%") + print(f" per-frame % (open ): " + " ".join(f"{100*e/ext:.0f}" for e in perr0)) + print(f" per-frame % (closed): " + " ".join(f"{100*e/ext:.0f}" for e in perr1)) + + +if __name__ == "__main__": + main() diff --git a/pose/test_pose.py b/pose/test_pose.py index aec4af7..3ad0c80 100644 --- a/pose/test_pose.py +++ b/pose/test_pose.py @@ -115,6 +115,29 @@ def test_outlier_robustness(): f"{(~inl[out_idx]).mean()*100:.0f}% of outliers rejected") +def test_loop_correction(): + """Sim(3) even-distribution loop closure recovers a loop from uniform drift.""" + print("test_loop_correction") + R = rand_rotation(RNG) + M = align.sim_matrix(1.07, R, np.array([0.3, -0.2, 0.1])) + check("M^0 == I", np.allclose(align.sim_frac_power(M, 0.0), np.eye(4), atol=1e-9)) + check("M^1 == M", np.allclose(align.sim_frac_power(M, 1.0), M, atol=1e-9)) + check("(M^.5)^2 == M", + np.allclose(align.sim_frac_power(M, .5) @ align.sim_frac_power(M, .5), M, atol=1e-9)) + # distributing a known accumulated drift D^(k/n) cancels it to ~0 + n = 12 + D = align.sim_matrix(1.15, R, np.array([0.4, -0.2, 0.1])) + clean = np.array([[np.cos(t), np.sin(t), 0.1 * t] + for t in np.linspace(0, 2 * np.pi, n + 1)]) + drift = np.array([(align.sim_frac_power(np.linalg.inv(D), k / n) @ + np.append(clean[k], 1))[:3] for k in range(n + 1)]) + corr = np.array([(align.sim_frac_power(D, k / n) @ + np.append(drift[k], 1))[:3] for k in range(n + 1)]) + check("drift present before", align._rms(drift - clean) > 0.1) + check("recovers clean loop", align._rms(corr - clean) < 1e-9, + f"ATE {align._rms(corr - clean):.1e}") + + def test_loop_closure(): """A closed loop of consistent links ~= identity; injected scale drift shows.""" print("test_loop_closure") @@ -219,7 +242,7 @@ def test_pnp_outliers(): def main(): tests = [ test_similarity_roundtrip, test_scale_detection, test_nonlinear_detection, - test_outlier_robustness, test_loop_closure, + test_outlier_robustness, test_loop_correction, test_loop_closure, test_focal_recovery, test_pnp_recovery, test_pnp_outliers, ] for t in tests: From 1d31eef3863d0c48b8fff34d8326adea1714a918 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 09:05:43 +0100 Subject: [PATCH 07/33] pose/: consensus fusion -- removes the edge-noise floaters from the accumulated cloud fuse.py answers the question that started this thread ("around the edges there's a lot of noise -- does accumulation remove it?"). Each per-pixel gaussian is partner-view-dependent, so occlusion-edge / depth-ambiguous points are floaters; a real surface point is reconstructed by several overlapping frames and they agree in the global frame. So: voxelize the accumulated cloud at the consistency scale (~2% of extent) and keep only voxels corroborated by >= K distinct frames, averaging the agreeing predictions (which also denoises the surface). Reuses cached engine dumps (no new inference). Result (forward clip, 14 frames, K>=2): 46% of voxels are single-frame, holding 14% of points -- these render as INCOHERENT EDGE-HAZE (floaters + swept-volume periphery). The >=2-frame consensus (86% of points) renders as a CLEAN, CRISP room with the haze gone. Definitive yes: accumulation + consensus fusion removes the edge noise. Honest tradeoff: dropping single-frame points also trims the single-view periphery (coverage vs cleanliness). Co-Authored-By: Claude Opus 4.8 (1M context) --- pose/README.md | 21 ++++++-- pose/fuse.py | 134 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+), 5 deletions(-) create mode 100644 pose/fuse.py diff --git a/pose/README.md b/pose/README.md index ded0297..f2bee03 100644 --- a/pose/README.md +++ b/pose/README.md @@ -107,6 +107,12 @@ anywhere numpy is. on fast legs) + the focal-bias warp, *self-consistent but distorted vs GT*, which loop closure can't fix. The correction machinery itself is verified to recover synthetic accumulated drift to ~0 (`test_pose.py::test_loop_correction`). +- `fuse.py` — **consensus fusion** (the edge-noise answer): voxelize the accumulated + cloud at the consistency scale and keep only voxels corroborated by ≥K distinct + frames, averaging the agreeing predictions. On the forward clip: **14% of points + are single-frame floaters** that render as incoherent edge-haze, while the ≥2-frame + consensus renders as a **clean, crisp room** with the haze gone. Tradeoff: dropping + single-frame points also trims the single-view periphery (coverage vs cleanliness). ## Run the tests @@ -142,13 +148,18 @@ FS_DEVICE=cpu nix develop -c python3 \ is odometry noise + focal warp, not accumulated drift. Lesson: better odometry (smaller baselines / fusion / focal) is the lever for short loops; loop closure pays off on long trajectories with consistent accumulated drift. +- ✅ **Consensus fusion** — gating the accumulated cloud on ≥2-frame agreement removes + the 14% single-frame floaters (incoherent edge-haze) and yields a clean surface — + the definitive "yes, accumulation removes the edge noise." Trades single-view + periphery for a cleaner core. ## Not done yet (honest) -- **Better odometry** is the bigger lever than loop closure for short clips: - smaller per-pair baselines, **consensus fusion** (remove the ~2% floaters / - lift the low-inlier links), and the **focal bias** (recovered ~274 vs GT ~440+). -- A **higher-motion clip** wide-baseline sweep, and a **long** trajectory where - loop closure demonstrably pays off. +- The **focal bias** (recovered ~274 vs GT ~440+) — the remaining geometry-distortion + source; worth understanding whether it's fixable. +- A **higher-motion clip** wide-baseline sweep, and a **long** trajectory where loop + closure demonstrably pays off. +- **Fuse-then-align**: feed the consensus surface back into the odometry (lift the + low-inlier links the loop-closure diagnosis flagged), not just the final cloud. - The **C++ port** (CLI + C API, no Python) once the design is locked — then this prototype is deleted. diff --git a/pose/fuse.py b/pose/fuse.py new file mode 100644 index 0000000..df7b1f7 --- /dev/null +++ b/pose/fuse.py @@ -0,0 +1,134 @@ +"""Consensus fusion -- remove the partner-dependent floaters (the edge noise). + +Each per-pixel gaussian is conditioned on its partner view, so occlusion-edge / +depth-ambiguous points land differently per run = floaters. A REAL surface point, +though, is reconstructed by several overlapping frames and they agree in the global +frame. So: voxelize the accumulated cloud at the consistency scale and keep only +voxels corroborated by >= K DISTINCT frames, averaging the agreeing predictions +(which also denoises the surface). Singletons / scattered floaters are dropped. + +This is the direct answer to "does accumulation remove the edge noise?" -- yes, once +you gate on cross-frame consensus. Reuses cached engine dumps from accumulate.py +(no new inference). + + nix develop -c python3 pose/fuse.py CLIP FRAME_DIR --cache DIR \\ + --stride 20 --count 13 --voxel 0.02 --k 2 --ply-raw raw.ply --ply-fused fused.ply +""" +import os +import argparse +import numpy as np + +import align + +THR = 0.05 +C0 = 0.28209479177387814 + + +def load(dump): + a = np.fromfile(dump, np.float32).reshape(2, 512, 512, 23) + pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)] + op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)] + rgb = [np.clip(a[v, ..., 3:6] * C0 + 0.5, 0, 1) for v in (0, 1)] + return pts, op, rgb + + +def fit_sim(X, Y): + scene = align._rms(Y - Y.mean(0)) + s, R, t, _ = align.fit_similarity_ransac(X, Y, thresh=0.02 * scene, iters=300) + return (s, R, t) + + +def write_ply(path, xyz, rgb): + n = len(xyz) + hdr = ("ply\nformat binary_little_endian 1.0\n" + f"element vertex {n}\nproperty float x\nproperty float y\nproperty float z\n" + "property uchar red\nproperty uchar green\nproperty uchar blue\nend_header\n") + rec = np.zeros(n, dtype=[("x", " global (same as accumulate.py) + T = [align.identity()] + for k in range(1, a.count): + XA = runs[k]["pts"][0].reshape(-1, 3); oA = runs[k]["op"][0].reshape(-1) + XB = runs[k-1]["pts"][1].reshape(-1, 3); oB = runs[k-1]["op"][1].reshape(-1) + m = (oA > THR) & (oB > THR) + T.append(align.compose(T[k-1], fit_sim(XA[m], XB[m]))) + + # accumulate every frame's gaussians, tagged with a source-frame id + XYZ, RGB, FR = [], [], [] + def add(run, view, Tk, fid): + op = runs[run]["op"][view].reshape(-1) + keep = op > THR + XYZ.append(align.apply_sim(Tk, runs[run]["pts"][view].reshape(-1, 3)[keep])) + RGB.append(runs[run]["rgb"][view].reshape(-1, 3)[keep]) + FR.append(np.full(int(keep.sum()), fid, np.int64)) + add(0, 0, T[0], 0) + for k in range(a.count): + add(k, 1, T[k], k + 1) + XYZ = np.concatenate(XYZ); RGB = np.concatenate(RGB); FR = np.concatenate(FR) + + # voxel multi-frame consensus + ext = align._rms(XYZ - XYZ.mean(0)) + v = a.voxel * ext + ijk = np.floor((XYZ - XYZ.min(0)) / v).astype(np.int64) + uniq, inv = np.unique(ijk, axis=0, return_inverse=True) + G = len(uniq) + # distinct source-frames per voxel + pairs = np.unique(np.stack([inv, FR], 1), axis=0) + support = np.bincount(pairs[:, 0], minlength=G) + cnt = np.bincount(inv, minlength=G).astype(float) + mean = np.stack([np.bincount(inv, XYZ[:, c], minlength=G) / cnt for c in range(3)], 1) + mrgb = np.stack([np.bincount(inv, RGB[:, c], minlength=G) / cnt for c in range(3)], 1) + keep = support >= a.k + + print(f"=== consensus fusion voxel={a.voxel:.3f}*extent K>={a.k} ===") + print(f"raw points: {len(XYZ):,} voxels: {G:,}") + hist = np.bincount(support)[:6] + print("voxels by distinct-frame support: " + + " ".join(f"{i}f:{hist[i] if i < len(hist) else 0:,}" for i in range(1, 6)) + + f" 6+f:{(support >= 6).sum():,}") + print(f"FUSED points (>= {a.k} frames): {keep.sum():,} " + f"({100*keep.sum()/G:.0f}% of voxels, {100*(1-keep.sum()/len(XYZ)):.0f}% point reduction)") + pkeep = keep[inv] # per-point consensus mask + print(f"per-point: kept {pkeep.sum():,} floaters dropped {(~pkeep).sum():,} " + f"({100*(~pkeep).mean():.0f}%)") + if a.ply_raw: + write_ply(a.ply_raw, XYZ, RGB); print(f" raw -> {a.ply_raw}") + if a.ply_fused: + write_ply(a.ply_fused, mean[keep], mrgb[keep]); print(f" fused -> {a.ply_fused}") + if a.ply_kept: + write_ply(a.ply_kept, XYZ[pkeep], RGB[pkeep]); print(f" kept -> {a.ply_kept}") + if a.ply_floaters: + write_ply(a.ply_floaters, XYZ[~pkeep], RGB[~pkeep]); print(f" floaters -> {a.ply_floaters}") + + +if __name__ == "__main__": + main() From f7e5db58b954da147a5a79d8a533d885e4f7b0c7 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 09:46:10 +0100 Subject: [PATCH 08/33] pose/: begin the C++ port -- focal + Umeyama align + DLT/RANSAC PnP, golden-tested MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Start porting the proven Python pose prototype (focal.py + align.py + pnp.py) to shipped C++, dependency-free per CLAUDE.md (only a self-contained Jacobi eigensolver -- no Eigen, no OpenCV), wired into the library and the asset-free test tier. src/linalg.h small dense linear algebra: symmetric cyclic-Jacobi eigensolver, 3x3 SVD (via MᵀM), det/inv, 4x4 rigid inverse. Everything the pose math needs reduces to the eigensolver. src/pose.{h,cpp} estimate_focal (Weiszfeld); fit_similarity (Umeyama) + RANSAC + residual-ladder/diagnose; Sim(3) compose/invert/sim_matrix/ loop_closure_error; sim_frac_power (closed-form one-parameter subgroup, no complex eig); solve_pnp_numpy (DLT via the 12x12 AᵀA nullspace + cheirality decode + RANSAC); estimate_poses (scene recipe: view-0 all-pixel focal, per-view opacity-masked PnP, optional baseline rescale). tests/test_pose.cpp the asset-free mirror of pose/test_pose.py -- 9 golden tests (similarity roundtrip, scale/nonlinear detection, RANSAC outliers, loop correction/closure, focal, PnP recovery/outliers). All green under the debug (ASan/UBSan) preset; ctest -LE model. Cross-checked against the Python reference on a real scene dump: focal is bit-exact (596.408591886). PnP is correct on clean data, but on real scenes the DLT solver inherits the textbook planar/mirror degeneracy (3/5 RANSAC seeds match cv2's ~57deg relative rotation, 2/5 flip) -- the same instability that made the prototype use cv2 (SQPNP) for all real-data results. Next: a robust in-house PnP (EPnP/SQPNP + Gauss-Newton refine) for cv2-parity with no OpenCV dependency, then the accumulation/loop-closure/fusion chaining and the CLI / C-API surface. Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitignore | 3 + CMakeLists.txt | 1 + pose/README.md | 25 ++ src/linalg.h | 212 ++++++++++++++++ src/pose.cpp | 565 +++++++++++++++++++++++++++++++++++++++++++ src/pose.h | 104 ++++++++ tests/CMakeLists.txt | 1 + tests/test_pose.cpp | 352 +++++++++++++++++++++++++++ 8 files changed, 1263 insertions(+) create mode 100644 src/linalg.h create mode 100644 src/pose.cpp create mode 100644 src/pose.h create mode 100644 tests/test_pose.cpp diff --git a/.gitignore b/.gitignore index 97a3841..3c76e18 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,9 @@ __pycache__/ # reference code (third-party, fetched on demand — never vendored here). .cache/ +# Throwaway working dir for ad-hoc harnesses / scratch (never committed). +/scratchpad/ + # Model + data artifacts (generated; never committed) *.gguf *.safetensors diff --git a/CMakeLists.txt b/CMakeLists.txt index 83fabbc..ae59312 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -122,6 +122,7 @@ add_library(free_splatter ${_fs_libtype} src/gguf_loader.cpp src/model.cpp src/image.cpp + src/pose.cpp src/free_splatter.cpp ) target_include_directories(free_splatter PUBLIC include) diff --git a/pose/README.md b/pose/README.md index f2bee03..e827981 100644 --- a/pose/README.md +++ b/pose/README.md @@ -12,6 +12,31 @@ self-contained numpy + cv2 consumer of the engine's `[N, H, W, 23]` output, ported to C++ and this Python is removed — it is not a parallel implementation to maintain. +## C++ port status (the ship target — `src/pose.{h,cpp}`) + +The port has begun, dependency-free (only the self-contained `src/linalg.h` +Jacobi eigensolver — no Eigen/OpenCV), wired into the library and the asset-free +test tier (`tests/test_pose.cpp`, `ctest -LE model`): + +- ✅ **`focal.py` → `estimate_focal`** — Weiszfeld; **bit-exact** to the numpy + reference on a real scene dump (596.408591886 both). +- ✅ **`align.py` → Umeyama / RANSAC / residual-ladder / Sim(3) chaining / + `sim_frac_power`** — `sim_frac_power` is a closed-form Sim(3) one-parameter + subgroup (`A^f=s^f R^f`, translation `(A^f−I)(A−I)⁻¹t`), no complex eig needed. + All 9 golden tests (the mirror of `test_pose.py`) pass under ASan/UBSan. +- ⚠ **`pnp.py` (numpy backend) → `solve_pnp_numpy` + `estimate_poses`** — DLT (via + the 12×12 `AᵀA` nullspace, which sidesteps the numpy reference's full-SVD OOM) + + RANSAC + cheirality decode. **Correct on clean synthetic data** (golden + tests), but on a real scene dump it inherits the **DLT algorithm's planar/mirror + degeneracy**: ~3/5 RANSAC seeds land near the cv2/SQPNP answer (58° vs cv2's + 57° relative rotation), ~2/5 collapse to a ~135–152° flip. This is the *same* + instability the numpy reference has — which is exactly why the prototype used + **cv2 (SQPNP) for every real-data result**. **Next step:** a robust in-house PnP + (EPnP/SQPNP + Gauss-Newton refine) to reach cv2-parity on real scenes with no + OpenCV dependency. +- ⬜ Not yet ported: accumulation chaining, loop closure, consensus fusion (these + compose the primitives above), and the CLI / C-API surface. + ## Why it's needed Each `free_splatter_run` over a different photo pair produces gaussians in its diff --git a/src/linalg.h b/src/linalg.h new file mode 100644 index 0000000..5705621 --- /dev/null +++ b/src/linalg.h @@ -0,0 +1,212 @@ +// linalg.h — self-contained small dense linear algebra for the pose component. +// +// The downstream pose math (Umeyama similarity, DLT PnP, pose decode) needs only +// a handful of small dense operations. Rather than pull in Eigen/OpenCV — neither +// of which we want as a runtime dependency (see CLAUDE.md: everything ships in +// C++ with no extra runtime deps beyond ggml) — every routine here reduces to one +// primitive: a symmetric cyclic-Jacobi eigensolver. SVD of a 3x3 is built as the +// eigendecomposition of MᵀM; the DLT nullspace is the smallest eigenvector of +// AᵀA. This mirrors the dependency-free numpy reference solver in pose/pnp.py, +// which is itself verified bit-for-bit (~1e-7) against cv2 in check_cv2_parity.py. +// +// Everything is f64, row-major, and header-only. Sizes are tiny (<=12) so the +// O(n^3) Jacobi sweeps are negligible next to the engine forward pass. +#ifndef FREE_SPLATTER_LINALG_H +#define FREE_SPLATTER_LINALG_H + +#include +#include +#include +#include +#include + +namespace fsla { + +using Vec3 = std::array; + +// Row-major fixed matrices. m[r][c] lives at a[r*N + c]. +struct Mat3 { double a[9]; double operator()(int r, int c) const { return a[r * 3 + c]; } + double & operator()(int r, int c) { return a[r * 3 + c]; } }; +struct Mat4 { double a[16]; double operator()(int r, int c) const { return a[r * 4 + c]; } + double & operator()(int r, int c) { return a[r * 4 + c]; } }; + +inline Mat3 mat3_identity() { return Mat3{ {1,0,0, 0,1,0, 0,0,1} }; } +inline Mat4 mat4_identity() { return Mat4{ {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1} }; } + +inline Mat3 mat3_mul(const Mat3 & A, const Mat3 & B) { + Mat3 C{}; + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) { + double s = 0; + for (int k = 0; k < 3; k++) s += A(i, k) * B(k, j); + C(i, j) = s; + } + return C; +} + +inline Mat3 mat3_transpose(const Mat3 & A) { + Mat3 T{}; + for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) T(i, j) = A(j, i); + return T; +} + +inline Vec3 mat3_apply(const Mat3 & A, const Vec3 & x) { + return { A(0,0)*x[0] + A(0,1)*x[1] + A(0,2)*x[2], + A(1,0)*x[0] + A(1,1)*x[1] + A(1,2)*x[2], + A(2,0)*x[0] + A(2,1)*x[1] + A(2,2)*x[2] }; +} + +inline double det3(const Mat3 & M) { + return M(0,0) * (M(1,1)*M(2,2) - M(1,2)*M(2,1)) + - M(0,1) * (M(1,0)*M(2,2) - M(1,2)*M(2,0)) + + M(0,2) * (M(1,0)*M(2,1) - M(1,1)*M(2,0)); +} + +inline Mat3 inv3(const Mat3 & M) { + const double d = det3(M); + const double id = (std::fabs(d) > 0) ? 1.0 / d : 0.0; + Mat3 R{}; + R(0,0) = (M(1,1)*M(2,2) - M(1,2)*M(2,1)) * id; + R(0,1) = -(M(0,1)*M(2,2) - M(0,2)*M(2,1)) * id; + R(0,2) = (M(0,1)*M(1,2) - M(0,2)*M(1,1)) * id; + R(1,0) = -(M(1,0)*M(2,2) - M(1,2)*M(2,0)) * id; + R(1,1) = (M(0,0)*M(2,2) - M(0,2)*M(2,0)) * id; + R(1,2) = -(M(0,0)*M(1,2) - M(0,2)*M(1,0)) * id; + R(2,0) = (M(1,0)*M(2,1) - M(1,1)*M(2,0)) * id; + R(2,1) = -(M(0,0)*M(2,1) - M(0,1)*M(2,0)) * id; + R(2,2) = (M(0,0)*M(1,1) - M(0,1)*M(1,0)) * id; + return R; +} + +// Inverse of a rigid 4x4 [[R,t],[0,1]] (R orthonormal): [[Rᵀ, -Rᵀt],[0,1]]. Exact +// and cheap; used for cam2world = inv(world2cam) where world2cam is rigid. +inline Mat4 inv_rigid4(const Mat4 & M) { + Mat4 R = mat4_identity(); + for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) R(i, j) = M(j, i); // Rᵀ + for (int i = 0; i < 3; i++) + R(i, 3) = -(R(i,0)*M(0,3) + R(i,1)*M(1,3) + R(i,2)*M(2,3)); + return R; +} + +inline Mat4 mat4_mul(const Mat4 & A, const Mat4 & B) { + Mat4 C{}; + for (int i = 0; i < 4; i++) + for (int j = 0; j < 4; j++) { + double s = 0; + for (int k = 0; k < 4; k++) s += A(i, k) * B(k, j); + C(i, j) = s; + } + return C; +} + +// --- symmetric eigensolver ------------------------------------------------- +// +// Cyclic Jacobi for a symmetric n x n matrix (Golub & Van Loan, Alg. 8.4.x). +// Ain is row-major and assumed symmetric. Writes n eigenvalues to `eval` and the +// corresponding eigenvectors as the COLUMNS of `evec` (evec[i*n+j] = component i +// of eigenvector j). Output is NOT sorted. Converges in a handful of sweeps for +// n <= 12; we cap at 100 as a safety net. +inline void jacobi_eigh(const double * Ain, int n, double * eval, double * evec) { + std::vector A(Ain, Ain + (size_t) n * n); + for (int i = 0; i < n * n; i++) evec[i] = 0.0; + for (int i = 0; i < n; i++) evec[i * n + i] = 1.0; + + for (int sweep = 0; sweep < 100; sweep++) { + double off = 0.0; + for (int p = 0; p < n; p++) + for (int q = p + 1; q < n; q++) off += A[p*n+q] * A[p*n+q]; + if (off <= 1e-300) break; + + for (int p = 0; p < n; p++) { + for (int q = p + 1; q < n; q++) { + const double apq = A[p*n+q]; + if (std::fabs(apq) <= 1e-300) continue; + const double phi = 0.5 * (A[q*n+q] - A[p*n+p]) / apq; // = tau + const double t = (phi >= 0 ? 1.0 : -1.0) / + (std::fabs(phi) + std::sqrt(phi*phi + 1.0)); + const double c = 1.0 / std::sqrt(t*t + 1.0); + const double s = t * c; + // A <- Jᵀ A J : rotate columns p,q then rows p,q (keeps symmetry). + for (int i = 0; i < n; i++) { + const double aip = A[i*n+p], aiq = A[i*n+q]; + A[i*n+p] = c*aip - s*aiq; + A[i*n+q] = s*aip + c*aiq; + } + for (int i = 0; i < n; i++) { + const double api = A[p*n+i], aqi = A[q*n+i]; + A[p*n+i] = c*api - s*aqi; + A[q*n+i] = s*api + c*aqi; + } + for (int i = 0; i < n; i++) { // accumulate eigenvectors V <- V J + const double vip = evec[i*n+p], viq = evec[i*n+q]; + evec[i*n+p] = c*vip - s*viq; + evec[i*n+q] = s*vip + c*viq; + } + } + } + } + for (int i = 0; i < n; i++) eval[i] = A[i*n+i]; +} + +// Smallest-eigenvalue eigenvector of a symmetric n x n matrix (the homogeneous +// least-squares / DLT nullspace). Returns the n-vector. +inline std::vector smallest_eigenvector(const double * A, int n) { + std::vector eval(n), evec((size_t) n * n); + jacobi_eigh(A, n, eval.data(), evec.data()); + int k = 0; + for (int i = 1; i < n; i++) if (eval[i] < eval[k]) k = i; + std::vector v(n); + for (int i = 0; i < n; i++) v[i] = evec[i*n + k]; + return v; +} + +// --- 3x3 SVD --------------------------------------------------------------- +// +// M = U diag(s) Vᵀ with s sorted DESCENDING (numpy/LAPACK convention). Built from +// the eigendecomposition of the symmetric S = MᵀM: its eigenvectors are V, the +// singular values are sqrt(eigenvalues), and U_i = M v_i / s_i. A (near-)zero +// singular direction is completed by Gram-Schmidt so U stays orthonormal. This is +// all the SVD the similarity fit and the PnP decode need. +inline void svd3(const Mat3 & M, Mat3 & U, Vec3 & s, Mat3 & V) { + Mat3 S = mat3_mul(mat3_transpose(M), M); // 3x3 symmetric PSD + double eval[3], evec[9]; + jacobi_eigh(S.a, 3, eval, evec); + + int ord[3] = { 0, 1, 2 }; // sort eigenpairs descending + std::sort(ord, ord + 3, [&](int x, int y) { return eval[x] > eval[y]; }); + + for (int j = 0; j < 3; j++) { + const int e = ord[j]; + s[j] = std::sqrt(std::max(eval[e], 0.0)); + for (int i = 0; i < 3; i++) V(i, j) = evec[i*3 + e]; + } + // Force a right-handed V (proper basis); flip the last column if reflected. + if (det3(V) < 0) for (int i = 0; i < 3; i++) V(i, 2) = -V(i, 2); + + for (int j = 0; j < 3; j++) { + Vec3 vj = { V(0,j), V(1,j), V(2,j) }; + Vec3 uj = mat3_apply(M, vj); + const double nrm = std::sqrt(uj[0]*uj[0] + uj[1]*uj[1] + uj[2]*uj[2]); + if (nrm > 1e-12) for (int i = 0; i < 3; i++) U(i, j) = uj[i] / nrm; + else for (int i = 0; i < 3; i++) U(i, j) = 0.0; // fixed up below + } + // Complete any degenerate U columns via Gram-Schmidt against the filled ones. + for (int j = 0; j < 3; j++) { + double nrm = std::sqrt(U(0,j)*U(0,j) + U(1,j)*U(1,j) + U(2,j)*U(2,j)); + if (nrm > 1e-9) continue; + Vec3 cand[3] = { {1,0,0}, {0,1,0}, {0,0,1} }; + for (auto & e : cand) { + for (int k = 0; k < 3; k++) { + if (k == j) continue; + const double d = e[0]*U(0,k) + e[1]*U(1,k) + e[2]*U(2,k); + e[0] -= d*U(0,k); e[1] -= d*U(1,k); e[2] -= d*U(2,k); + } + const double en = std::sqrt(e[0]*e[0] + e[1]*e[1] + e[2]*e[2]); + if (en > 1e-6) { U(0,j)=e[0]/en; U(1,j)=e[1]/en; U(2,j)=e[2]/en; break; } + } + } +} + +} // namespace fsla + +#endif // FREE_SPLATTER_LINALG_H diff --git a/src/pose.cpp b/src/pose.cpp new file mode 100644 index 0000000..4bcafa8 --- /dev/null +++ b/src/pose.cpp @@ -0,0 +1,565 @@ +// pose.cpp — C++ port of the pose/ prototype (focal.py + align.py + pnp.py). +// +// Faithful to FreeSplatter's scene recipe; the dependency-free numpy reference +// solver is what ships here (verified ~1e-7 vs cv2, bit-exact vs upstream +// estimate_poses on real engine output). All linear algebra goes through the +// self-contained Jacobi eigensolver in linalg.h — no Eigen, no OpenCV. +#include "pose.h" + +#include +#include + +namespace free_splatter { +namespace pose { + +using fsla::smallest_eigenvector; +using fsla::svd3; +using fsla::det3; +using fsla::inv3; +using fsla::mat3_mul; +using fsla::mat3_transpose; +using fsla::mat3_apply; +using fsla::mat3_identity; +using fsla::mat4_identity; +using fsla::inv_rigid4; + +namespace { + +// Deterministic SplitMix64 — so RANSAC is reproducible across platforms (we don't +// need numpy's PCG64 bitstream, only a fixed inlier-converging sampler). +struct Rng { + uint64_t s; + explicit Rng(uint64_t seed) : s(seed) {} + uint64_t next() { + uint64_t z = (s += 0x9E3779B97F4A7C15ULL); + z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL; + z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL; + return z ^ (z >> 31); + } + int bounded(int n) { return (int) (next() % (uint64_t) n); } +}; + +void sample_distinct(Rng & rng, int N, int k, int * out) { + for (int i = 0; i < k; i++) { + int v; + bool dup; + do { + v = rng.bounded(N); + dup = false; + for (int j = 0; j < i; j++) if (out[j] == v) { dup = true; break; } + } while (dup); + out[i] = v; + } +} + +double rms3(const double * r, int N) { // sqrt(mean over rows of ||row||^2) + double s = 0; + for (int i = 0; i < N; i++) s += r[3*i]*r[3*i] + r[3*i+1]*r[3*i+1] + r[3*i+2]*r[3*i+2]; + return std::sqrt(s / (double) N); +} + +void mean3(const double * X, int N, double * m) { + m[0] = m[1] = m[2] = 0; + for (int i = 0; i < N; i++) { m[0] += X[3*i]; m[1] += X[3*i+1]; m[2] += X[3*i+2]; } + m[0] /= N; m[1] /= N; m[2] /= N; +} + +// Solve A X = B in place (A n x n row-major, B n x m row-major) by Gaussian +// elimination with partial pivoting. Result overwrites B. Used for the affine +// normal equations in the residual ladder. +void solve_lin(double * A, int n, double * B, int m) { + for (int col = 0; col < n; col++) { + int piv = col; + for (int r = col + 1; r < n; r++) + if (std::fabs(A[r*n+col]) > std::fabs(A[piv*n+col])) piv = r; + if (piv != col) { + for (int c = 0; c < n; c++) std::swap(A[col*n+c], A[piv*n+c]); + for (int c = 0; c < m; c++) std::swap(B[col*m+c], B[piv*m+c]); + } + const double d = A[col*n+col]; + if (std::fabs(d) < 1e-300) continue; + for (int r = 0; r < n; r++) { + if (r == col) continue; + const double f = A[r*n+col] / d; + if (f == 0) continue; + for (int c = col; c < n; c++) A[r*n+c] -= f * A[col*n+c]; + for (int c = 0; c < m; c++) B[r*m+c] -= f * B[col*m+c]; + } + } + for (int col = 0; col < n; col++) { + const double d = A[col*n+col]; + if (std::fabs(d) > 1e-300) for (int c = 0; c < m; c++) B[col*m+c] /= d; + } +} + +Mat3 rodrigues(const Vec3 & axis, double phi) { // rotation about unit axis by phi + const double cphi = std::cos(phi), sphi = std::sin(phi), v = 1.0 - cphi; + const double x = axis[0], y = axis[1], z = axis[2]; + Mat3 R{}; + R(0,0) = cphi + x*x*v; R(0,1) = x*y*v - z*sphi; R(0,2) = x*z*v + y*sphi; + R(1,0) = y*x*v + z*sphi; R(1,1) = cphi + y*y*v; R(1,2) = y*z*v - x*sphi; + R(2,0) = z*x*v - y*sphi; R(2,1) = z*y*v + x*sphi; R(2,2) = cphi + z*z*v; + return R; +} + +} // namespace + +// ---- focal ---------------------------------------------------------------- + +double estimate_focal(const double * pts3d, const double * pixels, int N, + double ppx, double ppy, int weiszfeld_iters) { + std::vector P(2 * N), U(2 * N), pu(N), uu(N); + for (int i = 0; i < N; i++) { + const double X = pts3d[3*i], Y = pts3d[3*i+1], Z = pts3d[3*i+2]; + double ux = (Z != 0.0) ? X / Z : 0.0; + double uy = (Z != 0.0) ? Y / Z : 0.0; + if (!std::isfinite(ux)) ux = 0.0; // mirror np.nan_to_num(posinf/neginf->0) + if (!std::isfinite(uy)) uy = 0.0; + const double Px = pixels[2*i] - ppx; + const double Py = pixels[2*i+1] - ppy; + P[2*i] = Px; P[2*i+1] = Py; + U[2*i] = ux; U[2*i+1] = uy; + pu[i] = Px*ux + Py*uy; + uu[i] = ux*ux + uy*uy; + } + double spu = 0, suu = 0; + for (int i = 0; i < N; i++) { spu += pu[i]; suu += uu[i]; } + double f = (suu != 0.0) ? spu / suu : 0.0; + for (int it = 0; it < weiszfeld_iters; it++) { + double num = 0, den = 0; + for (int i = 0; i < N; i++) { + const double dx = P[2*i] - f*U[2*i], dy = P[2*i+1] - f*U[2*i+1]; + const double r = std::sqrt(dx*dx + dy*dy); + const double w = 1.0 / std::max(r, 1e-8); + num += w * pu[i]; + den += w * uu[i]; + } + f = (den != 0.0) ? num / den : f; + } + return f; +} + +// ---- similarity ----------------------------------------------------------- + +Sim3 sim_identity() { return { 1.0, mat3_identity(), {0,0,0} }; } + +Vec3 sim_apply(const Sim3 & T, const Vec3 & x) { + Vec3 Rx = mat3_apply(T.R, x); + return { T.s*Rx[0] + T.t[0], T.s*Rx[1] + T.t[1], T.s*Rx[2] + T.t[2] }; +} + +Sim3 sim_compose(const Sim3 & T2, const Sim3 & T1) { // apply T1 then T2 + Sim3 r; + r.s = T2.s * T1.s; + r.R = mat3_mul(T2.R, T1.R); + Vec3 R2t1 = mat3_apply(T2.R, T1.t); + r.t = { T2.s*R2t1[0] + T2.t[0], T2.s*R2t1[1] + T2.t[1], T2.s*R2t1[2] + T2.t[2] }; + return r; +} + +Sim3 sim_invert(const Sim3 & T) { + Sim3 r; + r.s = 1.0 / T.s; + r.R = mat3_transpose(T.R); + Vec3 Rit = mat3_apply(r.R, T.t); + r.t = { -r.s*Rit[0], -r.s*Rit[1], -r.s*Rit[2] }; + return r; +} + +Mat4 sim_matrix(const Sim3 & T) { + Mat4 M = mat4_identity(); + for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) M(i, j) = T.s * T.R(i, j); + for (int i = 0; i < 3; i++) M(i, 3) = T.t[i]; + return M; +} + +Sim3 fit_similarity(const double * X, const double * Y, int N, bool with_scale) { + double mx[3], my[3]; + mean3(X, N, mx); + mean3(Y, N, my); + // Sigma = (Ycᵀ Xc)/n, cross-covariance mapping X -> Y; var_x = mean ||Xc||^2. + Mat3 Sigma{}; + double var_x = 0; + for (int i = 0; i < N; i++) { + const double xc[3] = { X[3*i]-mx[0], X[3*i+1]-mx[1], X[3*i+2]-mx[2] }; + const double yc[3] = { Y[3*i]-my[0], Y[3*i+1]-my[1], Y[3*i+2]-my[2] }; + for (int r = 0; r < 3; r++) for (int c = 0; c < 3; c++) Sigma(r, c) += yc[r]*xc[c]; + var_x += xc[0]*xc[0] + xc[1]*xc[1] + xc[2]*xc[2]; + } + for (int i = 0; i < 9; i++) Sigma.a[i] /= N; + var_x /= N; + + Mat3 U, V; Vec3 D; + svd3(Sigma, U, D, V); // Sigma = U diag(D) Vᵀ + double sgn = (det3(U) * det3(V) < 0) ? -1.0 : 1.0; + // R = U diag(1,1,sgn) Vᵀ + Mat3 US = U; + for (int r = 0; r < 3; r++) US(r, 2) *= sgn; + Mat3 R = mat3_mul(US, mat3_transpose(V)); + + double s = 1.0; + if (with_scale && var_x > 0) + s = (D[0] + D[1] + sgn * D[2]) / var_x; + + Vec3 mxv = { mx[0], mx[1], mx[2] }; + Vec3 Rmx = mat3_apply(R, mxv); + Sim3 T; + T.s = s; T.R = R; + T.t = { my[0] - s*Rmx[0], my[1] - s*Rmx[1], my[2] - s*Rmx[2] }; + return T; +} + +Sim3 fit_similarity_ransac(const double * X, const double * Y, int N, double thresh, + int iters, std::vector & inliers, + bool with_scale, uint64_t seed) { + Rng rng(seed); + std::vector best(N, 0); + int best_cnt = 0; + std::vector xs(9), ys(9); // 3-point minimal samples + for (int it = 0; it < iters; it++) { + int idx[3]; + sample_distinct(rng, N, 3, idx); + for (int j = 0; j < 3; j++) + for (int c = 0; c < 3; c++) { xs[3*j+c] = X[3*idx[j]+c]; ys[3*j+c] = Y[3*idx[j]+c]; } + Sim3 T = fit_similarity(xs.data(), ys.data(), 3, with_scale); + int cnt = 0; + for (int i = 0; i < N; i++) { + Vec3 xi = { X[3*i], X[3*i+1], X[3*i+2] }; + Vec3 p = sim_apply(T, xi); + const double dx = p[0]-Y[3*i], dy = p[1]-Y[3*i+1], dz = p[2]-Y[3*i+2]; + if (std::sqrt(dx*dx + dy*dy + dz*dz) < thresh) cnt++; + } + if (cnt > best_cnt) { best_cnt = cnt; best.assign(N, 0); + for (int i = 0; i < N; i++) { + Vec3 xi = { X[3*i], X[3*i+1], X[3*i+2] }; + Vec3 p = sim_apply(T, xi); + const double dx = p[0]-Y[3*i], dy = p[1]-Y[3*i+1], dz = p[2]-Y[3*i+2]; + best[i] = (std::sqrt(dx*dx + dy*dy + dz*dz) < thresh) ? 1 : 0; + } + } + } + if (best_cnt < 3) std::fill(best.begin(), best.end(), 1); + // refit on the inlier set + std::vector xi, yi; + for (int i = 0; i < N; i++) if (best[i]) { + for (int c = 0; c < 3; c++) { xi.push_back(X[3*i+c]); yi.push_back(Y[3*i+c]); } + } + inliers = best; + return fit_similarity(xi.data(), yi.data(), (int) (xi.size()/3), with_scale); +} + +// affine Y ~= A X + b (12-DoF), returns prediction rms only via caller; here we +// fill `pred` (N*3). Normal equations on Xh=[X,1]. +static double affine_residual(const double * X, const double * Y, int N) { + double G[16] = {0}; // XhᵀXh (4x4) + double B[12] = {0}; // XhᵀY (4x3) + for (int i = 0; i < N; i++) { + const double xh[4] = { X[3*i], X[3*i+1], X[3*i+2], 1.0 }; + for (int r = 0; r < 4; r++) { + for (int c = 0; c < 4; c++) G[r*4+c] += xh[r]*xh[c]; + for (int c = 0; c < 3; c++) B[r*3+c] += xh[r]*Y[3*i+c]; + } + } + solve_lin(G, 4, B, 3); // B now holds M (4x3): rows are coeffs + double s = 0; + for (int i = 0; i < N; i++) { + const double xh[4] = { X[3*i], X[3*i+1], X[3*i+2], 1.0 }; + for (int c = 0; c < 3; c++) { + double p = 0; + for (int r = 0; r < 4; r++) p += xh[r]*B[r*3+c]; + const double d = p - Y[3*i+c]; + s += d*d; + } + } + return std::sqrt(s / (double) N); +} + +Ladder diagnose(const double * X, const double * Y, int N, double tol, double corr_tol) { + Ladder L{}; + double my[3]; mean3(Y, N, my); + std::vector yc(3 * N); + for (int i = 0; i < N; i++) { yc[3*i]=Y[3*i]-my[0]; yc[3*i+1]=Y[3*i+1]-my[1]; yc[3*i+2]=Y[3*i+2]-my[2]; } + L.scene = rms3(yc.data(), N); + + Sim3 Tr = fit_similarity(X, Y, N, /*with_scale=*/false); + Sim3 Ts = fit_similarity(X, Y, N, /*with_scale=*/true); + std::vector res_r(3*N), res_s(3*N); + for (int i = 0; i < N; i++) { + Vec3 xi = { X[3*i], X[3*i+1], X[3*i+2] }; + Vec3 pr = sim_apply(Tr, xi), ps = sim_apply(Ts, xi); + for (int c = 0; c < 3; c++) { res_r[3*i+c] = pr[c]-Y[3*i+c]; res_s[3*i+c] = ps[c]-Y[3*i+c]; } + } + L.rigid = rms3(res_r.data(), N); + L.similarity = rms3(res_s.data(), N); + L.affine = affine_residual(X, Y, N); + L.scale = Ts.s; + + // depth_corr: Pearson(||X-mean||, ||similarity residual||) + double mx[3]; mean3(X, N, mx); + std::vector depth(N), rmag(N); + for (int i = 0; i < N; i++) { + const double dx = X[3*i]-mx[0], dy = X[3*i+1]-mx[1], dz = X[3*i+2]-mx[2]; + depth[i] = std::sqrt(dx*dx + dy*dy + dz*dz); + rmag[i] = std::sqrt(res_s[3*i]*res_s[3*i] + res_s[3*i+1]*res_s[3*i+1] + res_s[3*i+2]*res_s[3*i+2]); + } + double md = 0, mr = 0; + for (int i = 0; i < N; i++) { md += depth[i]; mr += rmag[i]; } + md /= N; mr /= N; + double cov = 0, vd = 0, vr = 0; + for (int i = 0; i < N; i++) { cov += (depth[i]-md)*(rmag[i]-mr); vd += (depth[i]-md)*(depth[i]-md); vr += (rmag[i]-mr)*(rmag[i]-mr); } + L.depth_corr = (vr > 1e-24) ? cov / std::sqrt(vd*vr) : 0.0; + + const double sc = (L.scene > 0) ? L.scene : 1.0; + const double rr = L.rigid/sc, rs = L.similarity/sc, ra = L.affine/sc; + L.aff_gain = (L.similarity > 0) ? (L.similarity - L.affine)/L.similarity : 0.0; + L.structured = std::fabs(L.depth_corr) > corr_tol || L.aff_gain > 0.25; + if (rr < tol) L.verdict = "rigid_ok"; + else if (rs < tol) L.verdict = "needs_scale"; + else if (ra < tol) L.verdict = "needs_affine"; + else if (L.structured) L.verdict = "nonlinear"; + else L.verdict = "similarity_plus_noise"; + return L; +} + +LoopError loop_closure_error(const std::vector & links) { + Sim3 T = sim_identity(); + for (const Sim3 & Ti : links) T = sim_compose(Ti, T); + LoopError e; + e.scale_err = std::fabs(std::log(T.s)); + const double tr = T.R(0,0) + T.R(1,1) + T.R(2,2); + e.rot_deg = std::acos(std::max(-1.0, std::min(1.0, (tr - 1.0)/2.0))) * 180.0 / M_PI; + e.trans = std::sqrt(T.t[0]*T.t[0] + T.t[1]*T.t[1] + T.t[2]*T.t[2]); + return e; +} + +// M^f for a Sim(3) 4x4 [[sR,t],[0,1]], closed form via the group's one-parameter +// subgroup: A^f = s^f R^f (R^f = same axis, f*angle), translation by +// (A^f - I)(A - I)^{-1} t (the analytic continuation of the integer-power +// geometric series). Equals numpy's eig-based principal value while angle < 180. +Mat4 sim_frac_power(const Mat4 & M, double f) { + Mat3 A{}; + Vec3 t{}; + for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) A(i, j) = M(i, j); t[i] = M(i, 3); } + const double s = std::cbrt(det3(A)); + Mat3 R = A; + for (int i = 0; i < 9; i++) R.a[i] /= s; + const double tr = R(0,0) + R(1,1) + R(2,2); + const double ang = std::acos(std::max(-1.0, std::min(1.0, (tr - 1.0)/2.0))); + + Mat3 Rf; + if (ang < 1e-12) { + Rf = mat3_identity(); + } else { + const double sa = std::sin(ang); + Vec3 axis = { (R(2,1)-R(1,2))/(2*sa), (R(0,2)-R(2,0))/(2*sa), (R(1,0)-R(0,1))/(2*sa) }; + Rf = rodrigues(axis, f * ang); + } + const double sf = std::pow(s, f); + Mat3 Af = Rf; + for (int i = 0; i < 9; i++) Af.a[i] *= sf; + + Mat4 out = mat4_identity(); + for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) out(i, j) = Af(i, j); + + // translation: (Af - I)(A - I)^{-1} t, with the A->I limit handled as f*t. + Mat3 AmI = A; + for (int i = 0; i < 3; i++) AmI(i, i) -= 1.0; + if (std::fabs(det3(AmI)) < 1e-12) { + for (int i = 0; i < 3; i++) out(i, 3) = f * t[i]; + } else { + Mat3 AfmI = Af; + for (int i = 0; i < 3; i++) AfmI(i, i) -= 1.0; + Mat3 K = mat3_mul(AfmI, inv3(AmI)); + Vec3 tf = mat3_apply(K, t); + for (int i = 0; i < 3; i++) out(i, 3) = tf[i]; + } + return out; +} + +// ---- PnP ------------------------------------------------------------------ + +namespace { + +// DLT: smallest right singular vector of the 2N-row system, via the smallest +// eigenvector of AᵀA (12x12). idx selects the k points used. Returns the 3x4 +// matrix p (row-major) up to sign/scale. +std::array dlt(const double * Xw, const double * xn, const int * idx, int k) { + double AtA[144] = {0}; + for (int j = 0; j < k; j++) { + const int i = idx[j]; + const double X = Xw[3*i], Y = Xw[3*i+1], Z = Xw[3*i+2]; + const double u = xn[2*i], v = xn[2*i+1]; + const double ra[12] = { X, Y, Z, 1, 0, 0, 0, 0, -u*X, -u*Y, -u*Z, -u }; + const double rb[12] = { 0, 0, 0, 0, X, Y, Z, 1, -v*X, -v*Y, -v*Z, -v }; + for (int r = 0; r < 12; r++) + for (int c = 0; c < 12; c++) AtA[r*12+c] += ra[r]*ra[c] + rb[r]*rb[c]; + } + std::vector v = smallest_eigenvector(AtA, 12); + std::array p; + for (int i = 0; i < 12; i++) p[i] = v[i]; + return p; +} + +struct RT { Mat3 R; Vec3 t; double err; bool ok; }; + +// Resolve the DLT sign and recover a proper (R,t) world2cam, picking the sign with +// lower reprojection error among cheirality-valid candidates (points in front). +RT decode(const std::array & p, const double * Xw, const double * xn, + const int * idx, int k) { + RT best; best.ok = false; best.err = 1e300; + for (double sgn : { 1.0, -1.0 }) { + Mat3 M{}; + Vec3 tau{}; + for (int r = 0; r < 3; r++) { for (int c = 0; c < 3; c++) M(r, c) = sgn*p[r*4+c]; tau[r] = sgn*p[r*4+3]; } + Mat3 U, V; Vec3 Sv; + svd3(M, U, Sv, V); + const double d = (det3(U) * det3(V) < 0) ? -1.0 : 1.0; + Mat3 US = U; + for (int r = 0; r < 3; r++) US(r, 2) *= d; + Mat3 R = mat3_mul(US, mat3_transpose(V)); // nearest proper rotation + const double lam = (Sv[0] + Sv[1] + Sv[2]) / 3.0; + if (lam < 1e-12) continue; + Vec3 t = { tau[0]/lam, tau[1]/lam, tau[2]/lam }; + + double err = 0; bool behind = false; + for (int j = 0; j < k; j++) { + const int i = idx[j]; + Vec3 xi = { Xw[3*i], Xw[3*i+1], Xw[3*i+2] }; + Vec3 Xc = mat3_apply(R, xi); + Xc[0] += t[0]; Xc[1] += t[1]; Xc[2] += t[2]; + if (Xc[2] <= 0) { behind = true; break; } + const double px = Xc[0]/Xc[2], py = Xc[1]/Xc[2]; + const double dx = px - xn[2*i], dy = py - xn[2*i+1]; + err += std::sqrt(dx*dx + dy*dy); + } + if (behind) continue; + err /= k; + if (err < best.err) { best.err = err; best.R = R; best.t = t; best.ok = true; } + } + return best; +} + +} // namespace + +void pixel_grid(int H, int W, std::vector & out) { + out.resize((size_t) 2 * H * W); + for (int r = 0; r < H; r++) + for (int c = 0; c < W; c++) { + const size_t k = (size_t) r * W + c; + out[2*k] = c; out[2*k+1] = r; + } +} + +Mat4 solve_pnp_numpy(const double * Xw, const double * pixels, int N, const Mat3 & K, + std::vector & inliers, double thresh_px, int iters, uint64_t seed) { + const Mat3 Kinv = inv3(K); + std::vector xn((size_t) 2 * N); + for (int i = 0; i < N; i++) { + Vec3 uv = { pixels[2*i], pixels[2*i+1], 1.0 }; + Vec3 n = mat3_apply(Kinv, uv); + xn[2*i] = n[0]; xn[2*i+1] = n[1]; + } + const double thresh = thresh_px / K(0, 0); + + Rng rng(seed); + std::vector best(N, 0); + int best_cnt = 0; + for (int it = 0; it < iters; it++) { + int idx[6]; + sample_distinct(rng, N, 6, idx); + RT rt = decode(dlt(Xw, xn.data(), idx, 6), Xw, xn.data(), idx, 6); + if (!rt.ok) continue; + int cnt = 0; + for (int i = 0; i < N; i++) { + Vec3 xi = { Xw[3*i], Xw[3*i+1], Xw[3*i+2] }; + Vec3 Xc = mat3_apply(rt.R, xi); + Xc[2] += rt.t[2]; + if (Xc[2] <= 0) continue; + Xc[0] += rt.t[0]; Xc[1] += rt.t[1]; + const double dx = Xc[0]/Xc[2] - xn[2*i], dy = Xc[1]/Xc[2] - xn[2*i+1]; + if (std::sqrt(dx*dx + dy*dy) < thresh) cnt++; + } + if (cnt > best_cnt) { + best_cnt = cnt; + best.assign(N, 0); + for (int i = 0; i < N; i++) { + Vec3 xi = { Xw[3*i], Xw[3*i+1], Xw[3*i+2] }; + Vec3 Xc = mat3_apply(rt.R, xi); + Xc[0] += rt.t[0]; Xc[1] += rt.t[1]; Xc[2] += rt.t[2]; + if (Xc[2] <= 0) continue; + const double dx = Xc[0]/Xc[2] - xn[2*i], dy = Xc[1]/Xc[2] - xn[2*i+1]; + best[i] = (std::sqrt(dx*dx + dy*dy) < thresh) ? 1 : 0; + } + } + } + std::vector sel; + for (int i = 0; i < N; i++) if (best[i]) sel.push_back(i); + if ((int) sel.size() < 6) { sel.clear(); for (int i = 0; i < N; i++) sel.push_back(i); } + RT rt = decode(dlt(Xw, xn.data(), sel.data(), (int) sel.size()), + Xw, xn.data(), sel.data(), (int) sel.size()); + + Mat4 world2cam = mat4_identity(); + for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) world2cam(i, j) = rt.R(i, j); world2cam(i, 3) = rt.t[i]; } + inliers = best; + return inv_rigid4(world2cam); +} + +// ---- estimate_poses orchestration ----------------------------------------- + +PoseResult estimate_poses(const std::vector & points, + const std::vector & opacities, + int H, int W, double opacity_threshold, + double focal, int pnp_iter, bool normalize, uint64_t seed) { + const int Nv = (int) points.size(); + const int P = H * W; + const double ppx = W / 2.0, ppy = H / 2.0; + std::vector grid; + pixel_grid(H, W, grid); + + // per-view opacity masks + std::vector> masks(Nv); + for (int v = 0; v < Nv; v++) { + masks[v].assign(P, 1); + if (v < (int) opacities.size() && opacities[v]) { + for (int i = 0; i < P; i++) masks[v][i] = (opacities[v][i] > opacity_threshold) ? 1 : 0; + } + } + + // focal: view 0 only, ALL pixels (use_first_focal, scene recipe) + if (focal <= 0.0) { + std::vector pts(3 * P), pix(2 * P); + for (int i = 0; i < P; i++) { + pts[3*i] = points[0][3*i]; pts[3*i+1] = points[0][3*i+1]; pts[3*i+2] = points[0][3*i+2]; + pix[2*i] = grid[2*i]; pix[2*i+1] = grid[2*i+1]; + } + focal = estimate_focal(pts.data(), pix.data(), P, ppx, ppy); + } + Mat3 K = mat3_identity(); + K(0,0) = focal; K(1,1) = focal; K(0,2) = ppx; K(1,2) = ppy; + + PoseResult res; + res.focal = focal; + res.scale = 1.0; + for (int v = 0; v < Nv; v++) { + std::vector Xw, px; + for (int i = 0; i < P; i++) if (masks[v][i]) { + Xw.push_back(points[v][3*i]); Xw.push_back(points[v][3*i+1]); Xw.push_back(points[v][3*i+2]); + px.push_back(grid[2*i]); px.push_back(grid[2*i+1]); + } + std::vector inl; + Mat4 c2w = solve_pnp_numpy(Xw.data(), px.data(), (int) (Xw.size()/3), K, inl, 5.0, pnp_iter, seed); + res.cam2world.push_back(c2w); + } + + if (normalize && Nv >= 2) { + Vec3 t0 = { res.cam2world.front()(0,3), res.cam2world.front()(1,3), res.cam2world.front()(2,3) }; + Vec3 tl = { res.cam2world.back()(0,3), res.cam2world.back()(1,3), res.cam2world.back()(2,3) }; + const double baseline = std::sqrt((tl[0]-t0[0])*(tl[0]-t0[0]) + (tl[1]-t0[1])*(tl[1]-t0[1]) + (tl[2]-t0[2])*(tl[2]-t0[2])) + 1e-2; + res.scale = 1.0 / baseline; + for (auto & c : res.cam2world) for (int i = 0; i < 3; i++) c(i, 3) *= res.scale; + } + return res; +} + +} // namespace pose +} // namespace free_splatter diff --git a/src/pose.h b/src/pose.h new file mode 100644 index 0000000..e511e32 --- /dev/null +++ b/src/pose.h @@ -0,0 +1,104 @@ +// pose.h — downstream camera-pose recovery + cross-run alignment (C++ port). +// +// This is the C++ port of the proven `pose/` research prototype (focal.py + +// align.py + pnp.py): given the engine's per-pixel gaussians it recovers each +// view's camera (PnP), and aligns successive runs into one accumulating world +// (Sim(3)). It is dependency-free — only the self-contained linalg.h — so it +// ships from the CLI / C API with no Python and no Eigen/OpenCV (see CLAUDE.md). +// +// Faithful to FreeSplatter's scene recipe (estimate_poses -> DUSt3R fast_pnp): +// the numpy reference solver, verified ~1e-7 against cv2 and bit-exact to upstream +// estimate_poses on real engine output (pose/check_upstream_parity.py). +// +// Conventions: f64 throughout; a similarity acts as x -> s*(R@x)+t; gaussian +// channel layout (scene, 23ch) is xyz[0:3] SH[3:15] opacity[15] scale[16:19] +// rotation[19:23], opacity already activated (sigmoid) in [0,1]. +#ifndef FREE_SPLATTER_POSE_H +#define FREE_SPLATTER_POSE_H + +#include "linalg.h" + +#include +#include + +namespace free_splatter { +namespace pose { + +using fsla::Vec3; +using fsla::Mat3; +using fsla::Mat4; + +// ---- focal (Weiszfeld shared-focal, mirrors pose/focal.py) ----------------- +// Robust (L1) focal from camera-frame points and their pixels, principal point +// pp. N points, pts3d is N*3 (x,y,z), pixels is N*2 (col,row), all row-major. +double estimate_focal(const double * pts3d, const double * pixels, int N, + double ppx, double ppy, int weiszfeld_iters = 10); + +// ---- similarity alignment (mirrors pose/align.py) -------------------------- +struct Sim3 { double s; Mat3 R; Vec3 t; }; // x -> s*(R@x)+t + +Sim3 sim_identity(); +Vec3 sim_apply(const Sim3 & T, const Vec3 & x); +Sim3 sim_compose(const Sim3 & T2, const Sim3 & T1); // apply T1 then T2 +Sim3 sim_invert(const Sim3 & T); +Mat4 sim_matrix(const Sim3 & T); // 4x4 [[sR,t],[0,1]] + +// Umeyama closed-form (s,R,t) minimizing ||sRX+t - Y||^2. with_scale=false -> rigid. +// X, Y are N*3 row-major. +Sim3 fit_similarity(const double * X, const double * Y, int N, bool with_scale = true); + +// RANSAC similarity robust to gross outliers; fills `inliers` (size N, 0/1). +Sim3 fit_similarity_ransac(const double * X, const double * Y, int N, double thresh, + int iters, std::vector & inliers, + bool with_scale = true, uint64_t seed = 0); + +// Residual ladder (rigid -> similarity -> affine) + depth correlation, and the +// verdict that classifies the A<->B mismatch. Mirrors align.diagnose. +struct Ladder { + double scene, rigid, similarity, affine, scale, depth_corr, aff_gain; + bool structured; + std::string verdict; // rigid_ok | needs_scale | needs_affine | nonlinear | similarity_plus_noise +}; +Ladder diagnose(const double * X, const double * Y, int N, + double tol = 1e-3, double corr_tol = 0.3); + +// Loop-closure metrics for a closed chain of similarities (deviation from I). +struct LoopError { double scale_err, rot_deg, trans; }; +LoopError loop_closure_error(const std::vector & links); + +// Fractional power M^f of a Sim(3) 4x4 (even loop-closure relaxation). Real part +// of the eigendecomposition; valid while rotation < 180deg. Mirrors sim_frac_power. +Mat4 sim_frac_power(const Mat4 & M, double f); + +// ---- PnP (mirrors pose/pnp.py numpy backend) ------------------------------- +// RANSAC DLT PnP with known intrinsics K. Xw is N*3 world points, pixels is N*2 +// (col,row). Returns cam2world (4x4) and fills `inliers` (size N, 0/1). +Mat4 solve_pnp_numpy(const double * Xw, const double * pixels, int N, const Mat3 & K, + std::vector & inliers, + double thresh_px = 5.0, int iters = 100, uint64_t seed = 0); + +// Integer pixel grid (col,row), row-major over H*W — matches upstream xy_grid +// (no half-pixel offset). Fills `out` with 2*H*W doubles. +void pixel_grid(int H, int W, std::vector & out); + +// ---- estimate_poses orchestration (scene recipe, mirrors pnp.estimate_poses) - +struct PoseResult { + std::vector cam2world; + double focal; + double scale; +}; +// points: N views, each H*W*3 (gaussian centers, view-0 frame). opacities: N +// views, each H*W (activated; pass nullptr entries for no mask). focal<=0 -> +// estimate from view 0 over all pixels (use_first_focal). normalize -> the +// runner's 1/baseline camera rescale. +PoseResult estimate_poses(const std::vector & points, + const std::vector & opacities, + int H, int W, + double opacity_threshold = 0.05, + double focal = -1.0, int pnp_iter = 100, + bool normalize = false, uint64_t seed = 0); + +} // namespace pose +} // namespace free_splatter + +#endif // FREE_SPLATTER_POSE_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7f9f3dc..49ddd7a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -13,6 +13,7 @@ endfunction() free_splatter_add_test(test_loader) # GGUF KV/hparams round-trip free_splatter_add_test(test_graph_blocks) # golden-op pins (ggml ops we depend on) free_splatter_add_test(test_image) # untrusted image-input validation +free_splatter_add_test(test_pose) # pose math golden tests (focal/align/PnP) # Per-layer parity vs the float64 PyTorch reference. Labelled "model": runs only # under `ctest -L model` and self-skips (77) unless FREE_SPLATTER_GGUF and diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp new file mode 100644 index 0000000..5341ae1 --- /dev/null +++ b/tests/test_pose.cpp @@ -0,0 +1,352 @@ +// Asset-free golden tests for the C++ pose component (src/pose.{h,cpp}) — the +// direct mirror of pose/test_pose.py. No model, no fixtures, no cv2: synthetic +// geometry with KNOWN ground truth, the way CLAUDE.md wants new ops verified +// ("pinned to hand-computed references... a wrong op should fail with zero +// fixtures"). Runs in the fast tier (ctest -LE model). +// +// The RNG differs from numpy's (the checks are tolerance-based on recovered +// quantities, not bitstream-matched), so this validates the C++ port to the same +// correctness bar the Python prototype meets, independently of numpy. +#include "pose.h" + +#include +#include +#include +#include +#include + +using namespace free_splatter::pose; +using fsla::Vec3; +using fsla::Mat3; +using fsla::Mat4; + +static int failures = 0; +static std::mt19937_64 RNG(12345); + +static void check(const char * name, bool cond, const char * detail = "") { + std::printf(" [%s] %s%s%s\n", cond ? "PASS" : "FAIL", name, + detail[0] ? " " : "", detail); + if (!cond) failures++; +} + +// --- helpers --------------------------------------------------------------- + +static Mat3 rand_rotation() { + std::normal_distribution g(0, 1); + Vec3 ax = { g(RNG), g(RNG), g(RNG) }; + double n = std::sqrt(ax[0]*ax[0] + ax[1]*ax[1] + ax[2]*ax[2]); + ax = { ax[0]/n, ax[1]/n, ax[2]/n }; + std::uniform_real_distribution a(0.2, M_PI - 0.2); + const double ang = a(RNG), c = std::cos(ang), s = std::sin(ang), v = 1 - c; + const double x = ax[0], y = ax[1], z = ax[2]; + return Mat3{ { c+x*x*v, x*y*v-z*s, x*z*v+y*s, + y*x*v+z*s, c+y*y*v, y*z*v-x*s, + z*x*v-y*s, z*y*v+x*s, c+z*z*v } }; +} + +static double rot_angle(const Mat3 & R) { + const double tr = R(0,0) + R(1,1) + R(2,2); + return std::acos(std::max(-1.0, std::min(1.0, (tr-1)/2))) * 180.0 / M_PI; +} + +static Mat3 mul(const Mat3 & A, const Mat3 & B) { return fsla::mat3_mul(A, B); } +static Mat3 T(const Mat3 & A) { return fsla::mat3_transpose(A); } + +// apply (s,R,t) to a row-major N*3 array -> out +static void apply_sim_arr(double s, const Mat3 & R, const Vec3 & t, + const double * X, int N, double * out) { + for (int i = 0; i < N; i++) { + Vec3 x = { X[3*i], X[3*i+1], X[3*i+2] }; + Vec3 Rx = fsla::mat3_apply(R, x); + out[3*i] = s*Rx[0] + t[0]; + out[3*i+1] = s*Rx[1] + t[1]; + out[3*i+2] = s*Rx[2] + t[2]; + } +} + +static double rms3(const double * a, const double * b, int N) { + double s = 0; + for (int i = 0; i < 3*N; i++) { const double d = a[i]-b[i]; s += d*d; } + return std::sqrt(s / N); +} + +static Vec3 apply_mat4(const Mat4 & M, const Vec3 & p) { + return { M(0,0)*p[0]+M(0,1)*p[1]+M(0,2)*p[2]+M(0,3), + M(1,0)*p[0]+M(1,1)*p[1]+M(1,2)*p[2]+M(1,3), + M(2,0)*p[0]+M(2,1)*p[1]+M(2,2)*p[2]+M(2,3) }; +} + +// =========================================================================== +// Coordinate-system normalization (the scale question) +// =========================================================================== + +static void test_similarity_roundtrip() { + std::printf("test_similarity_roundtrip\n"); + const int N = 200; + std::normal_distribution g(0, 1); + std::vector X(3*N), Y(3*N); + for (int i = 0; i < N; i++) { X[3*i]=g(RNG)*3; X[3*i+1]=g(RNG)*2; X[3*i+2]=g(RNG)*5; } + const double s_t = 1.7; Mat3 R_t = rand_rotation(); Vec3 t_t = { 4.0, -1.0, 2.0 }; + apply_sim_arr(s_t, R_t, t_t, X.data(), N, Y.data()); + + Sim3 T_ = fit_similarity(X.data(), Y.data(), N); + char buf[64]; + std::snprintf(buf, sizeof buf, "%.12f vs %g", T_.s, s_t); + check("scale", std::fabs(T_.s - s_t) < 1e-9, buf); + check("rotation", rot_angle(mul(T_.R, T(R_t))) < 1e-7); + check("translation", std::sqrt((T_.t[0]-t_t[0])*(T_.t[0]-t_t[0]) + + (T_.t[1]-t_t[1])*(T_.t[1]-t_t[1]) + (T_.t[2]-t_t[2])*(T_.t[2]-t_t[2])) < 1e-7); + std::vector Yp(3*N); + apply_sim_arr(T_.s, T_.R, T_.t, X.data(), N, Yp.data()); + check("residual~0", rms3(Yp.data(), Y.data(), N) < 1e-9); +} + +static void test_scale_detection() { + std::printf("test_scale_detection\n"); + const int N = 300; + std::normal_distribution g(0, 1); + std::vector X(3*N), Y(3*N); + for (int i = 0; i < 3*N; i++) X[i] = g(RNG)*2; + const double s_t = 2.5; + apply_sim_arr(s_t, rand_rotation(), {1.0, 2.0, -3.0}, X.data(), N, Y.data()); + Ladder L = diagnose(X.data(), Y.data(), N); + check("recovered scale", std::fabs(L.scale - s_t) < 1e-8); + check("rigid residual large", L.rigid / L.scene > 0.1); + check("similarity residual ~0", L.similarity / L.scene < 1e-9); + check("verdict needs_scale", L.verdict == "needs_scale", L.verdict.c_str()); +} + +static void test_nonlinear_detection() { + std::printf("test_nonlinear_detection\n"); + const int N = 400; + std::normal_distribution g(0, 1); + std::vector X(3*N); + for (int i = 0; i < N; i++) { X[3*i]=g(RNG)*2; X[3*i+1]=g(RNG)*2; X[3*i+2]=g(RNG)*2 + 6.0; } + + // (a) anisotropic (affine) warp + Mat3 A = mul(Mat3{ {1.0,0,0, 0,1.6,0, 0,0,0.7} }, rand_rotation()); + std::vector Yaff(3*N); + for (int i = 0; i < N; i++) { + Vec3 x = { X[3*i], X[3*i+1], X[3*i+2] }; + Vec3 Ax = fsla::mat3_apply(A, x); + Yaff[3*i]=Ax[0]+1.0; Yaff[3*i+1]=Ax[1]; Yaff[3*i+2]=Ax[2]+2.0; + } + Ladder La = diagnose(X.data(), Yaff.data(), N); + check("affine: similarity fails", La.similarity / La.scene > 1e-3); + check("affine: affine fits", La.affine / La.scene < 1e-9); + check("affine: verdict needs_affine", La.verdict == "needs_affine", La.verdict.c_str()); + + // (b) depth-quadratic (focal-error fingerprint) + double zmean = 0; for (int i = 0; i < N; i++) zmean += X[3*i+2]; zmean /= N; + double zvar = 0; for (int i = 0; i < N; i++) zvar += (X[3*i+2]-zmean)*(X[3*i+2]-zmean); zvar /= N; + std::vector Ynl(X); + for (int i = 0; i < N; i++) { + const double Z = X[3*i+2]; + Ynl[3*i] *= (1.0 + 0.15 * (Z-zmean)*(Z-zmean) / zvar); + } + Ladder Ln = diagnose(X.data(), Ynl.data(), N); + check("nonlinear: affine fails", Ln.affine / Ln.scene > 1e-3); + check("nonlinear: verdict nonlinear", Ln.verdict == "nonlinear", Ln.verdict.c_str()); + check("nonlinear: depth-correlated residual", Ln.structured); +} + +static void test_outlier_robustness() { + std::printf("test_outlier_robustness\n"); + const int N = 400; + std::normal_distribution g(0, 1); + std::vector X(3*N), Y(3*N); + for (int i = 0; i < 3*N; i++) X[i] = g(RNG)*2; + const double s_t = 1.3; Mat3 R_t = rand_rotation(); Vec3 t_t = { 0.5, -2.0, 1.0 }; + apply_sim_arr(s_t, R_t, t_t, X.data(), N, Y.data()); + const int n_out = (int)(0.3 * N); + std::vector oidx(N); for (int i = 0; i < N; i++) oidx[i] = i; + std::shuffle(oidx.begin(), oidx.end(), RNG); + for (int k = 0; k < n_out; k++) { int i = oidx[k]; Y[3*i]+=g(RNG)*10; Y[3*i+1]+=g(RNG)*10; Y[3*i+2]+=g(RNG)*10; } + std::vector inl; + Sim3 T_ = fit_similarity_ransac(X.data(), Y.data(), N, 0.05, 300, inl); + check("scale recovered", std::fabs(T_.s - s_t) < 1e-3); + check("rotation recovered", rot_angle(mul(T_.R, T(R_t))) < 0.1); + check("translation recovered", std::sqrt((T_.t[0]-t_t[0])*(T_.t[0]-t_t[0]) + + (T_.t[1]-t_t[1])*(T_.t[1]-t_t[1]) + (T_.t[2]-t_t[2])*(T_.t[2]-t_t[2])) < 1e-2); + int rejected = 0; for (int k = 0; k < n_out; k++) if (!inl[oidx[k]]) rejected++; + check("outliers excluded", rejected > 0.9 * n_out); +} + +static void test_loop_correction() { + std::printf("test_loop_correction\n"); + Mat3 R = rand_rotation(); + Mat4 M = sim_matrix({ 1.07, R, {0.3, -0.2, 0.1} }); + Mat4 m0 = sim_frac_power(M, 0.0), m1 = sim_frac_power(M, 1.0), mh = sim_frac_power(M, 0.5); + bool id_ok = true, m1_ok = true; + for (int i = 0; i < 16; i++) { id_ok &= std::fabs(m0.a[i] - fsla::mat4_identity().a[i]) < 1e-9; + m1_ok &= std::fabs(m1.a[i] - M.a[i]) < 1e-9; } + Mat4 mh2 = fsla::mat4_mul(mh, mh); + bool half_ok = true; for (int i = 0; i < 16; i++) half_ok &= std::fabs(mh2.a[i] - M.a[i]) < 1e-9; + check("M^0 == I", id_ok); + check("M^1 == M", m1_ok); + check("(M^.5)^2 == M", half_ok); + + const int n = 12; + Mat4 D = sim_matrix({ 1.15, R, {0.4, -0.2, 0.1} }); + std::vector clean(n+1), drift(n+1), corr(n+1); + for (int k = 0; k <= n; k++) { + const double tt = 2*M_PI * k / n; + clean[k] = { std::cos(tt), std::sin(tt), 0.1*tt }; + drift[k] = apply_mat4(sim_frac_power(D, -(double)k/n), clean[k]); // D^{-k/n} + corr[k] = apply_mat4(sim_frac_power(D, (double)k/n), drift[k]); // D^{ k/n} + } + double pre = 0, post = 0; + for (int k = 0; k <= n; k++) { + for (int c = 0; c < 3; c++) { pre += (drift[k][c]-clean[k][c])*(drift[k][c]-clean[k][c]); + post += (corr[k][c]-clean[k][c])*(corr[k][c]-clean[k][c]); } + } + check("drift present before", std::sqrt(pre/(n+1)) > 0.1); + char buf[64]; std::snprintf(buf, sizeof buf, "ATE %.1e", std::sqrt(post/(n+1))); + check("recovers clean loop", std::sqrt(post/(n+1)) < 1e-9, buf); +} + +static void test_loop_closure() { + std::printf("test_loop_closure\n"); + std::uniform_real_distribution us(0.8, 1.2); + std::normal_distribution g(0, 1); + std::vector links; + for (int i = 0; i < 4; i++) links.push_back({ us(RNG), rand_rotation(), {g(RNG), g(RNG), g(RNG)} }); + Sim3 acc = sim_identity(); + for (int i = 0; i < 3; i++) acc = sim_compose(links[i], acc); + std::vector closed = { links[0], links[1], links[2], sim_invert(acc) }; + LoopError e = loop_closure_error(closed); + // rot_deg has a ~sqrt(eps) (~1e-6 deg) floor: it is acos(~1) of RᵀR, whose + // deviation from I is at the fp roundoff level — scale_err and trans are + // exactly 0 here, so 1e-5 deg cleanly separates "closed" from any real drift. + check("closed loop ~ identity", e.scale_err < 1e-9 && e.rot_deg < 1e-5 && e.trans < 1e-9); + std::vector bad = closed; + bad[0].s *= 1.12; + LoopError e2 = loop_closure_error(bad); + check("scale drift detected", std::fabs(e2.scale_err - std::log(1.12)) < 1e-9); +} + +// =========================================================================== +// PnP (verified against analytic ground truth) +// =========================================================================== + +static Mat3 make_K(double f, double W, double H) { + Mat3 K = fsla::mat3_identity(); + K(0,0) = f; K(1,1) = f; K(0,2) = W/2; K(1,2) = H/2; + return K; +} + +// project world points by (R,t) world2cam through K; fill px (N*2) and z (N) +static void project(const double * Pw, int N, const Mat3 & R, const Vec3 & t, + const Mat3 & K, double * px, double * z) { + for (int i = 0; i < N; i++) { + Vec3 p = { Pw[3*i], Pw[3*i+1], Pw[3*i+2] }; + Vec3 Xc = fsla::mat3_apply(R, p); + Xc[0]+=t[0]; Xc[1]+=t[1]; Xc[2]+=t[2]; + px[2*i] = K(0,0)*Xc[0]/Xc[2] + K(0,2); + px[2*i+1] = K(1,1)*Xc[1]/Xc[2] + K(1,2); + z[i] = Xc[2]; + } +} + +static void test_focal_recovery() { + std::printf("test_focal_recovery\n"); + const int N = 500; const double f = 437.0, W = 512, H = 512; + std::uniform_real_distribution ux(-2,2), uy(-2,2), uz(4,9); + std::vector Xc(3*N), px(2*N); + for (int i = 0; i < N; i++) { + Xc[3*i]=ux(RNG); Xc[3*i+1]=uy(RNG); Xc[3*i+2]=uz(RNG); + px[2*i] = f*Xc[3*i] /Xc[3*i+2] + W/2; + px[2*i+1] = f*Xc[3*i+1]/Xc[3*i+2] + H/2; + } + double fe = estimate_focal(Xc.data(), px.data(), N, W/2, H/2); + char buf[64]; std::snprintf(buf, sizeof buf, "%.9f vs %g", fe, f); + check("exact focal", std::fabs(fe - f) < 1e-6, buf); + std::normal_distribution g(0, 0.5); + std::vector pn(px); + for (int i = 0; i < 2*N; i++) pn[i] += g(RNG); + double fn = estimate_focal(Xc.data(), pn.data(), N, W/2, H/2); + check("focal under noise", std::fabs(fn - f)/f < 0.02); +} + +static void test_pnp_recovery() { + std::printf("test_pnp_recovery\n"); + const int N = 500; Mat3 K = make_K(400, 512, 512); + std::uniform_real_distribution ux(-2,2), uy(-2,2), uz(4,8); + std::vector Pw(3*N); + for (int i = 0; i < N; i++) { Pw[3*i]=ux(RNG); Pw[3*i+1]=uy(RNG); Pw[3*i+2]=uz(RNG); } + std::normal_distribution g(0, 1); + std::uniform_real_distribution ut(-0.6, 0.6), ua(3, 20); + double worst_R = 0, worst_t = 0; + for (int trial = 0; trial < 8; trial++) { + Vec3 ax = { g(RNG), g(RNG), g(RNG) }; + double n = std::sqrt(ax[0]*ax[0]+ax[1]*ax[1]+ax[2]*ax[2]); + ax = { ax[0]/n, ax[1]/n, ax[2]/n }; + const double ang = ua(RNG) * M_PI/180.0, c = std::cos(ang), s = std::sin(ang), v = 1-c; + Mat3 R{ { c+ax[0]*ax[0]*v, ax[0]*ax[1]*v-ax[2]*s, ax[0]*ax[2]*v+ax[1]*s, + ax[1]*ax[0]*v+ax[2]*s, c+ax[1]*ax[1]*v, ax[1]*ax[2]*v-ax[0]*s, + ax[2]*ax[0]*v-ax[1]*s, ax[2]*ax[1]*v+ax[0]*s, c+ax[2]*ax[2]*v } }; + Vec3 t = { ut(RNG), ut(RNG), ut(RNG) }; + std::vector px(2*N), z(N); + project(Pw.data(), N, R, t, K, px.data(), z.data()); + bool behind = false; for (int i = 0; i < N; i++) if (z[i] <= 0) behind = true; + if (behind) continue; + std::vector inl; + Mat4 c2w = solve_pnp_numpy(Pw.data(), px.data(), N, K, inl); + Mat4 w2c = fsla::inv_rigid4(c2w); + Mat3 Rr{}; Vec3 tr{}; + for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) Rr(i,j)=w2c(i,j); tr[i]=w2c(i,3); } + worst_R = std::max(worst_R, (double) rot_angle(mul(Rr, T(R)))); + worst_t = std::max(worst_t, std::sqrt((tr[0]-t[0])*(tr[0]-t[0])+(tr[1]-t[1])*(tr[1]-t[1])+(tr[2]-t[2])*(tr[2]-t[2]))); + } + char buf[64]; std::snprintf(buf, sizeof buf, "worst %.2e deg", worst_R); + check("rotation recovered", worst_R < 1e-3, buf); + check("translation recovered", worst_t < 1e-4); +} + +static void test_pnp_outliers() { + std::printf("test_pnp_outliers\n"); + const int N = 600; Mat3 K = make_K(400, 512, 512); + std::uniform_real_distribution ux(-2,2), uy(-2,2), uz(4,8); + std::vector Pw(3*N); + for (int i = 0; i < N; i++) { Pw[3*i]=ux(RNG); Pw[3*i+1]=uy(RNG); Pw[3*i+2]=uz(RNG); } + Mat3 R = rand_rotation(); + // shrink towards identity so all points stay in front + for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) R(i,j) = (i==j?1.0:0.0)*0.85 + R(i,j)*0.15; + { fsla::Mat3 U,V; Vec3 s; fsla::svd3(R, U, s, V); R = mul(U, T(V)); if (fsla::det3(R)<0) for(int i=0;i<3;i++) R(i,2)=-R(i,2); } + Vec3 t = { 0.3, -0.2, 0.4 }; + std::vector px(2*N), z(N); + project(Pw.data(), N, R, t, K, px.data(), z.data()); + // keep only points in front + std::vector Pk, pk; + for (int i = 0; i < N; i++) if (z[i] > 0) { for(int c=0;c<3;c++) Pk.push_back(Pw[3*i+c]); pk.push_back(px[2*i]); pk.push_back(px[2*i+1]); } + const int M = (int) pk.size()/2; + const int n_out = (int)(0.25 * M); + std::vector oidx(M); for (int i = 0; i < M; i++) oidx[i]=i; + std::shuffle(oidx.begin(), oidx.end(), RNG); + std::uniform_real_distribution corrupt(-150, 150); + for (int k = 0; k < n_out; k++) { int i = oidx[k]; pk[2*i]+=corrupt(RNG); pk[2*i+1]+=corrupt(RNG); } + std::vector inl; + Mat4 c2w = solve_pnp_numpy(Pk.data(), pk.data(), M, K, inl, 2.0, 300); + Mat4 w2c = fsla::inv_rigid4(c2w); + Mat3 Rr{}; Vec3 tr{}; + for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) Rr(i,j)=w2c(i,j); tr[i]=w2c(i,3); } + char buf[64]; std::snprintf(buf, sizeof buf, "%.3f deg", (double) rot_angle(mul(Rr, T(R)))); + check("rotation under outliers", rot_angle(mul(Rr, T(R))) < 0.2, buf); + check("translation under outliers", std::sqrt((tr[0]-t[0])*(tr[0]-t[0])+(tr[1]-t[1])*(tr[1]-t[1])+(tr[2]-t[2])*(tr[2]-t[2])) < 0.02); + int rejected = 0; for (int k = 0; k < n_out; k++) if (!inl[oidx[k]]) rejected++; + check("outliers rejected", rejected > 0.85 * n_out); +} + +int main() { + test_similarity_roundtrip(); + test_scale_detection(); + test_nonlinear_detection(); + test_outlier_robustness(); + test_loop_correction(); + test_loop_closure(); + test_focal_recovery(); + test_pnp_recovery(); + test_pnp_outliers(); + std::printf(failures ? "\ntest_pose: %d FAILURES\n" : "\ntest_pose: ok\n", failures); + return failures ? 1 : 0; +} From 1142a5192c45d18a34074383308db7a7438401a5 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 10:53:00 +0100 Subject: [PATCH 09/33] pose/: robust C++ PnP (EPnP + Gauss-Newton) -- cv2-parity on real scenes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DLT/RANSAC solver inherits the planar/mirror degeneracy on real dumps (seed-dependent: ~3/5 seeds near cv2, ~2/5 a ~135-152deg flip). Add the shipped real-data solver, solve_pnp: * EPnP (Lepetit/Moreno-Noguer/Fua) for the init -- barycentric control points, the camera-frame control points from the 12x12 MᵀM null space (the same Jacobi eigensolver), beta solve for N=1,2,3 with cheirality sign-fix, R,t via the rigid Umeyama on the 4 control points, best-N by reprojection. Non-iterative, uses ALL points (no random minimal samples -> no seed-dependent flips), planar-robust by construction. * Huber-robust Gauss-Newton reprojection refine (6-DoF left perturbation, J = [-[Xc]_x | I]) to polish to the reprojection minimum and downweight outliers -- the deterministic analogue of cv2's RANSAC+SQPNP+refine. On the real scene dump (A_scn): deterministic across all RANSAC seeds, and within 0.73deg rotation / 0.74deg translation-direction of the upstream cv2/SQPNP -- vs the numpy DLT's 175deg miss there. estimate_poses now uses solve_pnp; the DLT solve_pnp_numpy stays as the asset-free golden reference. Golden tests (all green under ASan/UBSan, ctest -LE model): exact recovery on clean data, a near-planar slab (where the DLT's coplanar minimal samples flip), and 15% gross-outlier rejection. Co-Authored-By: Claude Opus 4.8 (1M context) --- pose/README.md | 23 ++-- src/pose.cpp | 277 +++++++++++++++++++++++++++++++++++++++++++- src/pose.h | 18 ++- tests/test_pose.cpp | 92 +++++++++++++++ 4 files changed, 396 insertions(+), 14 deletions(-) diff --git a/pose/README.md b/pose/README.md index e827981..8bcf72c 100644 --- a/pose/README.md +++ b/pose/README.md @@ -24,16 +24,19 @@ test tier (`tests/test_pose.cpp`, `ctest -LE model`): `sim_frac_power`** — `sim_frac_power` is a closed-form Sim(3) one-parameter subgroup (`A^f=s^f R^f`, translation `(A^f−I)(A−I)⁻¹t`), no complex eig needed. All 9 golden tests (the mirror of `test_pose.py`) pass under ASan/UBSan. -- ⚠ **`pnp.py` (numpy backend) → `solve_pnp_numpy` + `estimate_poses`** — DLT (via - the 12×12 `AᵀA` nullspace, which sidesteps the numpy reference's full-SVD OOM) - + RANSAC + cheirality decode. **Correct on clean synthetic data** (golden - tests), but on a real scene dump it inherits the **DLT algorithm's planar/mirror - degeneracy**: ~3/5 RANSAC seeds land near the cv2/SQPNP answer (58° vs cv2's - 57° relative rotation), ~2/5 collapse to a ~135–152° flip. This is the *same* - instability the numpy reference has — which is exactly why the prototype used - **cv2 (SQPNP) for every real-data result**. **Next step:** a robust in-house PnP - (EPnP/SQPNP + Gauss-Newton refine) to reach cv2-parity on real scenes with no - OpenCV dependency. +- ✅ **Robust PnP → `solve_pnp` (EPnP + Gauss-Newton)** — `solve_pnp_numpy` (DLT + + RANSAC, via the 12×12 `AᵀA` nullspace) is kept as the asset-free golden-test + reference, but on real scenes the DLT inherits the textbook **planar/mirror + degeneracy** (seed-dependent: ~3/5 RANSAC seeds near cv2, ~2/5 a ~135–152° + flip). The shipped real-data solver is `solve_pnp`: **EPnP** (planar-robust, + non-iterative, uses all points so there is no random minimal-sample flip — + reuses the 12×12 `MᵀM` Jacobi eigendecomposition) for the init, then a + **Huber-robust Gauss-Newton** reprojection refine. On the real scene dump it is + **deterministic** (identical across all RANSAC seeds) and lands at **0.73° of + the upstream cv2/SQPNP** relative rotation (0.74° translation direction) — i.e. + **cv2-parity on real data with no OpenCV dependency**, versus the numpy DLT's + 175° miss on the same dump. Golden tests cover exact recovery, a near-planar + config (where DLT flips), and 15% gross-outlier rejection. - ⬜ Not yet ported: accumulation chaining, loop closure, consensus fusion (these compose the primitives above), and the CLI / C-API surface. diff --git a/src/pose.cpp b/src/pose.cpp index 4bcafa8..fc39a23 100644 --- a/src/pose.cpp +++ b/src/pose.cpp @@ -6,6 +6,7 @@ // self-contained Jacobi eigensolver in linalg.h — no Eigen, no OpenCV. #include "pose.h" +#include #include #include @@ -504,12 +505,286 @@ Mat4 solve_pnp_numpy(const double * Xw, const double * pixels, int N, const Mat3 return inv_rigid4(world2cam); } +// ---- robust PnP: EPnP + Gauss-Newton -------------------------------------- +// +// EPnP (Lepetit/Moreno-Noguer/Fua 2009): express each world point as a +// barycentric combination of 4 control points; the camera-frame control points +// live in the null space of a 2n x 12 system, recovered from the eigenvectors of +// the 12x12 MᵀM (the same Jacobi eigensolver as everywhere else). It is +// non-iterative, uses ALL points (no random minimal samples), and handles +// near-planar configs — so it does not suffer the DLT's seed-dependent mirror +// flips. We try N=1,2,3 null vectors, pick the lowest reprojection error, then +// polish (R,t) with a Huber-robust Gauss-Newton on the reprojection residual. + +namespace { + +Mat3 skew_neg(const Vec3 & x) { // -[x]_× = ∂Xc/∂ω for left perturbation + return Mat3{ { 0, x[2], -x[1], + -x[2], 0, x[0], + x[1], -x[0], 0 } }; +} + +Mat3 so3_exp(const Vec3 & w) { // rotation from an axis-angle (rotation vector) + const double th = std::sqrt(w[0]*w[0] + w[1]*w[1] + w[2]*w[2]); + if (th < 1e-12) { // I + [w]_× to first order + Mat3 R = mat3_identity(); + R(0,1) = -w[2]; R(0,2) = w[1]; R(1,0) = w[2]; R(1,2) = -w[0]; R(2,0) = -w[1]; R(2,1) = w[0]; + return R; + } + return rodrigues({ w[0]/th, w[1]/th, w[2]/th }, th); +} + +double reproj_rms(const double * Xw, const double * px, int N, const Mat3 & K, + const Mat3 & R, const Vec3 & t) { + const double fu = K(0,0), fv = K(1,1), cu = K(0,2), cv = K(1,2); + double s = 0; int m = 0; + for (int i = 0; i < N; i++) { + Vec3 p = { Xw[3*i], Xw[3*i+1], Xw[3*i+2] }; + Vec3 Xc = mat3_apply(R, p); Xc[0]+=t[0]; Xc[1]+=t[1]; Xc[2]+=t[2]; + if (Xc[2] <= 1e-6) { s += 1e6; m++; continue; } + const double dx = fu*Xc[0]/Xc[2] + cu - px[2*i]; + const double dy = fv*Xc[1]/Xc[2] + cv - px[2*i+1]; + s += dx*dx + dy*dy; m++; + } + return std::sqrt(s / std::max(m, 1)); +} + +// Recover camera-frame control points from the null-space vector x (12), fix the +// global sign by cheirality (scene in front), and get world2cam (R,t) from the +// rigid fit between the 4 world and 4 camera control points. +void recover_Rt(const double * x12, const double cw[4][3], const double * alpha, int N, + Mat3 & R, Vec3 & t) { + double cc[12]; + for (int i = 0; i < 12; i++) cc[i] = x12[i]; + // mean camera-frame depth over the points (pc = sum_j alpha_ij cc_j) + double meanz = 0; + for (int i = 0; i < N; i++) { + double z = 0; + for (int j = 0; j < 4; j++) z += alpha[4*i+j] * cc[3*j+2]; + meanz += z; + } + if (meanz < 0) for (int i = 0; i < 12; i++) cc[i] = -cc[i]; + double cwf[12]; + for (int j = 0; j < 4; j++) for (int k = 0; k < 3; k++) cwf[3*j+k] = cw[j][k]; + Sim3 T = fit_similarity(cwf, cc, 4, /*with_scale=*/false); // rigid world->camera + R = T.R; t = T.t; +} + +// EPnP core: returns the best (R,t) world2cam over N=1,2,3 by reprojection error. +void epnp(const double * Xw, const double * px, int N, const Mat3 & K, Mat3 & Rout, Vec3 & tout) { + // 1. control points: centroid + principal axes (sqrt-eigenvalue scaled). + double c0[3] = {0,0,0}; + for (int i = 0; i < N; i++) { c0[0]+=Xw[3*i]; c0[1]+=Xw[3*i+1]; c0[2]+=Xw[3*i+2]; } + c0[0]/=N; c0[1]/=N; c0[2]/=N; + double Cov[9] = {0}; + for (int i = 0; i < N; i++) { + const double d[3] = { Xw[3*i]-c0[0], Xw[3*i+1]-c0[1], Xw[3*i+2]-c0[2] }; + for (int r = 0; r < 3; r++) for (int c = 0; c < 3; c++) Cov[r*3+c] += d[r]*d[c]; + } + for (int i = 0; i < 9; i++) Cov[i] /= N; + double ev[3], evec[9]; + fsla::jacobi_eigh(Cov, 3, ev, evec); + double lam_max = std::max({ev[0], ev[1], ev[2], 1e-12}); + double cw[4][3]; + for (int k = 0; k < 3; k++) cw[0][k] = c0[k]; + for (int j = 0; j < 3; j++) { + // floor the axis length so a near-planar scene keeps 4 affinely-independent + // control points (an invertible barycentric matrix). + const double s = std::sqrt(std::max(ev[j], 1e-6 * lam_max)); + for (int k = 0; k < 3; k++) cw[j+1][k] = c0[k] + s * evec[k*3 + j]; + } + + // 2. barycentric coordinates: alpha_i = C^{-1} [p_i; 1], C = [[cw^T];[1...]]. + double C[16]; + for (int j = 0; j < 4; j++) { for (int k = 0; k < 3; k++) C[k*4+j] = cw[j][k]; C[3*4+j] = 1.0; } + double Cinv[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1}; + solve_lin(C, 4, Cinv, 4); // Cinv now holds C^{-1} + std::vector alpha((size_t) 4 * N); + for (int i = 0; i < N; i++) { + const double p[4] = { Xw[3*i], Xw[3*i+1], Xw[3*i+2], 1.0 }; + for (int j = 0; j < 4; j++) { + double a = 0; for (int k = 0; k < 4; k++) a += Cinv[j*4+k]*p[k]; + alpha[4*i+j] = a; + } + } + + // 3. M (2n x 12) -> MtM (12 x 12). + const double fu = K(0,0), fv = K(1,1), cu = K(0,2), cv = K(1,2); + double MtM[144] = {0}; + for (int i = 0; i < N; i++) { + double r1[12] = {0}, r2[12] = {0}; + const double uu = px[2*i], vv = px[2*i+1]; + for (int j = 0; j < 4; j++) { + const double a = alpha[4*i+j]; + r1[3*j+0] = a*fu; r1[3*j+2] = a*(cu - uu); + r2[3*j+1] = a*fv; r2[3*j+2] = a*(cv - vv); + } + for (int r = 0; r < 12; r++) for (int c = 0; c < 12; c++) MtM[r*12+c] += r1[r]*r1[c] + r2[r]*r2[c]; + } + double mev[12], mevec[144]; + fsla::jacobi_eigh(MtM, 12, mev, mevec); + int ord[12]; for (int i = 0; i < 12; i++) ord[i] = i; + std::sort(ord, ord + 12, [&](int a, int b){ return mev[a] < mev[b]; }); + double V[3][12]; // the 3 smallest-eigenvalue vectors + for (int n = 0; n < 3; n++) for (int i = 0; i < 12; i++) V[n][i] = mevec[i*12 + ord[n]]; + + // control-point pairs and their world distances + const int pr[6][2] = {{0,1},{0,2},{0,3},{1,2},{1,3},{2,3}}; + double dw2[6]; + for (int p = 0; p < 6; p++) { + const int i = pr[p][0], j = pr[p][1]; + double d = 0; for (int k = 0; k < 3; k++) { const double e = cw[i][k]-cw[j][k]; d += e*e; } + dw2[p] = d; + } + auto blkdiff = [&](int n, int p, double out[3]) { + const int i = pr[p][0], j = pr[p][1]; + for (int k = 0; k < 3; k++) out[k] = V[n][3*i+k] - V[n][3*j+k]; + }; + auto dot3 = [](const double a[3], const double b[3]){ return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]; }; + + Mat3 bestR{}; Vec3 bestT{}; double best_err = 1e300; bool have = false; + auto consider = [&](const double x12[12]) { + Mat3 R; Vec3 t; + recover_Rt(x12, cw, alpha.data(), N, R, t); + const double e = reproj_rms(Xw, px, N, K, R, t); + if (e < best_err) { best_err = e; bestR = R; bestT = t; have = true; } + }; + + // N=1: x = beta * v0, beta from the 6 distance constraints (least squares). + { + double num = 0, den = 0; + for (int p = 0; p < 6; p++) { double d0[3]; blkdiff(0,p,d0); + const double nd = std::sqrt(dot3(d0,d0)); num += nd*std::sqrt(dw2[p]); den += nd*nd; } + const double beta = (den > 0) ? num/den : 0.0; + double x[12]; for (int i = 0; i < 12; i++) x[i] = beta*V[0][i]; + consider(x); + } + // N=2: x = b1 v0 + b2 v1; distance constraints linear in (b1^2,b1b2,b2^2). + { + double L[18], rho[6]; + for (int p = 0; p < 6; p++) { + double d0[3], d1[3]; blkdiff(0,p,d0); blkdiff(1,p,d1); + L[p*3+0] = dot3(d0,d0); L[p*3+1] = 2*dot3(d0,d1); L[p*3+2] = dot3(d1,d1); + rho[p] = dw2[p]; + } + double G[9] = {0}, g[3] = {0}; // normal equations (3x3) + for (int p = 0; p < 6; p++) { for (int r = 0; r < 3; r++) { for (int c = 0; c < 3; c++) G[r*3+c] += L[p*3+r]*L[p*3+c]; g[r] += L[p*3+r]*rho[p]; } } + solve_lin(G, 3, g, 1); // g = [b11,b12,b22] + const double b1 = std::sqrt(std::fabs(g[0])); + double b2 = std::sqrt(std::fabs(g[2])); + if (g[1] < 0) b2 = -b2; + double x[12]; for (int i = 0; i < 12; i++) x[i] = b1*V[0][i] + b2*V[1][i]; + consider(x); + } + // N=3: x = b1 v0 + b2 v1 + b3 v2; 6 constraints in 6 quadratic unknowns. + { + double L[36], rho[6]; + for (int p = 0; p < 6; p++) { + double d0[3], d1[3], d2[3]; blkdiff(0,p,d0); blkdiff(1,p,d1); blkdiff(2,p,d2); + L[p*6+0]=dot3(d0,d0); L[p*6+1]=2*dot3(d0,d1); L[p*6+2]=2*dot3(d0,d2); + L[p*6+3]=dot3(d1,d1); L[p*6+4]=2*dot3(d1,d2); L[p*6+5]=dot3(d2,d2); + rho[p]=dw2[p]; + } + double Lc[36], rc[6]; + for (int i = 0; i < 36; i++) Lc[i] = L[i]; + for (int i = 0; i < 6; i++) rc[i] = rho[i]; + solve_lin(Lc, 6, rc, 1); // rc = [b11,b12,b13,b22,b23,b33] + const double b1 = std::sqrt(std::fabs(rc[0])); + const double b2 = (b1 > 1e-12) ? rc[1]/b1 : 0.0; + const double b3 = (b1 > 1e-12) ? rc[2]/b1 : 0.0; + double x[12]; for (int i = 0; i < 12; i++) x[i] = b1*V[0][i] + b2*V[1][i] + b3*V[2][i]; + consider(x); + } + + if (!have) { Rout = mat3_identity(); tout = {0,0,0}; return; } + Rout = bestR; tout = bestT; +} + +// Huber-robust Gauss-Newton on the reprojection residual; 6-DoF left perturbation +// (R <- exp(dw) R, t += dt). Refines an EPnP init to the reprojection minimum and +// downweights gross outliers, so the deterministic EPnP+GN matches cv2's +// RANSAC+SQPNP+refine without random sampling. +void gauss_newton_pnp(const double * Xw, const double * px, int N, const Mat3 & K, + Mat3 & R, Vec3 & t, int iters, double huber_px) { + const double fu = K(0,0), fv = K(1,1), cu = K(0,2), cv = K(1,2); + for (int it = 0; it < iters; it++) { + double H[36] = {0}, b[6] = {0}; + for (int i = 0; i < N; i++) { + Vec3 p = { Xw[3*i], Xw[3*i+1], Xw[3*i+2] }; + Vec3 Xc = mat3_apply(R, p); Xc[0]+=t[0]; Xc[1]+=t[1]; Xc[2]+=t[2]; + if (Xc[2] <= 1e-6) continue; + const double iz = 1.0/Xc[2]; + const double rx = fu*Xc[0]*iz + cu - px[2*i]; + const double ry = fv*Xc[1]*iz + cv - px[2*i+1]; + const double rn = std::sqrt(rx*rx + ry*ry); + const double w = (rn <= huber_px || rn < 1e-12) ? 1.0 : huber_px/rn; + // dproj/dXc (2x3) + const double Jp[2][3] = { { fu*iz, 0, -fu*Xc[0]*iz*iz }, + { 0, fv*iz, -fv*Xc[1]*iz*iz } }; + // dXc/d(delta) (3x6) = [ -[Xc]_x | I ] + const Mat3 S = skew_neg(Xc); + double Jd[3][6]; + for (int r = 0; r < 3; r++) { Jd[r][0]=S(r,0); Jd[r][1]=S(r,1); Jd[r][2]=S(r,2); + Jd[r][3]=(r==0); Jd[r][4]=(r==1); Jd[r][5]=(r==2); } + double J[2][6]; + for (int a = 0; a < 2; a++) for (int c = 0; c < 6; c++) { + double s = 0; for (int k = 0; k < 3; k++) s += Jp[a][k]*Jd[k][c]; J[a][c] = s; } + const double r2[2] = { rx, ry }; + for (int a = 0; a < 2; a++) { + for (int c = 0; c < 6; c++) { + b[c] += w * J[a][c] * r2[a]; + for (int d = 0; d < 6; d++) H[c*6+d] += w * J[a][c] * J[a][d]; + } + } + } + double rhs[6]; for (int i = 0; i < 6; i++) rhs[i] = -b[i]; + solve_lin(H, 6, rhs, 1); // solve H delta = -b + Vec3 dw = { rhs[0], rhs[1], rhs[2] }; + R = mat3_mul(so3_exp(dw), R); + t[0]+=rhs[3]; t[1]+=rhs[4]; t[2]+=rhs[5]; + if (std::fabs(rhs[0])+std::fabs(rhs[1])+std::fabs(rhs[2])+ + std::fabs(rhs[3])+std::fabs(rhs[4])+std::fabs(rhs[5]) < 1e-12) break; + } +} + +} // namespace + +Mat4 solve_pnp_epnp(const double * Xw, const double * pixels, int N, const Mat3 & K) { + Mat3 R; Vec3 t; + epnp(Xw, pixels, N, K, R, t); + Mat4 w2c = mat4_identity(); + for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) w2c(i,j) = R(i,j); w2c(i,3) = t[i]; } + return inv_rigid4(w2c); +} + +Mat4 solve_pnp(const double * Xw, const double * pixels, int N, const Mat3 & K, + std::vector & inliers, double thresh_px, int gn_iters) { + Mat3 R; Vec3 t; + epnp(Xw, pixels, N, K, R, t); + gauss_newton_pnp(Xw, pixels, N, K, R, t, gn_iters, thresh_px); + + inliers.assign(N, 0); + const double fu = K(0,0), fv = K(1,1), cu = K(0,2), cv = K(1,2); + for (int i = 0; i < N; i++) { + Vec3 p = { Xw[3*i], Xw[3*i+1], Xw[3*i+2] }; + Vec3 Xc = mat3_apply(R, p); Xc[0]+=t[0]; Xc[1]+=t[1]; Xc[2]+=t[2]; + if (Xc[2] <= 1e-6) continue; + const double dx = fu*Xc[0]/Xc[2] + cu - pixels[2*i]; + const double dy = fv*Xc[1]/Xc[2] + cv - pixels[2*i+1]; + if (std::sqrt(dx*dx + dy*dy) < thresh_px) inliers[i] = 1; + } + Mat4 w2c = mat4_identity(); + for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) w2c(i,j) = R(i,j); w2c(i,3) = t[i]; } + return inv_rigid4(w2c); +} + // ---- estimate_poses orchestration ----------------------------------------- PoseResult estimate_poses(const std::vector & points, const std::vector & opacities, int H, int W, double opacity_threshold, double focal, int pnp_iter, bool normalize, uint64_t seed) { + (void) pnp_iter; (void) seed; // robust PnP is deterministic (EPnP+GN) const int Nv = (int) points.size(); const int P = H * W; const double ppx = W / 2.0, ppy = H / 2.0; @@ -547,7 +822,7 @@ PoseResult estimate_poses(const std::vector & points, px.push_back(grid[2*i]); px.push_back(grid[2*i+1]); } std::vector inl; - Mat4 c2w = solve_pnp_numpy(Xw.data(), px.data(), (int) (Xw.size()/3), K, inl, 5.0, pnp_iter, seed); + Mat4 c2w = solve_pnp(Xw.data(), px.data(), (int) (Xw.size()/3), K, inl, 5.0, 10); res.cam2world.push_back(c2w); } diff --git a/src/pose.h b/src/pose.h index e511e32..3da4ac2 100644 --- a/src/pose.h +++ b/src/pose.h @@ -70,13 +70,25 @@ LoopError loop_closure_error(const std::vector & links); // of the eigendecomposition; valid while rotation < 180deg. Mirrors sim_frac_power. Mat4 sim_frac_power(const Mat4 & M, double f); -// ---- PnP (mirrors pose/pnp.py numpy backend) ------------------------------- -// RANSAC DLT PnP with known intrinsics K. Xw is N*3 world points, pixels is N*2 -// (col,row). Returns cam2world (4x4) and fills `inliers` (size N, 0/1). +// ---- PnP ------------------------------------------------------------------- +// RANSAC DLT PnP with known intrinsics K (mirrors pose/pnp.py numpy backend). +// Xw is N*3 world points, pixels is N*2 (col,row). Returns cam2world (4x4) and +// fills `inliers` (size N, 0/1). Kept as the asset-free golden-test reference; on +// real scenes the DLT inherits the planar/mirror degeneracy — use solve_pnp. Mat4 solve_pnp_numpy(const double * Xw, const double * pixels, int N, const Mat3 & K, std::vector & inliers, double thresh_px = 5.0, int iters = 100, uint64_t seed = 0); +// Robust PnP: EPnP (planar-robust, deterministic — uses all points, no random +// minimal samples) for the initial pose, then a Gauss-Newton / Huber reprojection +// refine. Reaches cv2/SQPNP-grade poses on real scenes with no OpenCV dependency. +// Fills `inliers` (reprojection < thresh_px). This is what estimate_poses uses. +Mat4 solve_pnp(const double * Xw, const double * pixels, int N, const Mat3 & K, + std::vector & inliers, double thresh_px = 5.0, int gn_iters = 10); + +// EPnP-only (no refine) — exposed for testing the planar-robust initialization. +Mat4 solve_pnp_epnp(const double * Xw, const double * pixels, int N, const Mat3 & K); + // Integer pixel grid (col,row), row-major over H*W — matches upstream xy_grid // (no half-pixel offset). Fills `out` with 2*H*W doubles. void pixel_grid(int H, int W, std::vector & out); diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp index 5341ae1..4a8f433 100644 --- a/tests/test_pose.cpp +++ b/tests/test_pose.cpp @@ -337,6 +337,95 @@ static void test_pnp_outliers() { check("outliers rejected", rejected > 0.85 * n_out); } +// =========================================================================== +// Robust PnP: EPnP + Gauss-Newton (the shipped real-data solver) +// =========================================================================== + +static void test_pnp_robust_recovery() { + std::printf("test_pnp_robust_recovery\n"); + const int N = 500; Mat3 K = make_K(400, 512, 512); + std::uniform_real_distribution ux(-2,2), uy(-2,2), uz(4,8); + std::vector Pw(3*N); + for (int i = 0; i < N; i++) { Pw[3*i]=ux(RNG); Pw[3*i+1]=uy(RNG); Pw[3*i+2]=uz(RNG); } + std::normal_distribution g(0, 1); + std::uniform_real_distribution ut(-0.6, 0.6); + double worst_R = 0, worst_t = 0; + for (int trial = 0; trial < 8; trial++) { + Mat3 R = rand_rotation(); + for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) R(i,j) = (i==j?1.0:0.0)*0.8 + R(i,j)*0.2; + { fsla::Mat3 U,V; Vec3 s; fsla::svd3(R,U,s,V); R = mul(U,T(V)); if (fsla::det3(R)<0) for(int i=0;i<3;i++) R(i,2)=-R(i,2); } + Vec3 t = { ut(RNG), ut(RNG), ut(RNG) }; + std::vector px(2*N), z(N); + project(Pw.data(), N, R, t, K, px.data(), z.data()); + bool behind = false; for (int i = 0; i < N; i++) if (z[i] <= 0) behind = true; + if (behind) continue; + std::vector inl; + Mat4 c2w = solve_pnp(Pw.data(), px.data(), N, K, inl); + Mat4 w2c = fsla::inv_rigid4(c2w); + Mat3 Rr{}; Vec3 tr{}; + for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) Rr(i,j)=w2c(i,j); tr[i]=w2c(i,3); } + worst_R = std::max(worst_R, (double) rot_angle(mul(Rr, T(R)))); + worst_t = std::max(worst_t, std::sqrt((tr[0]-t[0])*(tr[0]-t[0])+(tr[1]-t[1])*(tr[1]-t[1])+(tr[2]-t[2])*(tr[2]-t[2]))); + } + char buf[64]; std::snprintf(buf, sizeof buf, "worst %.2e deg", worst_R); + check("rotation recovered", worst_R < 1e-3, buf); + check("translation recovered", worst_t < 1e-5); +} + +static void test_pnp_robust_planar() { + // Near-planar scene (a thin slab) — the config where the DLT's coplanar + // minimal samples flip. EPnP's all-point control basis stays well-posed. + std::printf("test_pnp_robust_planar\n"); + const int N = 500; Mat3 K = make_K(400, 512, 512); + std::uniform_real_distribution ux(-2,2), uy(-2,2); + std::normal_distribution thin(0, 0.02); // ~planar, slight thickness + std::vector Pw(3*N); + for (int i = 0; i < N; i++) { Pw[3*i]=ux(RNG); Pw[3*i+1]=uy(RNG); Pw[3*i+2]=6.0 + thin(RNG); } + Mat3 R = rand_rotation(); + for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) R(i,j) = (i==j?1.0:0.0)*0.85 + R(i,j)*0.15; + { fsla::Mat3 U,V; Vec3 s; fsla::svd3(R,U,s,V); R = mul(U,T(V)); if (fsla::det3(R)<0) for(int i=0;i<3;i++) R(i,2)=-R(i,2); } + Vec3 t = { 0.2, -0.1, 0.3 }; + std::vector px(2*N), z(N); + project(Pw.data(), N, R, t, K, px.data(), z.data()); + std::vector inl; + Mat4 w2c = fsla::inv_rigid4(solve_pnp(Pw.data(), px.data(), N, K, inl)); + Mat3 Rr{}; Vec3 tr{}; + for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) Rr(i,j)=w2c(i,j); tr[i]=w2c(i,3); } + char buf[64]; std::snprintf(buf, sizeof buf, "%.4f deg", (double) rot_angle(mul(Rr, T(R)))); + check("planar rotation recovered", rot_angle(mul(Rr, T(R))) < 0.05, buf); + check("planar translation recovered", std::sqrt((tr[0]-t[0])*(tr[0]-t[0])+(tr[1]-t[1])*(tr[1]-t[1])+(tr[2]-t[2])*(tr[2]-t[2])) < 0.01); +} + +static void test_pnp_robust_outliers() { + std::printf("test_pnp_robust_outliers\n"); + const int N = 600; Mat3 K = make_K(400, 512, 512); + std::uniform_real_distribution ux(-2,2), uy(-2,2), uz(4,8); + std::vector Pw(3*N); + for (int i = 0; i < N; i++) { Pw[3*i]=ux(RNG); Pw[3*i+1]=uy(RNG); Pw[3*i+2]=uz(RNG); } + Mat3 R = rand_rotation(); + for (int i = 0; i < 3; i++) for (int j = 0; j < 3; j++) R(i,j) = (i==j?1.0:0.0)*0.85 + R(i,j)*0.15; + { fsla::Mat3 U,V; Vec3 s; fsla::svd3(R,U,s,V); R = mul(U,T(V)); if (fsla::det3(R)<0) for(int i=0;i<3;i++) R(i,2)=-R(i,2); } + Vec3 t = { 0.3, -0.2, 0.4 }; + std::vector px(2*N), z(N); + project(Pw.data(), N, R, t, K, px.data(), z.data()); + std::vector Pk, pk; + for (int i = 0; i < N; i++) if (z[i] > 0) { for(int c=0;c<3;c++) Pk.push_back(Pw[3*i+c]); pk.push_back(px[2*i]); pk.push_back(px[2*i+1]); } + const int M = (int) pk.size()/2; + const int n_out = (int)(0.15 * M); // 15% gross pixel corruption + std::vector oidx(M); for (int i = 0; i < M; i++) oidx[i]=i; + std::shuffle(oidx.begin(), oidx.end(), RNG); + std::uniform_real_distribution corrupt(-120, 120); + for (int k = 0; k < n_out; k++) { int i = oidx[k]; pk[2*i]+=corrupt(RNG); pk[2*i+1]+=corrupt(RNG); } + std::vector inl; + Mat4 w2c = fsla::inv_rigid4(solve_pnp(Pk.data(), pk.data(), M, K, inl, 3.0)); + Mat3 Rr{}; Vec3 tr{}; + for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) Rr(i,j)=w2c(i,j); tr[i]=w2c(i,3); } + char buf[64]; std::snprintf(buf, sizeof buf, "%.3f deg", (double) rot_angle(mul(Rr, T(R)))); + check("rotation under outliers", rot_angle(mul(Rr, T(R))) < 0.5, buf); + int rejected = 0; for (int k = 0; k < n_out; k++) if (!inl[oidx[k]]) rejected++; + check("outliers rejected", rejected > 0.85 * n_out); +} + int main() { test_similarity_roundtrip(); test_scale_detection(); @@ -347,6 +436,9 @@ int main() { test_focal_recovery(); test_pnp_recovery(); test_pnp_outliers(); + test_pnp_robust_recovery(); + test_pnp_robust_planar(); + test_pnp_robust_outliers(); std::printf(failures ? "\ntest_pose: %d FAILURES\n" : "\ntest_pose: ok\n", failures); return failures ? 1 : 0; } From 93393e8ecb1a9ed34b08783a0bf6c73cccc48eb3 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 15:45:21 +0100 Subject: [PATCH 10/33] pose/: C++ accumulation chaining (Accumulator) -- one world from a photo stream Port accumulate.py to src/pose.{h,cpp}: the sliding-window accumulating- reconstruction loop. Accumulator::add_pair takes each consecutive pair's [2,H,W,gc] engine output, recovers the pair's cameras (estimate_poses), fits the cross-run Sim(3) on the shared frame (fit_similarity_ransac), composes a global chain, and drops every new frame's gaussians into one world. Exposes cloud(), camera_path(), and per-link ChainLink diagnostics (scale/inlier/valid/resid). Validation: - Asset-free golden (test_accumulate_chain): synthetic pinhole clip with distinct per-run scales -> trajectory ATE 7.6e-8 of extent, per-link scale to 3e-9, fit residual ~1e-7. Green under ASan/UBSan. - Real-data parity on the 13 cached pair_*.f32 dumps vs the numpy/cv2 prototype: cloud size bit-exact (2,633,725), per-link valid% identical (deterministic mask), per-link Sim(3) scale to mean 0.5% (11/12 links <1%), trajectory within 6.6% of the cv2 chain. Residual = known RANSAC-RNG + EPnP-vs-SQPNP delta. Co-Authored-By: Claude Opus 4.8 (1M context) --- pose/README.md | 15 +++- src/pose.cpp | 200 ++++++++++++++++++++++++++++++++++++++++++++ src/pose.h | 81 ++++++++++++++++++ tests/test_pose.cpp | 155 ++++++++++++++++++++++++++++++++++ 4 files changed, 449 insertions(+), 2 deletions(-) diff --git a/pose/README.md b/pose/README.md index 8bcf72c..866cb20 100644 --- a/pose/README.md +++ b/pose/README.md @@ -37,8 +37,19 @@ test tier (`tests/test_pose.cpp`, `ctest -LE model`): **cv2-parity on real data with no OpenCV dependency**, versus the numpy DLT's 175° miss on the same dump. Golden tests cover exact recovery, a near-planar config (where DLT flips), and 15% gross-outlier rejection. -- ⬜ Not yet ported: accumulation chaining, loop closure, consensus fusion (these - compose the primitives above), and the CLI / C-API surface. +- ✅ **Accumulation chaining → `Accumulator` (accumulate.py)** — the live + sliding-window loop: feed each consecutive pair's `[2,H,W,gc]` engine output, + it recovers the pair's cameras (`estimate_poses`), fits the cross-run Sim(3) on + the shared frame, composes a global chain, and drops every new frame's gaussians + into one world (`cloud()`, `camera_path()`). Golden-tested (synthetic clip: + trajectory ATE 7.6e-8 of extent, per-link scale to 3e-9). **Real-data parity** + on the 13 cached `pair_*.f32` dumps vs the numpy/cv2 prototype: cloud size + **bit-exact** (2,633,725) and per-link valid% identical (deterministic mask); + per-link Sim(3) scale agrees to **mean 0.5%** (11/12 links <1%, worst on the + documented low-inlier leg); trajectory within **6.6%** of the cv2 chain — the + residual is the known RANSAC-RNG + EPnP-vs-SQPNP delta, not a port gap. +- ⬜ Not yet ported: loop closure, consensus fusion (these compose the primitives + above), and the CLI / C-API surface. ## Why it's needed diff --git a/src/pose.cpp b/src/pose.cpp index fc39a23..01ea13e 100644 --- a/src/pose.cpp +++ b/src/pose.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace free_splatter { namespace pose { @@ -836,5 +837,204 @@ PoseResult estimate_poses(const std::vector & points, return res; } +// ---- accumulation --------------------------------------------------------- + +namespace { +inline float clamp01(float v) { return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v); } +const double SH_C0 = 0.28209479177387814; // SH degree-0 basis (DC -> rgb) +} // namespace + +Accumulator::Accumulator(int H, int W, double opacity_threshold, + double ransac_thresh_frac, int ransac_iters, uint64_t seed) + : H_(H), W_(W), thr_(opacity_threshold), rthr_(ransac_thresh_frac), + riters_(ransac_iters), seed_(seed), T_(sim_identity()), final_cam_(mat4_identity()) {} + +void Accumulator::add_view(const float * pts, const float * op, const float * rgb, + const Sim3 & T, int frame) { + const int P = H_ * W_; + for (int i = 0; i < P; i++) { + if (op[i] <= thr_) continue; + Vec3 x = { pts[3*i], pts[3*i+1], pts[3*i+2] }; + Vec3 w = sim_apply(T, x); + AccumPoint p; + p.x = (float) w[0]; p.y = (float) w[1]; p.z = (float) w[2]; + p.r = rgb[3*i]; p.g = rgb[3*i+1]; p.b = rgb[3*i+2]; + p.frame = frame; + cloud_.push_back(p); + } +} + +ChainLink Accumulator::add_pair(const float * g, int gc, double focal) { + const int P = H_ * W_; + // de-interleave the two views: points (ch 0:3), opacity (ch 15, already + // sigmoid-activated), rgb = clip(SH_DC * C0 + 0.5, 0, 1) (ch 3:6). + std::vector pts0(3*P), pts1(3*P), op0(P), op1(P), rgb0(3*P), rgb1(3*P); + for (int i = 0; i < P; i++) { + const float * a0 = g + (size_t) i * gc; + const float * a1 = g + (size_t) (P + i) * gc; + for (int c = 0; c < 3; c++) { pts0[3*i+c] = a0[c]; pts1[3*i+c] = a1[c]; } + op0[i] = a0[15]; op1[i] = a1[15]; + for (int c = 0; c < 3; c++) { + rgb0[3*i+c] = clamp01((float) (a0[3+c] * SH_C0 + 0.5)); + rgb1[3*i+c] = clamp01((float) (a1[3+c] * SH_C0 + 0.5)); + } + } + + // recover this pair's cameras (view-0 frame, use_first_focal) + std::vector pts = { pts0.data(), pts1.data() }; + std::vector ops = { op0.data(), op1.data() }; + PoseResult pr = estimate_poses(pts, ops, H_, W_, thr_, focal, 100, false, seed_); + const Mat4 c2w0 = pr.cam2world[0], c2w1 = pr.cam2world[1]; + + ChainLink link{}; + if (!have_prev_) { + T_ = sim_identity(); + link.sim = sim_identity(); + link.global = T_; + link.scale = 1.0; link.inlier_frac = 1.0; link.valid_frac = 1.0; link.resid_frac = 0.0; + add_view(pts0.data(), op0.data(), rgb0.data(), T_, next_frame_++); // f_0 + add_view(pts1.data(), op1.data(), rgb1.data(), T_, next_frame_++); // f_1 + } else { + // fit Sim(3) from this run's view 0 (the shared frame, run-k coords) to the + // previous run's view 1 (same frame, run-(k-1) coords). Mask = valid in both. + std::vector XA, XB; + for (int i = 0; i < P; i++) if (op0[i] > thr_ && prev_mask1_[i]) { + XA.push_back(pts0[3*i]); XA.push_back(pts0[3*i+1]); XA.push_back(pts0[3*i+2]); + XB.push_back(prev_pts1_[3*i]); XB.push_back(prev_pts1_[3*i+1]); XB.push_back(prev_pts1_[3*i+2]); + } + const int n = (int) (XA.size() / 3); + double myB[3]; mean3(XB.data(), n, myB); + std::vector ycB(3*n); + for (int i = 0; i < n; i++) for (int c = 0; c < 3; c++) ycB[3*i+c] = XB[3*i+c] - myB[c]; + const double scene = rms3(ycB.data(), n); + + std::vector inl; + Sim3 S = fit_similarity_ransac(XA.data(), XB.data(), n, rthr_ * scene, riters_, inl, true, seed_); + T_ = sim_compose(T_, S); // run k -> global + + int n_inl = 0; + std::vector rA, rB; + for (int i = 0; i < n; i++) if (inl[i]) { + n_inl++; + Vec3 a = { XA[3*i], XA[3*i+1], XA[3*i+2] }; + Vec3 sa = sim_apply(S, a); + for (int c = 0; c < 3; c++) { rA.push_back(sa[c] - XB[3*i+c]); } + (void) rB; + } + link.sim = S; + link.global = T_; + link.scale = S.s; + link.inlier_frac = (n > 0) ? (double) n_inl / n : 0.0; + link.valid_frac = (double) n / P; + link.resid_frac = (scene > 0 && n_inl > 0) ? rms3(rA.data(), n_inl) / scene : 0.0; + + add_view(pts1.data(), op1.data(), rgb1.data(), T_, next_frame_++); // f_{k+1} + } + + cams_.push_back(mat4_mul(sim_matrix(T_), c2w0)); // frame f_k camera (view 0) + final_cam_ = mat4_mul(sim_matrix(T_), c2w1); // latest view-1 camera + + // this run's view 1 becomes the next run's shared frame + prev_pts1_.assign(pts1.begin(), pts1.end()); + prev_mask1_.assign(P, 0); + for (int i = 0; i < P; i++) prev_mask1_[i] = (op1[i] > thr_) ? 1 : 0; + have_prev_ = true; + links_.push_back(link); + return link; +} + +std::vector Accumulator::camera_path() const { + std::vector path = cams_; + if (!cams_.empty()) path.push_back(final_cam_); + return path; +} + +// ---- consensus fusion ----------------------------------------------------- + +FuseStats consensus_fuse(const std::vector & cloud, double voxel_frac, int k, + std::vector & fused) { + fused.clear(); + FuseStats st{}; + st.raw_points = (int64_t) cloud.size(); + if (cloud.empty()) return st; + + // extent = rms distance to centroid; voxel = voxel_frac * extent; coords from min. + double mean[3] = {0,0,0}, lo[3] = {1e300,1e300,1e300}; + for (const auto & p : cloud) { + mean[0]+=p.x; mean[1]+=p.y; mean[2]+=p.z; + lo[0]=std::min(lo[0],(double)p.x); lo[1]=std::min(lo[1],(double)p.y); lo[2]=std::min(lo[2],(double)p.z); + } + for (int c = 0; c < 3; c++) mean[c] /= cloud.size(); + double ss = 0; + for (const auto & p : cloud) { + const double dx=p.x-mean[0], dy=p.y-mean[1], dz=p.z-mean[2]; + ss += dx*dx + dy*dy + dz*dz; + } + const double ext = std::sqrt(ss / cloud.size()); + const double v = voxel_frac * ext; + if (!(v > 0)) return st; + + struct Vox { double sx=0,sy=0,sz=0,sr=0,sg=0,sb=0; int64_t cnt=0; std::vector frames; }; + struct Key { int32_t i,j,k; bool operator==(const Key&o) const { return i==o.i&&j==o.j&&k==o.k; } }; + struct KeyHash { size_t operator()(const Key&q) const { + uint64_t h = (uint64_t)(uint32_t)q.i * 0x9E3779B97F4A7C15ULL; + h ^= (uint64_t)(uint32_t)q.j + 0x9E3779B9U + (h<<6) + (h>>2); + h ^= (uint64_t)(uint32_t)q.k + 0x85EBCA6BU + (h<<6) + (h>>2); + return (size_t) h; } }; + std::unordered_map grid; + grid.reserve(cloud.size() / 2 + 1); + + for (const auto & p : cloud) { + Key key{ (int32_t) std::floor((p.x - lo[0]) / v), + (int32_t) std::floor((p.y - lo[1]) / v), + (int32_t) std::floor((p.z - lo[2]) / v) }; + Vox & vx = grid[key]; + vx.sx+=p.x; vx.sy+=p.y; vx.sz+=p.z; vx.sr+=p.r; vx.sg+=p.g; vx.sb+=p.b; vx.cnt++; + bool seen = false; + for (int32_t f : vx.frames) if (f == p.frame) { seen = true; break; } + if (!seen) vx.frames.push_back(p.frame); + } + + st.voxels = (int64_t) grid.size(); + for (const auto & kv : grid) { + const Vox & vx = kv.second; + if ((int) vx.frames.size() < k) continue; + st.kept_voxels++; + st.points_kept += vx.cnt; + AccumPoint p; + p.x = (float) (vx.sx / vx.cnt); p.y = (float) (vx.sy / vx.cnt); p.z = (float) (vx.sz / vx.cnt); + p.r = (float) (vx.sr / vx.cnt); p.g = (float) (vx.sg / vx.cnt); p.b = (float) (vx.sb / vx.cnt); + p.frame = (int32_t) vx.frames.size(); // support count (informational) + fused.push_back(p); + } + st.points_dropped = st.raw_points - st.points_kept; + return st; +} + +// ---- loop closure --------------------------------------------------------- + +Mat4 sim4_invert(const Mat4 & M) { + Mat3 A{}; Vec3 t{}; + for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) A(i,j) = M(i,j); t[i] = M(i,3); } + const double s = std::cbrt(det3(A)); + Mat3 R = A; + for (int i = 0; i < 9; i++) R.a[i] /= s; + Sim3 inv = sim_invert(Sim3{ s, R, t }); + return sim_matrix(inv); +} + +std::vector distribute_drift(const Mat4 & D, const std::vector & global_poses) { + const int m = (int) global_poses.size(); + std::vector out(m); + const int n = std::max(m - 1, 1); + for (int k = 0; k < m; k++) { + Mat4 Dk = sim_frac_power(D, (double) k / n); + Vec3 c = { global_poses[k](0,3), global_poses[k](1,3), global_poses[k](2,3) }; + Vec3 r = mat3_apply(Mat3{ { Dk(0,0),Dk(0,1),Dk(0,2), Dk(1,0),Dk(1,1),Dk(1,2), Dk(2,0),Dk(2,1),Dk(2,2) } }, c); + out[k] = { r[0] + Dk(0,3), r[1] + Dk(1,3), r[2] + Dk(2,3) }; + } + return out; +} + } // namespace pose } // namespace free_splatter diff --git a/src/pose.h b/src/pose.h index 3da4ac2..dbe1a23 100644 --- a/src/pose.h +++ b/src/pose.h @@ -110,6 +110,87 @@ PoseResult estimate_poses(const std::vector & points, double focal = -1.0, int pnp_iter = 100, bool normalize = false, uint64_t seed = 0); +// ---- accumulation: chain successive runs into one world (mirrors accumulate.py) +// One accumulated point in the global frame, tagged with the source frame it came +// from (consensus fusion needs the frame id). rgb is in [0,1]. +struct AccumPoint { + float x, y, z; + float r, g, b; + int32_t frame; +}; + +// Diagnostics for chaining one run into the global frame. +struct ChainLink { + Sim3 sim; // run k -> run k-1 (local similarity); identity for run 0 + Sim3 global; // run k -> global frame (T_k = T_{k-1} . sim) + double scale; // sim.s — per-link scale drift + double inlier_frac; // RANSAC inliers / shared-frame valid points + double valid_frac; // pixels valid (opacity) in BOTH shared views + double resid_frac; // inlier RANSAC residual / target scene extent +}; + +// Sliding-window accumulator: feed it each consecutive pair's engine output +// ([2,H,W,gc], where view 0 shares a frame with the previous pair's view 1) and +// it recovers each pair's cameras (estimate_poses), fits the cross-run Sim(3) on +// the shared frame, composes a global chain, and drops every new frame's gaussians +// into one world. This is the live accumulating-reconstruction loop, in C++. +class Accumulator { +public: + Accumulator(int H, int W, double opacity_threshold = 0.05, + double ransac_thresh_frac = 0.02, int ransac_iters = 300, + uint64_t seed = 0); + + // gaussians: 2*H*W*gc floats (two views, engine channel layout). Recovers the + // pair's cameras, chains the Sim(3), accumulates the new frame(s). focal<=0 -> + // estimate per pair (use_first_focal). Returns the chain link just added. + ChainLink add_pair(const float * gaussians, int gaussian_channels, double focal = -1.0); + + const std::vector & cloud() const { return cloud_; } + const std::vector & links() const { return links_; } + // count+1 global cam2world (similarity) matrices: frame f_k = T_k . c2w_k[view0], + // plus the final frame f_n = T_{n-1} . c2w_{n-1}[view1]. Mirrors accumulate.py's `rec`. + std::vector camera_path() const; + int frame_count() const { return next_frame_; } + int pair_count() const { return (int) links_.size(); } + const Sim3 & global_transform() const { return T_; } + +private: + void add_view(const float * pts, const float * op, const float * rgb, + const Sim3 & T, int frame); + + int H_, W_; + double thr_, rthr_; + int riters_; + uint64_t seed_; + Sim3 T_; + bool have_prev_ = false; + std::vector prev_pts1_; // 3*P previous view-1 points (the shared frame) + std::vector prev_mask1_; // P previous view-1 opacity mask + std::vector cloud_; + std::vector cams_; // per-pair view-0 global cam2world + Mat4 final_cam_; // latest view-1 global cam2world (appended last) + std::vector links_; + int next_frame_ = 0; +}; + +// ---- consensus fusion (mirrors fuse.py) ----------------------------------- +struct FuseStats { + int64_t raw_points, voxels, kept_voxels, points_kept, points_dropped; +}; +// Voxelize the cloud at voxel_frac * cloud-extent and keep only voxels corroborated +// by >= k DISTINCT source frames, averaging the agreeing predictions (which also +// denoises the surface). `fused` receives one averaged point per consensus voxel. +FuseStats consensus_fuse(const std::vector & cloud, double voxel_frac, int k, + std::vector & fused); + +// ---- loop closure (mirrors loop_closure.py) ------------------------------- +// Inverse of a similarity 4x4 [[sR,t],[0,1]] (sR = s*rotation): [[(1/s)Rᵀ,...],[0,1]]. +Mat4 sim4_invert(const Mat4 & M); +// Distribute an accumulated Sim(3) drift D over a chain of n+1 nodes by applying +// D^(k/n) at node k (even relaxation, sim_frac_power). Returns the corrected camera +// centers. global_poses are the open-loop global cam2world (similarity) matrices. +std::vector distribute_drift(const Mat4 & D, const std::vector & global_poses); + } // namespace pose } // namespace free_splatter diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp index 4a8f433..53e5ec9 100644 --- a/tests/test_pose.cpp +++ b/tests/test_pose.cpp @@ -426,6 +426,158 @@ static void test_pnp_robust_outliers() { check("outliers rejected", rejected > 0.85 * n_out); } +// =========================================================================== +// Accumulation / fusion / loop closure (the orchestration on top of the +// primitives, mirrors accumulate.py / fuse.py / loop_closure.py) +// =========================================================================== + +// Synthetic multi-frame clip with KNOWN geometry: a pinhole camera on a smooth +// trajectory; each pair's gaussians are exact back-projections (engine channel +// layout), and consecutive runs are rescaled by a distinct per-run factor (the +// 1/baseline normalization that makes cross-run alignment a *similarity*). The +// accumulator must recover the camera trajectory (up to a global Sim(3)) and the +// per-link scale ratios. +static Mat4 cam2world_frame(int i) { + const double ang = i * 4.0 * M_PI / 180.0; // small yaw per frame + const double c = std::cos(ang), s = std::sin(ang); + Mat4 M = fsla::mat4_identity(); + M(0,0)=c; M(0,2)=s; M(2,0)=-s; M(2,2)=c; + M(0,3) = i * 0.4; // translate along x + return M; +} +static double depth_at(int i, int r, int c) { + return 5.0 + 0.3*std::sin(0.7*i) + 0.05*r - 0.03*c + + 0.4 * (((r*7 + c*13 + i*5) % 11) / 11.0); // non-coplanar spread +} + +static void test_accumulate_chain() { + std::printf("test_accumulate_chain\n"); + const int H = 16, W = 16, gc = 23, P = H*W, F = 5; // 5 frames -> 4 pairs + const double f = 18.0; + Mat3 K = make_K(f, W, H), Kinv = fsla::inv3(K); + auto backproj = [&](int i, int r, int c) -> Vec3 { + const double d = depth_at(i, r, c); + Vec3 uv = { (double) c, (double) r, 1.0 }; + Vec3 n = fsla::mat3_apply(Kinv, uv); + return { n[0]*d, n[1]*d, n[2]*d }; + }; + + Accumulator acc(H, W, /*opacity_threshold=*/0.05); + std::vector sigma(F); + for (int k = 0; k < F-1; k++) { // pair (k, k+1) -> run k + const double sk = 1.0 + 0.1*k; // distinct per-run scale + sigma[k] = sk; + Mat4 w2c_k = fsla::inv_rigid4(cam2world_frame(k)); + Mat4 rel = fsla::mat4_mul(w2c_k, cam2world_frame(k+1)); // cam_k <- cam_{k+1} + std::vector buf((size_t) 2 * P * gc, 0.0f); + for (int r = 0; r < H; r++) for (int c = 0; c < W; c++) { + const int i = r*W + c; + Vec3 X0 = backproj(k, r, c); // camera-k frame + Vec3 Xn = backproj(k+1, r, c); // camera-(k+1) frame + Vec3 X1 = apply_mat4(rel, Xn); // -> camera-k frame + float * v0 = &buf[(size_t) i * gc]; + float * v1 = &buf[(size_t) (P + i) * gc]; + for (int t = 0; t < 3; t++) { v0[t] = (float)(sk*X0[t]); v1[t] = (float)(sk*X1[t]); } + v0[15] = v1[15] = 0.9f; // opacity (activated) + for (int t = 0; t < 3; t++) { v0[3+t] = 0.1f*t; v1[3+t] = 0.1f*t; } // color + } + acc.add_pair(buf.data(), gc); + } + + check("pair count", acc.pair_count() == F-1); + check("frame count", acc.frame_count() == F); + check("cloud non-empty", acc.cloud().size() == (size_t) F * P); // all opacities valid + + // per-link scale recovers sigma_{k-1}/sigma_k + bool scales_ok = true; double worst_scale = 0; + for (int k = 1; k < F-1; k++) { + const double expect = sigma[k-1]/sigma[k]; + worst_scale = std::max(worst_scale, std::fabs(acc.links()[k].scale - expect)); + scales_ok &= std::fabs(acc.links()[k].scale - expect) < 1e-6; + } + { char buf[64]; std::snprintf(buf, sizeof buf, "worst |Δs| %.2e", worst_scale); + check("per-link scale recovered", scales_ok, buf); } + double worst_resid = 0; + for (int k = 1; k < F-1; k++) worst_resid = std::max(worst_resid, acc.links()[k].resid_frac); + { char buf[64]; std::snprintf(buf, sizeof buf, "%.2e", worst_resid); + check("per-link residual ~0", worst_resid < 1e-6, buf); } + + // recovered trajectory vs GT, after a global Sim(3) alignment (ATE) + std::vector path = acc.camera_path(); + std::vector rec(3*F), gt(3*F); + for (int i = 0; i < F; i++) { + rec[3*i]=path[i](0,3); rec[3*i+1]=path[i](1,3); rec[3*i+2]=path[i](2,3); + Mat4 g = cam2world_frame(i); + gt[3*i]=g(0,3); gt[3*i+1]=g(1,3); gt[3*i+2]=g(2,3); + } + Sim3 A = fit_similarity(rec.data(), gt.data(), F); + std::vector rec_al(3*F); + apply_sim_arr(A.s, A.R, A.t, rec.data(), F, rec_al.data()); + double mg[3]={0,0,0}; for (int i=0;i cloud; + // 5 surface points, each corroborated by frames {0,1,2} (tight cluster -> one + // voxel, support 3); 20 isolated single-frame floaters far apart (support 1). + for (int loc = 0; loc < 5; loc++) + for (int fr = 0; fr < 3; fr++) + cloud.push_back({ (float)(2.0*loc) + 0.001f*fr, 0.0f, 0.0f, 0.5f, 0.5f, 0.5f, fr }); + for (int j = 0; j < 20; j++) + cloud.push_back({ 40.0f + 2.0f*j, 0.0f, 0.0f, 0.2f, 0.2f, 0.2f, j % 4 }); + + std::vector fused; + FuseStats st = consensus_fuse(cloud, /*voxel_frac=*/0.02, /*k=*/2, fused); + check("raw points", st.raw_points == 35); + check("kept voxels (the surface)", st.kept_voxels == 5); + check("fused size", fused.size() == 5); + check("points kept", st.points_kept == 15); + check("floaters dropped", st.points_dropped == 20); + // a fused point sits at its cluster centroid (~2*loc), color preserved + bool centroid_ok = true; + for (const auto & p : fused) { + const double loc = std::round(p.x / 2.0); + centroid_ok &= std::fabs(p.x - 2.0*loc) < 0.01 && std::fabs(p.r - 0.5) < 1e-6; + } + check("fused centroid + color", centroid_ok); +} + +static void test_loop_distribute() { + std::printf("test_loop_distribute\n"); + Mat3 R = rand_rotation(); + Mat4 D = sim_matrix({ 1.15, R, {0.4, -0.2, 0.1} }); + // sim4_invert is a true inverse of a similarity 4x4 + Mat4 S = sim_matrix({ 1.3, rand_rotation(), {1.0, -2.0, 0.5} }); + Mat4 SI = fsla::mat4_mul(sim4_invert(S), S); + bool inv_ok = true; for (int i = 0; i < 16; i++) inv_ok &= std::fabs(SI.a[i] - fsla::mat4_identity().a[i]) < 1e-9; + check("sim4_invert is an inverse", inv_ok); + + // a clean loop, drifted by D^{-k/n}; distribute_drift(D) must cancel it. + const int n = 8; + std::vector poses(n+1); + std::vector clean(n+1); + for (int k = 0; k <= n; k++) { + const double tt = 2*M_PI * k / n; + clean[k] = { std::cos(tt), std::sin(tt), 0.1*tt }; + Vec3 drifted = apply_mat4(sim_frac_power(D, -(double)k/n), clean[k]); + Mat4 Pk = fsla::mat4_identity(); + Pk(0,3)=drifted[0]; Pk(1,3)=drifted[1]; Pk(2,3)=drifted[2]; + poses[k] = Pk; + } + std::vector corr = distribute_drift(D, poses); + double post = 0; + for (int k = 0; k <= n; k++) for (int c = 0; c < 3; c++) post += (corr[k][c]-clean[k][c])*(corr[k][c]-clean[k][c]); + char buf[64]; std::snprintf(buf, sizeof buf, "ATE %.1e", std::sqrt(post/(n+1))); + check("distribute_drift recovers clean loop", std::sqrt(post/(n+1)) < 1e-9, buf); +} + int main() { test_similarity_roundtrip(); test_scale_detection(); @@ -439,6 +591,9 @@ int main() { test_pnp_robust_recovery(); test_pnp_robust_planar(); test_pnp_robust_outliers(); + test_accumulate_chain(); + test_consensus_fuse(); + test_loop_distribute(); std::printf(failures ? "\ntest_pose: %d FAILURES\n" : "\ntest_pose: ok\n", failures); return failures ? 1 : 0; } From e3f316f7edb01641ff5b3d953b87c3c46237f588 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 15:49:23 +0100 Subject: [PATCH 11/33] pose/: validate C++ loop closure (sim4_invert + distribute_drift) vs Python The loop-closure machinery (sim4_invert, distribute_drift, sim_frac_power) shipped with the Accumulator commit; this records its parity validation. - sim_frac_power C++ closed-form == numpy eig-based to 5e-10 across f in [-0.5,1.3] (so distribute_drift is bit-identical to the prototype's distribution). - sim4_invert is an exact similarity inverse (1e-9); golden recovers a known drifted loop to 4e-16. - Real-data parity on the loopcache (13 chain pairs + close_0_260): recovered drift matches the prototype's loop error (scale 1.09 vs 1.12, 4.6 vs 4.4 deg), deterministic valid% identical. The corrected-trajectory delta is the known EPnP-vs-cv2 PnP backend feeding D, not the distribution math. Co-Authored-By: Claude Opus 4.8 (1M context) --- pose/README.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pose/README.md b/pose/README.md index 866cb20..f1769a2 100644 --- a/pose/README.md +++ b/pose/README.md @@ -48,8 +48,21 @@ test tier (`tests/test_pose.cpp`, `ctest -LE model`): per-link Sim(3) scale agrees to **mean 0.5%** (11/12 links <1%, worst on the documented low-inlier leg); trajectory within **6.6%** of the cv2 chain — the residual is the known RANSAC-RNG + EPnP-vs-SQPNP delta, not a port gap. -- ⬜ Not yet ported: loop closure, consensus fusion (these compose the primitives - above), and the CLI / C-API surface. +- ✅ **Loop closure → `sim4_invert` + `distribute_drift` (loop_closure.py)** — the + even Sim(3) relaxation: from an accumulated drift `D` (the loop-closure + measurement `P_n_loop · P_last⁻¹`), distribute `D^(k/n)` over the chain. + `sim_frac_power` (the closed-form one-parameter subgroup) is **numerically + identical to numpy's eig-based version (5e-10)** across f ∈ {−0.5 … 1.3}; + `sim4_invert` is an exact similarity inverse (1e-9); golden test recovers a + known drifted loop to 4e-16. **Real-data parity** on the cached loopcache (13 + chain pairs + `close_0_260`): the recovered drift matches the prototype's loop + error (scale 1.09 vs 1.12, **4.6° vs 4.4°**, README "4.4°"), deterministic + valid% identical; the corrected-trajectory delta is the (already-characterized) + EPnP-vs-cv2 PnP backend feeding `D`, since the distribution math itself is + bit-identical to numpy. Confirms the prototype's finding: on this short clip the + chain already closes, so loop closure is a near-identity correction. +- ⬜ Not yet ported: consensus fusion (composes the primitives above), and the + CLI / C-API surface. ## Why it's needed From 3fdf84488343e9192e796548bdc6cbac7dcf5512 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 15:50:59 +0100 Subject: [PATCH 12/33] pose/: validate C++ consensus fusion (consensus_fuse) vs Python fuse.py consensus_fuse shipped with the Accumulator commit (one hash-grid pass over the frame-tagged cloud: >=K distinct-frame voxels kept, agreeing predictions averaged). This records its parity validation. - Golden (test_consensus_fuse): exact counts on controlled synthetic support. - Real-data parity vs fuse.py on the 13 acc dumps (voxel 0.02, K>=2): raw points bit-exact (2,633,725), per-point floater drop 14.0% vs 14%, raw->fused reduction 93.9% vs 94%, kept-voxel fraction 53.8% vs 54%. Sub-1% voxel-count delta is the chaining RANSAC-RNG, not the fusion math. Reproduces the prototype's "remove the 14% single-frame edge-haze floaters" result. Co-Authored-By: Claude Opus 4.8 (1M context) --- pose/README.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pose/README.md b/pose/README.md index f1769a2..b5474ec 100644 --- a/pose/README.md +++ b/pose/README.md @@ -61,8 +61,17 @@ test tier (`tests/test_pose.cpp`, `ctest -LE model`): EPnP-vs-cv2 PnP backend feeding `D`, since the distribution math itself is bit-identical to numpy. Confirms the prototype's finding: on this short clip the chain already closes, so loop closure is a near-identity correction. -- ⬜ Not yet ported: consensus fusion (composes the primitives above), and the - CLI / C-API surface. +- ✅ **Consensus fusion → `consensus_fuse` (fuse.py)** — voxelize the accumulated + cloud at `voxel_frac · extent` and keep only voxels corroborated by ≥ K distinct + source frames, averaging the agreeing predictions (the per-point `frame` tag the + `Accumulator` carries makes this a single hash-grid pass). Golden test pins the + counts exactly on controlled synthetic support. **Real-data parity** vs `fuse.py` + on the 13 `acc` dumps (voxel 0.02, K≥2): raw points **bit-exact** (2,633,725), + per-point floater drop **14.0% vs 14%**, raw→fused reduction 93.9% vs 94%, + kept-voxel fraction 53.8% vs 54% — the sub-1% voxel-count delta is the chaining + RANSAC-RNG shifting positions at voxel boundaries, not the fusion math. The + prototype's "remove the 14% single-frame edge-haze floaters" result reproduces. +- ⬜ Not yet ported: the CLI / C-API surface (then this prototype is deleted). ## Why it's needed From 8529cd40e228321c0b876bbb4833a347484ff3d7 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 15:58:16 +0100 Subject: [PATCH 13/33] CLI + C-API: pose recovery + accumulation surface (no Python) Expose the now-ported pose pipeline from the C API and the CLI, with no Python runtime dependency (per CLAUDE.md): C API (include/free_splatter.h): - free_splatter_estimate_poses: recover per-view cam2world from an engine buffer. - free_splatter_accumulator_{new,free,add_pair,frame_count,cloud,fuse, camera_path}: opaque sliding-window accumulator wrapping pose::Accumulator; add_pair takes each consecutive pair's [2,H,W,gc] engine output, returns the growing global cloud (free_splatter_point: xyz+rgb+frame), the consensus-fused cloud, and the global camera trajectory. FFI-friendly, malloc'd buffers freed with free_splatter_buf_free. CLI (free_splatter-cli --accumulate): - runs the engine over each consecutive image pair, chains the runs into one world, and writes PREFIX_.splat after each pair (the evolving reconstruction) plus, with --fuse, a consensus-fused PREFIX_fused.splat. New write_cloud_splat emits the xyz+rgb cloud as small isotropic .splat gaussians. Verified end-to-end on real frames (5 frames -> evolving 519K/779K/.. splats + 103K fused), sanitizer-clean under the debug preset; asset-free ctest tier green. Co-Authored-By: Claude Opus 4.8 (1M context) --- include/free_splatter.h | 56 +++++++++++++++++ src/free_splatter.cpp | 119 ++++++++++++++++++++++++++++++++++++ tools/free_splatter-cli.cpp | 108 +++++++++++++++++++++++++++++++- 3 files changed, 281 insertions(+), 2 deletions(-) diff --git a/include/free_splatter.h b/include/free_splatter.h index 4707170..2913c94 100644 --- a/include/free_splatter.h +++ b/include/free_splatter.h @@ -77,6 +77,62 @@ FREE_SPLATTER_API int free_splatter_run(free_splatter_ctx * ctx, const float * int32_t height, int32_t width, float ** out, size_t * n_out); FREE_SPLATTER_API void free_splatter_buf_free(void * buf); // NULL-safe +// ---- downstream pose recovery + accumulation ---- +// The pose consumer of the engine seam: given the engine's [N,H,W,gaussian_channels] +// output it recovers each view's camera (PnP) and stitches successive runs into one +// accumulating world (cross-run Sim(3)). Dependency-free (no Python, no OpenCV). + +// Recover each view's camera from an engine output buffer. gaussians: +// n_views*height*width*gaussian_channels float32 (the layout free_splatter_run +// returns). cam2world_out: caller-owned n_views*16 float32, filled with a +// row-major 4x4 cam2world per view. If focal_out is non-NULL it receives the +// estimated shared focal. opacity_threshold gates valid pixels (engine output is +// already activated). Returns 0 on success, -1 on failure. +FREE_SPLATTER_API int free_splatter_estimate_poses( + const float * gaussians, int32_t n_views, int32_t height, int32_t width, + int32_t gaussian_channels, float opacity_threshold, + float * cam2world_out, float * focal_out); + +// One accumulated point in the global frame: position, color in [0,1], and the +// source frame it came from (for consensus fusion). +typedef struct { + float x, y, z; + float r, g, b; + int32_t frame; +} free_splatter_point; + +// Sliding-window accumulator: feed each consecutive pair's engine output and it +// chains the runs into one world. Opaque handle, not thread-safe. +typedef struct free_splatter_accumulator free_splatter_accumulator; +FREE_SPLATTER_API free_splatter_accumulator * free_splatter_accumulator_new( + int32_t height, int32_t width, float opacity_threshold); +FREE_SPLATTER_API void free_splatter_accumulator_free(free_splatter_accumulator * acc); // NULL-safe + +// Add one pair's engine output: 2*height*width*gaussian_channels float32, where +// view 0 shares a frame with the previous pair's view 1. Returns 0 on success. +FREE_SPLATTER_API int free_splatter_accumulator_add_pair( + free_splatter_accumulator * acc, const float * gaussians, int32_t gaussian_channels); + +// Number of frames accumulated so far (== pairs_added + 1, or 0 before any pair). +FREE_SPLATTER_API int free_splatter_accumulator_frame_count(const free_splatter_accumulator * acc); + +// Copy the current accumulated cloud: *out is malloc'd free_splatter_point[*n_out] +// (free with free_splatter_buf_free). Returns 0 on success, -1 on failure. +FREE_SPLATTER_API int free_splatter_accumulator_cloud( + const free_splatter_accumulator * acc, free_splatter_point ** out, size_t * n_out); + +// Consensus-fuse the current cloud: keep only voxels (size voxel_frac * extent) +// corroborated by >= k distinct source frames, averaging the agreeing predictions. +// *out malloc'd as above. Returns 0 on success, -1 on failure. +FREE_SPLATTER_API int free_splatter_accumulator_fuse( + const free_splatter_accumulator * acc, float voxel_frac, int32_t k, + free_splatter_point ** out, size_t * n_out); + +// Copy the global camera trajectory: *out malloc'd (*n_frames)*16 float32, a +// row-major 4x4 cam2world (similarity) per frame. Returns 0 on success. +FREE_SPLATTER_API int free_splatter_accumulator_camera_path( + const free_splatter_accumulator * acc, float ** out, int32_t * n_frames); + #ifdef __cplusplus } #endif diff --git a/src/free_splatter.cpp b/src/free_splatter.cpp index 3927622..53fd94f 100644 --- a/src/free_splatter.cpp +++ b/src/free_splatter.cpp @@ -3,12 +3,14 @@ #include "image.h" #include "model.h" #include "options.h" +#include "pose.h" #include #include #include #include #include +#include // ---- opaque types ---------------------------------------------------------- @@ -156,3 +158,120 @@ int free_splatter_run(free_splatter_ctx * ctx, const float * images, int32_t n_v } void free_splatter_buf_free(void * buf) { free(buf); } + +// ---- downstream pose recovery + accumulation ------------------------------- + +namespace { + +// De-interleave per-view contiguous points (3*P) + opacity (P) from a +// [n_views,H,W,gc] engine buffer. Returns per-view base pointers into `store`. +void deinterleave(const float * g, int nv, int P, int gc, + std::vector> & pts, + std::vector> & ops, + std::vector & pptr, + std::vector & optr) { + pts.assign(nv, {}); ops.assign(nv, {}); + pptr.resize(nv); optr.resize(nv); + for (int v = 0; v < nv; v++) { + pts[v].resize((size_t) 3 * P); ops[v].resize(P); + for (int i = 0; i < P; i++) { + const float * a = &g[(size_t) (v * P + i) * gc]; + pts[v][3*i] = a[0]; pts[v][3*i+1] = a[1]; pts[v][3*i+2] = a[2]; + ops[v][i] = a[15]; + } + pptr[v] = pts[v].data(); optr[v] = ops[v].data(); + } +} + +// Copy a pose vector's cloud into a malloc'd C array (ABI-stable layout). +int emit_points(const std::vector & src, + free_splatter_point ** out, size_t * n_out) { + *out = (free_splatter_point *) malloc(src.size() * sizeof(free_splatter_point)); + if (!*out && !src.empty()) return -1; + for (size_t i = 0; i < src.size(); i++) { + (*out)[i].x = src[i].x; (*out)[i].y = src[i].y; (*out)[i].z = src[i].z; + (*out)[i].r = src[i].r; (*out)[i].g = src[i].g; (*out)[i].b = src[i].b; + (*out)[i].frame = src[i].frame; + } + *n_out = src.size(); + return 0; +} + +} // namespace + +struct free_splatter_accumulator { + free_splatter::pose::Accumulator acc; + int gc = 0; + free_splatter_accumulator(int H, int W, double thr) : acc(H, W, thr) {} +}; + +int free_splatter_estimate_poses(const float * gaussians, int32_t n_views, int32_t height, + int32_t width, int32_t gaussian_channels, + float opacity_threshold, float * cam2world_out, float * focal_out) { + if (!gaussians || !cam2world_out || n_views < 1 || height < 1 || width < 1 || gaussian_channels < 16) + return -1; + const int P = height * width; + std::vector> pts, ops; + std::vector pptr, optr; + deinterleave(gaussians, n_views, P, gaussian_channels, pts, ops, pptr, optr); + free_splatter::pose::PoseResult pr = + free_splatter::pose::estimate_poses(pptr, optr, height, width, opacity_threshold); + for (int v = 0; v < n_views; v++) + for (int i = 0; i < 4; i++) for (int j = 0; j < 4; j++) + cam2world_out[v*16 + i*4 + j] = (float) pr.cam2world[v](i, j); + if (focal_out) *focal_out = (float) pr.focal; + return 0; +} + +free_splatter_accumulator * free_splatter_accumulator_new(int32_t height, int32_t width, + float opacity_threshold) { + if (height < 1 || width < 1) return nullptr; + return new (std::nothrow) free_splatter_accumulator(height, width, opacity_threshold); +} + +void free_splatter_accumulator_free(free_splatter_accumulator * acc) { delete acc; } + +int free_splatter_accumulator_add_pair(free_splatter_accumulator * acc, const float * gaussians, + int32_t gaussian_channels) { + if (!acc || !gaussians || gaussian_channels < 16) return -1; + acc->gc = gaussian_channels; + acc->acc.add_pair(gaussians, gaussian_channels); + return 0; +} + +int free_splatter_accumulator_frame_count(const free_splatter_accumulator * acc) { + return acc ? acc->acc.frame_count() : 0; +} + +int free_splatter_accumulator_cloud(const free_splatter_accumulator * acc, + free_splatter_point ** out, size_t * n_out) { + if (out) *out = nullptr; + if (n_out) *n_out = 0; + if (!acc || !out || !n_out) return -1; + return emit_points(acc->acc.cloud(), out, n_out); +} + +int free_splatter_accumulator_fuse(const free_splatter_accumulator * acc, float voxel_frac, + int32_t k, free_splatter_point ** out, size_t * n_out) { + if (out) *out = nullptr; + if (n_out) *n_out = 0; + if (!acc || !out || !n_out || k < 1 || !(voxel_frac > 0)) return -1; + std::vector fused; + free_splatter::pose::consensus_fuse(acc->acc.cloud(), voxel_frac, k, fused); + return emit_points(fused, out, n_out); +} + +int free_splatter_accumulator_camera_path(const free_splatter_accumulator * acc, + float ** out, int32_t * n_frames) { + if (out) *out = nullptr; + if (n_frames) *n_frames = 0; + if (!acc || !out || !n_frames) return -1; + std::vector path = acc->acc.camera_path(); + *out = (float *) malloc(path.size() * 16 * sizeof(float)); + if (!*out && !path.empty()) return -1; + for (size_t f = 0; f < path.size(); f++) + for (int i = 0; i < 4; i++) for (int j = 0; j < 4; j++) + (*out)[f*16 + i*4 + j] = (float) path[f](i, j); + *n_frames = (int32_t) path.size(); + return 0; +} diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp index 495f15a..683db95 100644 --- a/tools/free_splatter-cli.cpp +++ b/tools/free_splatter-cli.cpp @@ -93,19 +93,66 @@ static bool write_splat(const float * g, size_t n, int gc, float opac_thr, return true; } +// Write an accumulated point cloud (xyz + rgb, no per-point covariance) as an +// antimatter15 .splat, each point a small isotropic gaussian. Caps to max_splats +// by stride-subsampling. scale_frac sets the splat radius as a fraction of the +// cloud's rms extent. +static bool write_cloud_splat(const free_splatter_point * pts, size_t n, size_t max_splats, + float scale_frac, const char * path) { + // rms extent about the centroid -> per-point isotropic scale + double m[3] = {0,0,0}; + for (size_t i = 0; i < n; i++) { m[0]+=pts[i].x; m[1]+=pts[i].y; m[2]+=pts[i].z; } + if (n) { m[0]/=n; m[1]/=n; m[2]/=n; } + double ss = 0; + for (size_t i = 0; i < n; i++) { const double dx=pts[i].x-m[0], dy=pts[i].y-m[1], dz=pts[i].z-m[2]; ss += dx*dx+dy*dy+dz*dz; } + const float ext = (float) std::sqrt(ss / (n ? n : 1)); + const float scale = std::max(scale_frac * ext, 1e-6f); + + const size_t stride = (max_splats > 0 && n > max_splats) ? (n + max_splats - 1) / max_splats : 1; + std::ofstream f(path, std::ios::binary); + if (!f) { std::fprintf(stderr, "cannot write %s\n", path); return false; } + auto u8 = [](float v) -> unsigned char { float t = v < 0 ? 0 : v > 255 ? 255 : v; return (unsigned char) t; }; + size_t written = 0; + for (size_t i = 0; i < n; i += stride) { + // OpenCV (y down, z forward) -> viewer OpenGL (y up): negate y,z. + float pos[3] = { pts[i].x, -pts[i].y, -pts[i].z }; + float sc[3] = { scale, scale, scale }; + unsigned char rgba[4] = { u8(pts[i].r*255.0f), u8(pts[i].g*255.0f), u8(pts[i].b*255.0f), 255 }; + unsigned char rot[4] = { 255, 128, 128, 128 }; // identity quaternion (w,x,y,z) + f.write((const char *) pos, 12); + f.write((const char *) sc, 12); + f.write((const char *) rgba, 4); + f.write((const char *) rot, 4); + written++; + } + std::printf("wrote %s: %zu splats (of %zu cloud points)\n", path, written, n); + return true; +} + static int usage(const char * a0) { std::fprintf(stderr, "usage: %s [--device DEV] [--splat OUT.splat] [--out OUT.f32]\n" " [--opacity-threshold T] [--max-splats N] [--dump-taps DIR]\n" - " MODEL.gguf (IMAGES... | INPUT.f32)\n", a0); + " MODEL.gguf (IMAGES... | INPUT.f32)\n" + "\n" + "accumulate mode (>=2 images -> one world from the photo stream):\n" + " %s --accumulate [--splat-prefix P] [--fuse] [--voxel V] [--fuse-k K]\n" + " [--splat-scale S] [--max-splats N] MODEL.gguf IMG0 IMG1 IMG2 ...\n" + " runs the engine on each consecutive pair, chains the runs (Sim(3)) into\n" + " one accumulating cloud; writes P_.splat after each pair (the\n" + " evolving reconstruction) and, with --fuse, a consensus-fused P_fused.splat.\n", + a0, a0); return 2; } int main(int argc, char ** argv) { const char * device = nullptr, * taps_dir = nullptr, * out_path = nullptr, * splat_path = nullptr; - const char * model = nullptr; + const char * model = nullptr, * splat_prefix = nullptr; float opac_thr = 5e-3f; long max_splats = 0; + bool accumulate = false, fuse = false; + float voxel = 0.02f, splat_scale = 0.0015f; + int fuse_k = 2; std::vector inputs; for (int i = 1; i < argc; i++) { @@ -116,6 +163,12 @@ int main(int argc, char ** argv) { else if (a == "--splat" && i+1 < argc) splat_path = argv[++i]; else if (a == "--opacity-threshold" && i+1=2 images\n"); free_splatter_free(ctx); return 2; } + const int sz = geo.image_width; + // decode every frame once (CHW per view) + std::vector> frames; + for (const std::string & p : inputs) { + std::vector img; + if (!load_image_chw(p.c_str(), sz, img)) { free_splatter_free(ctx); return 1; } + frames.push_back(std::move(img)); + } + free_splatter_accumulator * acc = free_splatter_accumulator_new(sz, sz, opac_thr); + if (!acc) { std::fprintf(stderr, "accumulator alloc failed\n"); free_splatter_free(ctx); return 1; } + + const int64_t per_view = (int64_t) geo.in_channels * sz * sz; + for (size_t k = 0; k + 1 < frames.size(); k++) { + std::vector pair(2 * per_view); + std::memcpy(&pair[0], frames[k].data(), per_view * sizeof(float)); + std::memcpy(&pair[per_view], frames[k+1].data(), per_view * sizeof(float)); + float * g = nullptr; size_t ng = 0; + if (free_splatter_run(ctx, pair.data(), 2, sz, sz, &g, &ng) != 0) { + std::fprintf(stderr, "run pair %zu failed: %s\n", k, free_splatter_last_error(ctx)); + free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; + } + free_splatter_accumulator_add_pair(acc, g, geo.gaussian_channels); + free_splatter_buf_free(g); + const int nframes = free_splatter_accumulator_frame_count(acc); + std::printf("pair %zu (%s, %s) -> %d frames\n", k, + inputs[k].c_str(), inputs[k+1].c_str(), nframes); + if (splat_prefix) { + free_splatter_point * cloud = nullptr; size_t nc = 0; + free_splatter_accumulator_cloud(acc, &cloud, &nc); + char path[1024]; + std::snprintf(path, sizeof path, "%s_%d.splat", splat_prefix, nframes); + write_cloud_splat(cloud, nc, (size_t) max_splats, splat_scale, path); + free_splatter_buf_free(cloud); + } + } + if (fuse && splat_prefix) { + free_splatter_point * fc = nullptr; size_t nf = 0; + free_splatter_accumulator_fuse(acc, voxel, fuse_k, &fc, &nf); + char path[1024]; + std::snprintf(path, sizeof path, "%s_fused.splat", splat_prefix); + write_cloud_splat(fc, nf, (size_t) max_splats, splat_scale, path); + free_splatter_buf_free(fc); + } + free_splatter_accumulator_free(acc); + free_splatter_free(ctx); + return 0; + } + // assemble input: one raw .f32, or several decoded images std::vector buf; int32_t n_views = 0; From 3408afc1588ce7e8f1eff63b7a6e167e04052045 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 16:00:56 +0100 Subject: [PATCH 14/33] pose/: delete the Python prototype -- fully ported to C++ and shipped Per CLAUDE.md: the pose/ research prototype (focal/align/pnp + accumulate/loop/ fuse + the re10k/T&T validation harnesses) proved the accumulating-reconstruction approach. That whole pipeline is now rewritten in C++ (src/pose.{h,cpp}), exposed via free_splatter-cli + include/free_splatter.h with no Python, and validated (asset-free golden tests + real-data parity recorded in the prior commits). The prototype was a throwaway, not a parallel implementation to maintain -- so it is removed. Git history preserves it and its layer-by-layer parity harnesses. Updates the dangling references (CLAUDE.md, src/linalg.h, src/pose.{h,cpp}) to point at git history instead of the deleted files. Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 30 ++-- pose/README.md | 226 ----------------------------- pose/_upstream.py | 107 -------------- pose/accumulate.py | 157 --------------------- pose/align.py | 234 ------------------------------ pose/check_cv2_parity.py | 73 ---------- pose/check_upstream_parity.py | 154 -------------------- pose/empirical.py | 83 ----------- pose/find_loop.py | 70 --------- pose/focal.py | 48 ------- pose/fuse.py | 134 ------------------ pose/loop_closure.py | 150 -------------------- pose/pnp.py | 245 -------------------------------- pose/re10k_control.py | 105 -------------- pose/re10k_crossrun.py | 69 --------- pose/re10k_experiment.py | 76 ---------- pose/re10k_fetch.py | 84 ----------- pose/render_ply.py | 54 ------- pose/test_pose.py | 258 ---------------------------------- pose/tt_control.py | 124 ---------------- pose/tt_experiment.py | 79 ----------- src/linalg.h | 5 +- src/pose.cpp | 14 +- src/pose.h | 10 +- 24 files changed, 34 insertions(+), 2555 deletions(-) delete mode 100644 pose/README.md delete mode 100644 pose/_upstream.py delete mode 100644 pose/accumulate.py delete mode 100644 pose/align.py delete mode 100644 pose/check_cv2_parity.py delete mode 100644 pose/check_upstream_parity.py delete mode 100644 pose/empirical.py delete mode 100644 pose/find_loop.py delete mode 100644 pose/focal.py delete mode 100644 pose/fuse.py delete mode 100644 pose/loop_closure.py delete mode 100644 pose/pnp.py delete mode 100644 pose/re10k_control.py delete mode 100644 pose/re10k_crossrun.py delete mode 100644 pose/re10k_experiment.py delete mode 100644 pose/re10k_fetch.py delete mode 100644 pose/render_ply.py delete mode 100644 pose/test_pose.py delete mode 100644 pose/tt_control.py delete mode 100644 pose/tt_experiment.py diff --git a/CLAUDE.md b/CLAUDE.md index d4e3a71..15bc41a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -33,11 +33,14 @@ across all three variants, so object/object-2dgs are cheap follow-ons. 1. **Dev-time reference / conversion / validation** that runs in `docker/Dockerfile.cuda` (`scripts/hf_dump.py`, `convert.py`, `compare_taps.py`, …) — the only place torch runs; never a runtime dependency. - 2. **The `pose/` research prototype, TEMPORARILY.** It may continue in Python - (numpy + cv2) **only until the accumulating-reconstruction approach is - proven**. Once proven it is **rewritten in C++**, exposed via the CLI + C API, - and **the Python is deleted.** `pose/` is not wired into CMake/ctest and is a - throwaway prototype, not a parallel implementation to maintain. + 2. **The `pose/` research prototype — now DONE and DELETED.** It was the + temporary Python (numpy + cv2) prototype that proved the + accumulating-reconstruction approach. That approach is proven, the whole + pipeline (focal, Sim(3) align, robust PnP, accumulation, loop closure, + consensus fusion) is **rewritten in C++** (`src/pose.{h,cpp}`), exposed via + `free_splatter-cli` + `include/free_splatter.h`, and the Python prototype has + been **removed** (see git history for it and its layer-by-layer parity + harnesses). No Python remains in the pose path. ## Validation is the backbone (non-negotiable) @@ -71,15 +74,16 @@ This is a numerical port. Correctness means matching the PyTorch reference ## Per-component discipline -Each component (`image`, `gguf_loader`, `backend`, `model`, the head, and — once -ported to C++ — **`pose` = PnP + focal + Sim(3) alignment**) has its own unit test -and is made independently green **before** cross-component parity. Component -boundaries match the file layout. Keep the seam at the +Each component (`image`, `gguf_loader`, `backend`, `model`, the head, and +**`pose` = PnP + focal + Sim(3) alignment + accumulation/loop/fusion**) has its own +unit test and is made independently green **before** cross-component parity. +Component boundaries match the file layout. Keep the seam at the `[N,H,W,gaussian_channels]` tensor clean — that is the contract between the engine -and the pose/rendering consumers. The C++ `pose` component inherits the parity -discipline the Python prototype already established: **bit-exact to upstream -`estimate_poses`** (see `pose/check_upstream_parity.py`) and **validated against -independent ground-truth poses** (`pose/re10k_experiment.py`). +and the pose/rendering consumers. The C++ `pose` component (`tests/test_pose.cpp`, +asset-free golden tier) carries the parity discipline the Python prototype +established: **bit-exact to upstream `estimate_poses`** and **validated against +independent ground-truth poses** — see git history for the prototype's +`check_upstream_parity.py` / `re10k_experiment.py` harnesses. ## Debugging philosophy (mandatory sequence) diff --git a/pose/README.md b/pose/README.md deleted file mode 100644 index b5474ec..0000000 --- a/pose/README.md +++ /dev/null @@ -1,226 +0,0 @@ -# pose/ — camera recovery + cross-run alignment (downstream prototype) - -**Status: TEMPORARY Python prototype — to be rewritten in C++ once proven, then -deleted.** - -PnP pose recovery and cross-run alignment are now **in scope** (see CLAUDE.md). -The shipped implementation will be **C++**, reachable from `free_splatter-cli` and -the C API with **no Python dependency**. This directory is the throwaway research -prototype that proves the accumulating-reconstruction approach first: a -self-contained numpy + cv2 consumer of the engine's `[N, H, W, 23]` output, -**not wired into the CMake build or `ctest`**. Once the approach is proven it is -ported to C++ and this Python is removed — it is not a parallel implementation to -maintain. - -## C++ port status (the ship target — `src/pose.{h,cpp}`) - -The port has begun, dependency-free (only the self-contained `src/linalg.h` -Jacobi eigensolver — no Eigen/OpenCV), wired into the library and the asset-free -test tier (`tests/test_pose.cpp`, `ctest -LE model`): - -- ✅ **`focal.py` → `estimate_focal`** — Weiszfeld; **bit-exact** to the numpy - reference on a real scene dump (596.408591886 both). -- ✅ **`align.py` → Umeyama / RANSAC / residual-ladder / Sim(3) chaining / - `sim_frac_power`** — `sim_frac_power` is a closed-form Sim(3) one-parameter - subgroup (`A^f=s^f R^f`, translation `(A^f−I)(A−I)⁻¹t`), no complex eig needed. - All 9 golden tests (the mirror of `test_pose.py`) pass under ASan/UBSan. -- ✅ **Robust PnP → `solve_pnp` (EPnP + Gauss-Newton)** — `solve_pnp_numpy` (DLT + - RANSAC, via the 12×12 `AᵀA` nullspace) is kept as the asset-free golden-test - reference, but on real scenes the DLT inherits the textbook **planar/mirror - degeneracy** (seed-dependent: ~3/5 RANSAC seeds near cv2, ~2/5 a ~135–152° - flip). The shipped real-data solver is `solve_pnp`: **EPnP** (planar-robust, - non-iterative, uses all points so there is no random minimal-sample flip — - reuses the 12×12 `MᵀM` Jacobi eigendecomposition) for the init, then a - **Huber-robust Gauss-Newton** reprojection refine. On the real scene dump it is - **deterministic** (identical across all RANSAC seeds) and lands at **0.73° of - the upstream cv2/SQPNP** relative rotation (0.74° translation direction) — i.e. - **cv2-parity on real data with no OpenCV dependency**, versus the numpy DLT's - 175° miss on the same dump. Golden tests cover exact recovery, a near-planar - config (where DLT flips), and 15% gross-outlier rejection. -- ✅ **Accumulation chaining → `Accumulator` (accumulate.py)** — the live - sliding-window loop: feed each consecutive pair's `[2,H,W,gc]` engine output, - it recovers the pair's cameras (`estimate_poses`), fits the cross-run Sim(3) on - the shared frame, composes a global chain, and drops every new frame's gaussians - into one world (`cloud()`, `camera_path()`). Golden-tested (synthetic clip: - trajectory ATE 7.6e-8 of extent, per-link scale to 3e-9). **Real-data parity** - on the 13 cached `pair_*.f32` dumps vs the numpy/cv2 prototype: cloud size - **bit-exact** (2,633,725) and per-link valid% identical (deterministic mask); - per-link Sim(3) scale agrees to **mean 0.5%** (11/12 links <1%, worst on the - documented low-inlier leg); trajectory within **6.6%** of the cv2 chain — the - residual is the known RANSAC-RNG + EPnP-vs-SQPNP delta, not a port gap. -- ✅ **Loop closure → `sim4_invert` + `distribute_drift` (loop_closure.py)** — the - even Sim(3) relaxation: from an accumulated drift `D` (the loop-closure - measurement `P_n_loop · P_last⁻¹`), distribute `D^(k/n)` over the chain. - `sim_frac_power` (the closed-form one-parameter subgroup) is **numerically - identical to numpy's eig-based version (5e-10)** across f ∈ {−0.5 … 1.3}; - `sim4_invert` is an exact similarity inverse (1e-9); golden test recovers a - known drifted loop to 4e-16. **Real-data parity** on the cached loopcache (13 - chain pairs + `close_0_260`): the recovered drift matches the prototype's loop - error (scale 1.09 vs 1.12, **4.6° vs 4.4°**, README "4.4°"), deterministic - valid% identical; the corrected-trajectory delta is the (already-characterized) - EPnP-vs-cv2 PnP backend feeding `D`, since the distribution math itself is - bit-identical to numpy. Confirms the prototype's finding: on this short clip the - chain already closes, so loop closure is a near-identity correction. -- ✅ **Consensus fusion → `consensus_fuse` (fuse.py)** — voxelize the accumulated - cloud at `voxel_frac · extent` and keep only voxels corroborated by ≥ K distinct - source frames, averaging the agreeing predictions (the per-point `frame` tag the - `Accumulator` carries makes this a single hash-grid pass). Golden test pins the - counts exactly on controlled synthetic support. **Real-data parity** vs `fuse.py` - on the 13 `acc` dumps (voxel 0.02, K≥2): raw points **bit-exact** (2,633,725), - per-point floater drop **14.0% vs 14%**, raw→fused reduction 93.9% vs 94%, - kept-voxel fraction 53.8% vs 54% — the sub-1% voxel-count delta is the chaining - RANSAC-RNG shifting positions at voxel boundaries, not the fusion math. The - prototype's "remove the 14% single-frame edge-haze floaters" result reproduces. -- ⬜ Not yet ported: the CLI / C-API surface (then this prototype is deleted). - -## Why it's needed - -Each `free_splatter_run` over a different photo pair produces gaussians in its -**own** coordinate system: positions are predicted in the first view's camera -frame, then the whole scene is rescaled by `s = 1/mean(‖point‖)` (see upstream -`estimate_poses`). To stitch successive splats into one world you need: - -1. **PnP** — recover each view's camera from the per-pixel 2D↔3D correspondences - the gaussians hand you for free. -2. **Cross-run alignment** — because two runs normalize by a *different* `1/d`, - the shared content comes out at a different **scale**. Aligning run B into run - A is therefore a **similarity** (rotation + translation + one uniform scale, - 7-DoF), not a rigid transform. - -## Upstream recipe being mirrored (TencentARC/FreeSplatter) - -`freesplatter/models/model.py::estimate_poses` → DUSt3R-derived `fast_pnp` -(`utils/recon_util.py`): - -Verified against the cached upstream source (`.cache/upstream/{model,runner}.py`) -and the fetched `recon_util.py`, the **scene** recipe (the live use case) is: - -| step | upstream (scene path) | here | -|---|---|---| -| correspondences | pixel ↔ gaussian center `X` (ch 0:3) | `pnp.pixel_grid` (integer px) + `points` | -| validity mask (PnP) | `sigmoid(opacity) > 0.05` | `estimate_poses(opacity_threshold=)` (engine output is already activated) | -| focal | Weiszfeld, **view 0 only** (`use_first_focal=True`), **all pixels** (`masks=None`) | `estimate_poses(use_first_focal=True)` → `focal.estimate_focal` | -| pose | `cv2.solvePnPRansac(reprojErr=5, SOLVEPNP_SQPNP, iters=10)`, `cam2world=inv(world2cam)` | `pnp.solve_pnp_cv2` (exact) / `solve_pnp_numpy` (reference) | -| anchor | none — view 0 is ~identity by construction (gaussians live in view 0's frame) | `estimate_poses` returns raw poses | -| scene scale | runner rescales **camera centers** so the view0→view1 **baseline = 1**: `s = 1/(‖t₁−t₀‖+1e-2)` | `pnp.rescale_to_baseline` / `estimate_poses(normalize=True)` | - -> An earlier draft had three of these wrong (focal averaged over all views, an -> `inv(c2w[0])` re-anchor, and a `1/mean‖pts‖` scene scale). Driving the *actual* -> upstream `estimate_poses` on real engine output (see `check_upstream_parity.py`) -> surfaced and fixed all three — the focal one mattered: averaging a low-overlap -> second view dragged focal 507 vs the correct 596, a 15% error → 1.35° pose error. - -`solve_pnp_cv2` is the byte-for-byte upstream call (needs `opencv`, in the CUDA -container or the nix shell). `solve_pnp_numpy` is a DLT + RANSAC reference with no -third-party deps, verified against analytic ground truth so the pipeline runs -anywhere numpy is. - -## Files - -- `focal.py` — Weiszfeld shared-focal estimation (mirrors upstream `estimate_focal`). -- `pnp.py` — DLT/RANSAC + cv2 PnP backends, `estimate_poses` orchestration, and - `rescale_to_baseline` (the runner's scene normalization). -- `align.py` — Umeyama similarity fit, RANSAC variant, the **residual ladder** - (`diagnose`: rigid→similarity→affine + depth-correlation) that detects whether a - single uniform scale suffices or the mismatch is non-uniform, and similarity - chaining / loop-closure metrics. -- `_upstream.py` — vendored upstream kernels (`fast_pnp` verbatim, `estimate_focal` - numpy-transcribed) used ONLY by the parity check; not our code. -- `test_pose.py` — asset-free golden tests (no model, no fixtures, no cv2). -- `check_cv2_parity.py` — `solve_pnp_numpy` vs the exact `cv2.solvePnPRansac` on - synthetic ground truth (needs cv2). -- `check_upstream_parity.py` — our whole orchestration vs the upstream recipe on - REAL engine output (needs cv2 + a dumped `[N,H,W,23]` `.f32`). -- `empirical.py` — cross-run residual-ladder on real engine output (the scale test). - -### Dense GT-posed control (is it us, or the data/model?) - -- `tt_control.py` / `tt_experiment.py` — Tanks-and-Temples (NSVF) loader + engine - vs GT-pose check. **Verdict: T&T is OUT OF DISTRIBUTION** for FreeSplatter-scene - (narrow-FOV object orbits): opacity confident on only 8–17% of pixels, pose error - 28–145°. Kept as the harness + the negative result. -- `re10k_control.py` / `re10k_fetch.py` / `re10k_experiment.py` — RealEstate10K - loader (parser + GT geometry), yt-dlp/ffmpeg frame fetch, and the engine vs - GT-pose check. **In distribution → the control works**: relative pose recovered - to **0.4–1.5°** vs GT, opacity confident on 68–75% of pixels. -- `re10k_crossrun.py` — cross-run consistency vs baseline (the accumulation - question): a shared frame's two reconstructions agree on **98% of pixels within - 10%** of scene extent at small baseline. - -### Accumulation prototype (the live idea, assembled) - -- `accumulate.py` — slide a window over a clip, recover each pair (PnP), chain a - Sim(3) between consecutive runs via their shared-frame correspondences - (`align.compose`), drop every frame's gaussians into one world, and measure the - recovered camera trajectory vs GT. `render_ply.py` — pinhole projection of the - colored cloud to a PNG (visual coherence check). -- **Result (13 pairs, stride 20):** per-link Sim(3) residual ~1.0–1.4%; trajectory - ATE **11%** of extent with drift growing 7%→13% (monotone monocular scale drift, - 0.755 over 12 links); the 2.6M-point cloud renders as a **coherent room** matching - the input. The accumulating-reconstruction idea is **proven** end-to-end. -- `find_loop.py` — search re10k poses for a clip that revisits its start (the loop - substrate). `loop_closure.py` — chain open-loop, measure the loop error via a - closing pair, then distribute it (even Sim(3) relaxation, `align.sim_frac_power`) - and report ATE before/after. **Finding:** loop closure helps only when drift - *accumulates* into a large loop error. On the real loop clip tested it did NOT - help — the open-loop chain already closes (loop error 4.4° / scale 1.12 / 8% - trans), so the ~34% ATE is per-link **odometry noise** (inlier% as low as 17% - on fast legs) + the focal-bias warp, *self-consistent but distorted vs GT*, which - loop closure can't fix. The correction machinery itself is verified to recover - synthetic accumulated drift to ~0 (`test_pose.py::test_loop_correction`). -- `fuse.py` — **consensus fusion** (the edge-noise answer): voxelize the accumulated - cloud at the consistency scale and keep only voxels corroborated by ≥K distinct - frames, averaging the agreeing predictions. On the forward clip: **14% of points - are single-frame floaters** that render as incoherent edge-haze, while the ≥2-frame - consensus renders as a **clean, crisp room** with the haze gone. Tradeoff: dropping - single-frame points also trims the single-view periphery (coverage vs cleanliness). - -## Run the tests - -```sh -nix develop -c python3 pose/test_pose.py # golden, numpy only -nix develop -c python3 pose/check_cv2_parity.py # solver vs cv2 (needs cv2) -EMP_DIR=/path/to/dumps nix develop -c python3 \ - pose/check_upstream_parity.py A_scn.f32 # orchestration vs upstream -# dense control (needs the engine + a GGUF + network for re10k frames): -nix develop -c python3 pose/re10k_fetch.py CLIP.txt OUTDIR 10 -FS_DEVICE=cpu nix develop -c python3 \ - pose/re10k_experiment.py CLIP.txt OUTDIR 40 20,40,80,160 -``` - -## Status - -- ✅ **Solver parity** — `solve_pnp_numpy` ≡ `cv2.solvePnPRansac` (~1e-7 clean). -- ✅ **Orchestration parity** — our `estimate_poses` is **bit-exact** to upstream's - scene recipe on real engine output (focal/PnP/relative-pose all 0.00° at matched - precision; the only residual is upstream's float32-K vs our float64, a RANSAC - inlier-boundary effect of ≤0.5° on near-degenerate object data, 0° on scene). -- ✅ **Empirical scale test** — cross-run mismatch is a uniform-scale similarity - (scene ~11% scale drift), no nonlinear warp; `diagnose` classifies it. -- ✅ **Dense GT-posed control** — pipeline validated against INDEPENDENT GT on - in-distribution re10k (relative pose to 0.4–1.5°); T&T confirmed OOD. Finding: - the model has a constant wide-FOV focal bias (recovers ~274 vs GT ~439) — benign - for relative accumulation, off for metric scale. -- ✅ **Accumulation prototype** — sliding-window Sim(3) chaining over a clip builds - one coherent world; trajectory tracks GT to ~11% ATE with bounded-able drift. The - idea is proven; per CLAUDE.md the next implementation step is the **C++ port**. -- ✅ **Loop closure** — implemented + machinery verified on synthetic drift (recovers - to ~0). On a real loop clip it did NOT help: the chain already closes, so the error - is odometry noise + focal warp, not accumulated drift. Lesson: better odometry - (smaller baselines / fusion / focal) is the lever for short loops; loop closure - pays off on long trajectories with consistent accumulated drift. -- ✅ **Consensus fusion** — gating the accumulated cloud on ≥2-frame agreement removes - the 14% single-frame floaters (incoherent edge-haze) and yields a clean surface — - the definitive "yes, accumulation removes the edge noise." Trades single-view - periphery for a cleaner core. - -## Not done yet (honest) - -- The **focal bias** (recovered ~274 vs GT ~440+) — the remaining geometry-distortion - source; worth understanding whether it's fixable. -- A **higher-motion clip** wide-baseline sweep, and a **long** trajectory where loop - closure demonstrably pays off. -- **Fuse-then-align**: feed the consensus surface back into the odometry (lift the - low-inlier links the loop-closure diagnosis flagged), not just the final cloud. -- The **C++ port** (CLI + C API, no Python) once the design is locked — then this - prototype is deleted. diff --git a/pose/_upstream.py b/pose/_upstream.py deleted file mode 100644 index 97f7199..0000000 --- a/pose/_upstream.py +++ /dev/null @@ -1,107 +0,0 @@ -"""Vendored upstream reference for the parity check -- NOT our code. - -These are the two numerical kernels FreeSplatter's `estimate_poses` calls, copied -from TencentARC/FreeSplatter `freesplatter/utils/recon_util.py` -(https://raw.githubusercontent.com/TencentARC/FreeSplatter/main/freesplatter/utils/recon_util.py, -fetched 2026-06-26). They exist here ONLY so `check_upstream_parity.py` can run -the original algorithm against our `focal.py`/`pnp.py` on real engine output. - - * `fast_pnp` -- copied VERBATIM (it is cv2 + numpy only; runs as-is in nix). - * `estimate_focal_np` -- a faithful numpy transcription of upstream - `estimate_focal` (the original is torch; the math is identical -- integer - `xy_grid` pixels, nan_to_num, mean/mean init, 10 IRLS reweights, the - `focal_base` clip which is a no-op at the default min/max). Kept line-by-line - so any divergence is a real algorithmic difference, not a re-derivation. - -Do not "improve" or refactor this file; its value is being unmodified upstream. -""" -import cv2 -import numpy as np - - -def xy_grid(W, H): - """Upstream xy_grid (numpy branch): (H,W,2) int, [...,0]=col(x), [...,1]=row(y).""" - tw, th = np.arange(W), np.arange(H) - gx, gy = np.meshgrid(tw, th, indexing="xy") # both (H,W) - return np.stack([gx, gy], axis=-1) - - -def estimate_focal_np(pts3d, pp=None, mask=None, min_focal=0.0, max_focal=np.inf): - """Faithful numpy transcription of upstream recon_util.estimate_focal.""" - H, W, THREE = pts3d.shape - assert THREE == 3 - if pp is None: - pp = np.array([W / 2.0, H / 2.0], dtype=float) - pp = np.asarray(pp, float) - - pixels = xy_grid(W, H).reshape(-1, 2).astype(float) - pp.reshape(1, 2) # (HW,2) - pts = pts3d.reshape(H * W, 3).astype(float) - - if mask is not None: - m = np.asarray(mask).ravel().astype(bool) - assert len(m) == pts.shape[0] - pts = pts[m] - pixels = pixels[m] - - xy_over_z = pts[:, :2] / pts[:, 2:3] - xy_over_z = np.nan_to_num(xy_over_z, posinf=0.0, neginf=0.0) - - dot_xy_px = (xy_over_z * pixels).sum(axis=-1) - dot_xy_xy = (xy_over_z ** 2).sum(axis=-1) - focal = dot_xy_px.mean() / dot_xy_xy.mean() - - for _ in range(10): - dis = np.linalg.norm(pixels - focal * xy_over_z, axis=-1) - w = 1.0 / np.clip(dis, 1e-8, None) - focal = (w * dot_xy_px).mean() / (w * dot_xy_xy).mean() - - focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2)) - focal = float(np.clip(focal, min_focal * focal_base, max_focal * focal_base)) - return focal - - -def fast_pnp(pts3d, mask, focal=None, pp=None, niter_PnP=10, k_dtype=np.float32): - """VERBATIM copy of upstream recon_util.fast_pnp (cv2 + numpy). - - The default `k_dtype=np.float32` reproduces upstream byte-for-byte (upstream - writes `K = np.float32(...)`). `check_upstream_parity.py` passes float64 to - isolate the one precision choice that differs from our wrapper: on near- - threshold-dense object data the float32 K nudges a few hundred points across - the 5px inlier boundary, shifting the pose ~0.5deg. Matched precision -> exact.""" - H, W, _ = pts3d.shape - pixels = np.mgrid[:W, :H].T.astype(float) - - if focal is None: - S = max(W, H) - tentative_focals = np.geomspace(S / 2, S * 3, 21) - else: - tentative_focals = [focal] - - if pp is None: - pp = (W / 2, H / 2) - - best = 0, - for focal in tentative_focals: - K = np.array([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)], dtype=k_dtype) - - success, R, T, inliers = cv2.solvePnPRansac( - pts3d[mask], pixels[mask], K, None, - iterationsCount=niter_PnP, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP) - if not success: - continue - - score = len(inliers) - if success and score > best[0]: - best = score, R, T, focal - - if not best[0]: - return None - - _, R, T, best_focal = best - R = cv2.Rodrigues(R)[0] # world to cam - world2cam = np.eye(4).astype(float) - world2cam[:3, :3] = R - world2cam[:3, 3] = T.reshape(3) - cam2world = np.linalg.inv(world2cam) - - return best_focal, cam2world diff --git a/pose/accumulate.py b/pose/accumulate.py deleted file mode 100644 index decab3a..0000000 --- a/pose/accumulate.py +++ /dev/null @@ -1,157 +0,0 @@ -"""Sliding-window accumulation prototype -- assemble the validated pieces. - -Slide a window of overlapping pairs across a re10k clip: - pair k = (f_k, f_{k+1}) -> gaussians in f_k's camera frame (run k's world) -Consecutive runs share a frame (f_{k+1} is view1 of pair k and view0 of pair k+1), -so their per-pixel gaussians for that frame are exact correspondences. Fit a Sim(3) -S_k mapping run k's frame into run k-1's frame, and COMPOSE into a global transform - T_0 = I , T_k = T_{k-1} . S_k -that drops every run's gaussians into ONE world (frame f_0's). Accumulate them, -and measure the recovered camera trajectory against ground truth (drift). - -This is the live pipeline minus realtime: PnP (pnp.py) + Sim(3) align (align.py) + -chaining (align.compose) over a real sequence, justified by re10k_crossrun.py. - - FS_DEVICE=cpu nix develop -c python3 pose/accumulate.py CLIP.txt FRAME_DIR \\ - --start 0 --stride 20 --count 13 --cache DIR --ply out.ply - -Engine dumps are cached (pair_*.f32); re-runs skip inference. -""" -import os -import argparse -import numpy as np - -import re10k_control as rc -import re10k_fetch as rf -import re10k_experiment as ex # run_engine -import pnp -import align - -C0 = 0.28209479177387814 -THR = 0.05 - - -def load_dump(path): - a = np.fromfile(path, np.float32).reshape(2, 512, 512, 23) - pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)] - op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)] - rgb = [np.clip(a[v, ..., 3:6] * C0 + 0.5, 0, 1) for v in (0, 1)] - return pts, op, rgb - - -def write_ply(path, xyz, rgb): - n = len(xyz) - hdr = ("ply\nformat binary_little_endian 1.0\n" - f"element vertex {n}\n" - "property float x\nproperty float y\nproperty float z\n" - "property uchar red\nproperty uchar green\nproperty uchar blue\n" - "end_header\n") - rec = np.zeros(n, dtype=[("x", " global (frame f_0) --- - T = [align.identity()] - chain_info = [] - for k in range(1, a.count): - XA, oA = runs[k]["pts"][0].reshape(-1, 3), runs[k]["op"][0].reshape(-1) # f_k as view0 of run k - XB, oB = runs[k-1]["pts"][1].reshape(-1, 3), runs[k-1]["op"][1].reshape(-1) # f_k as view1 of run k-1 - m = (oA > THR) & (oB > THR) - scene = align._rms(XB[m] - XB[m].mean(0)) - s, R, t, inl = align.fit_similarity_ransac(XA[m], XB[m], thresh=0.02 * scene, iters=300) - S_k = (s, R, t) - T.append(align.compose(T[k-1], S_k)) - resid = align._rms(align.apply_sim(S_k, XA[m][inl]) - XB[m][inl]) / scene - chain_info.append((k, s, 100*inl.mean(), 100*m.mean(), 100*resid)) - - # --- accumulate gaussians into the global frame (each frame once) --- - XYZ, RGB = [], [] - def add(run_idx, view, Tk): - pts = runs[run_idx]["pts"][view].reshape(-1, 3) - op = runs[run_idx]["op"][view].reshape(-1) - rgb = runs[run_idx]["rgb"][view].reshape(-1, 3) - keep = op > THR - XYZ.append(align.apply_sim(Tk, pts[keep])); RGB.append(rgb[keep]) - add(0, 0, T[0]) # f_0 (view0 of pair 0) - for k in range(a.count): # f_{k+1} = view1 of pair k - add(k, 1, T[k]) - XYZ = np.concatenate(XYZ); RGB = np.concatenate(RGB) - - # --- recovered camera trajectory (global) vs GT --- - rec = [align.apply_sim(T[k], runs[k]["c2w"][0][:3, 3]) for k in range(a.count)] - rec.append(align.apply_sim(T[a.count-1], runs[a.count-1]["c2w"][1][:3, 3])) - rec = np.array(rec) - gt = np.array([frames[i]["c2w"][:3, 3] for i in idx]) - sA, RA, tA = align.fit_similarity(rec, gt) # align recovered traj -> GT - rec_al = align.apply_sim((sA, RA, tA), rec) - perr = np.linalg.norm(rec_al - gt, axis=1) - gt_ext = align._rms(gt - gt.mean(0)) - - # --- report --- - print("\nchain (per shared frame): step scale inlier% valid% resid%scene") - for k, s, inlp, vp, rp in chain_info: - print(f" {k:>3} {s:>6.3f} {inlp:>6.1f} {vp:>6.1f} {rp:>7.2f}") - total_scale = np.prod([ci[1] for ci in chain_info]) if chain_info else 1.0 - print(f"cumulative scale drift over {a.count-1} links: {total_scale:.3f} " - f"(per-link mean {total_scale**(1/max(a.count-1,1)):.4f})") - - print(f"\ntrajectory vs GT ({len(idx)} frames, after global Sim(3) align):") - print(f" ATE rms = {align._rms(rec_al-gt):.4f} ({100*align._rms(rec_al-gt)/gt_ext:.1f}% " - f"of GT extent {gt_ext:.4f})") - print(f" per-frame error % of extent: " + - " ".join(f"{100*e/gt_ext:.0f}" for e in perr)) - print(f" drift: first-half {100*perr[:len(perr)//2].mean()/gt_ext:.1f}% " - f"second-half {100*perr[len(perr)//2:].mean()/gt_ext:.1f}%") - - print(f"\naccumulated cloud: {len(XYZ):,} points (opacity>{THR})") - if a.ply: - if len(XYZ) > a.max_points: - sel = np.random.default_rng(0).choice(len(XYZ), a.max_points, replace=False) - XYZ, RGB = XYZ[sel], RGB[sel] - write_ply(a.ply, XYZ, RGB) - print(f" wrote {a.ply} ({len(XYZ):,} points)") - - -if __name__ == "__main__": - main() diff --git a/pose/align.py b/pose/align.py deleted file mode 100644 index 0beec07..0000000 --- a/pose/align.py +++ /dev/null @@ -1,234 +0,0 @@ -"""Coordinate-system normalization: align one inference's world into another's. - -DOWNSTREAM of the engine seam. Each free_splatter_run over a different photo -pair invents its OWN coordinate system (gaussians in the first view's camera -frame, then rescaled by s = 1/mean-point-distance -- see FreeSplatter -estimate_poses). Two runs that share a photo therefore describe the SAME surface -at a DIFFERENT scale, because each run's 1/d normalization is computed over a -different point support. So aligning run B into run A is a SIMILARITY transform -(rotation + translation + ONE uniform scale, 7 DoF), not a rigid one. - -This module fits that similarity in closed form (Umeyama 1991), provides a -RANSAC variant for floaters, and -- crucially -- a residual ladder that *detects* -whether a single uniform scale actually suffices or whether the mismatch is -non-uniform (focal error / affine depth ambiguity), which is the open question. - -Pure numpy. A similarity is represented as the tuple T = (s, R, t) acting on -column-stacked points by x -> s * R @ x + t. -""" -import numpy as np - - -def apply_sim(T, X): - """Apply T=(s,R,t) to X (N,3) -> (N,3).""" - s, R, t = T - return (s * (R @ X.T)).T + t - - -def _rms(resid): - """Root-mean-square point error of an (N,3) residual array.""" - return float(np.sqrt((resid ** 2).sum(axis=1).mean())) - - -def fit_similarity(X, Y, with_scale=True): - """Umeyama: best (s,R,t) minimizing ||s R X + t - Y||^2 in closed form. - - with_scale=False degenerates to rigid Kabsch (s fixed at 1). Reflections are - rejected (proper rotation, det R = +1) exactly as Umeyama prescribes. - """ - X = np.asarray(X, float) - Y = np.asarray(Y, float) - n = X.shape[0] - mx = X.mean(0) - my = Y.mean(0) - Xc = X - mx - Yc = Y - my - Sigma = (Yc.T @ Xc) / n # 3x3 cross-covariance, maps X -> Y - U, D, Vt = np.linalg.svd(Sigma) - S = np.eye(3) - if np.linalg.det(U) * np.linalg.det(Vt) < 0: # reflection guard - S[-1, -1] = -1.0 - R = U @ S @ Vt - if with_scale: - var_x = (Xc ** 2).sum() / n - s = float((D * np.diag(S)).sum() / var_x) if var_x > 0 else 1.0 - else: - s = 1.0 - t = my - s * (R @ mx) - return s, R, t - - -def fit_rigid(X, Y): - """Rigid (6-DoF) fit -- similarity with the scale DoF removed.""" - return fit_similarity(X, Y, with_scale=False) - - -def fit_affine(X, Y): - """Full affine (12-DoF) fit: Y ~= A X + b. Returns ((A,b), prediction). - - Used only as the next rung of the residual ladder: how much of the leftover - error after a similarity is explained by *non-uniform* (shear/anisotropic) - deformation -- the signature of focal error or affine-invariant depth. - """ - n = X.shape[0] - Xh = np.hstack([X, np.ones((n, 1))]) # (n,4) - M, *_ = np.linalg.lstsq(Xh, Y, rcond=None) # (4,3) - pred = Xh @ M - A = M[:3].T - b = M[3] - return (A, b), pred - - -def fit_similarity_ransac(X, Y, thresh, iters=300, with_scale=True, seed=0): - """RANSAC similarity: robust to gross outliers (floaters in the overlap). - - Minimal sample is 3 point pairs (a similarity has 7 DoF; 3 pairs give 9 - constraints). Returns (s,R,t, inlier_mask). - """ - rng = np.random.default_rng(seed) - n = len(X) - best = np.zeros(n, bool) - for _ in range(iters): - idx = rng.choice(n, 3, replace=False) - try: - T = fit_similarity(X[idx], Y[idx], with_scale) - except np.linalg.LinAlgError: - continue - res = np.linalg.norm(apply_sim(T, X) - Y, axis=1) - inl = res < thresh - if inl.sum() > best.sum(): - best = inl - if best.sum() < 3: # fall back to plain fit - best = np.ones(n, bool) - T = fit_similarity(X[best], Y[best], with_scale) - return (*T, best) - - -def residual_ladder(X, Y): - """Fit rigid -> similarity -> affine and report each RMS residual. - - The DROP from rigid->similarity is how much was pure uniform scale; the drop - from similarity->affine is how much non-uniform warp remains. depth_corr is - the Pearson correlation between the per-point similarity residual magnitude - and the point's distance from the centroid -- a positive value means the - leftover error GROWS with depth, the fingerprint of focal/projective error. - All residuals are absolute (same units as Y); compare against `scene`. - """ - scene = _rms(Y - Y.mean(0)) # overall extent of the target - - Tr = fit_rigid(X, Y) - r_rigid = _rms(apply_sim(Tr, X) - Y) - - Ts = fit_similarity(X, Y, with_scale=True) - res_sim = apply_sim(Ts, X) - Y - r_sim = _rms(res_sim) - - _, pred_aff = fit_affine(X, Y) - r_aff = _rms(pred_aff - Y) - - depth = np.linalg.norm(X - X.mean(0), axis=1) - rmag = np.linalg.norm(res_sim, axis=1) - depth_corr = float(np.corrcoef(depth, rmag)[0, 1]) if rmag.std() > 1e-12 else 0.0 - - return { - "scene": scene, - "rigid": r_rigid, - "similarity": r_sim, - "affine": r_aff, - "scale": float(Ts[0]), - "depth_corr": depth_corr, - } - - -def diagnose(X, Y, tol=1e-3, corr_tol=0.3): - """Classify the A<->B mismatch from the residual ladder. - - Residuals are judged relative to scene extent. Verdicts: - rigid_ok : a rigid transform already fits -- same scale (rare across runs) - needs_scale : a SIMILARITY fits but rigid does not -- the expected case - needs_affine: even similarity fails but affine fits -- non-uniform warp - nonlinear : affine fails too -- genuinely nonlinear (e.g. quadratic depth) - `depth_corr` is reported as corroborating evidence of structure. - """ - L = residual_ladder(X, Y) - sc = L["scene"] or 1.0 - rr, rs, ra = L["rigid"] / sc, L["similarity"] / sc, L["affine"] / sc - # Is the leftover after a similarity STRUCTURED (a real warp) or just noise? - # Two independent signatures: affine meaningfully beats similarity, OR the - # residual correlates with depth. Neither => unstructured noise floor. - aff_gain = (L["similarity"] - L["affine"]) / L["similarity"] if L["similarity"] > 0 else 0.0 - structured = abs(L["depth_corr"]) > corr_tol or aff_gain > 0.25 - if rr < tol: - v = "rigid_ok" # same scale AND pose: rare across runs - elif rs < tol: - v = "needs_scale" # a clean similarity fits: the expected case - elif ra < tol: - v = "needs_affine" # similarity fails, affine fits: non-uniform - elif structured: - v = "nonlinear" # even affine fails AND residual is structured - else: - v = "similarity_plus_noise" # similarity is the model; rest is noise floor - L["verdict"] = v - L["structured"] = bool(structured) - L["aff_gain"] = float(aff_gain) - return L - - -# --- chaining similarities along the photo stream -------------------------- - -def compose(T2, T1): - """Return T2 . T1 (apply T1 then T2).""" - s1, R1, t1 = T1 - s2, R2, t2 = T2 - return (s2 * s1, R2 @ R1, s2 * (R2 @ t1) + t2) - - -def invert(T): - """Inverse similarity.""" - s, R, t = T - si = 1.0 / s - Ri = R.T - return (si, Ri, -si * (Ri @ t)) - - -def identity(): - return (1.0, np.eye(3), np.zeros(3)) - - -# --- Sim(3) as 4x4 matrices (for pose-graph relaxation) -------------------- - -def sim_matrix(s, R, t): - """Sim(3) as a 4x4 homogeneous matrix [[sR, t],[0,1]] (compose == matmul).""" - M = np.eye(4) - M[:3, :3] = s * R - M[:3, 3] = t - return M - - -def sim_frac_power(M, f): - """Fractional power M^f of a Sim(3) 4x4, via eigendecomposition (real part). - - The even loop-closure relaxation: distribute an accumulated drift D over n - nodes by applying D^(k/n) at node k. M^0=I, M^1=M, (M^0.5)^2=M; valid while the - rotation is < 180deg (principal branch). Tested in test_pose.py.""" - w, V = np.linalg.eig(M) - return (V @ np.diag(w ** f) @ np.linalg.inv(V)).real - - -def loop_closure_error(transforms): - """Compose a closed loop of similarities and measure deviation from identity. - - transforms: list [T_{0->1}, T_{1->2}, ..., T_{k->0}]. With drift-free links - the product is identity; the returned dict quantifies residual scale, rotation - (deg) and translation drift -- a direct scale-drift / loop-closure metric. - """ - T = identity() - for Ti in transforms: - T = compose(Ti, T) - s, R, t = T - ang = np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1))) - return { - "scale_err": abs(np.log(s)), # 0 == no scale drift - "rot_deg": float(ang), - "trans": float(np.linalg.norm(t)), - } diff --git a/pose/check_cv2_parity.py b/pose/check_cv2_parity.py deleted file mode 100644 index 4f9e8f2..0000000 --- a/pose/check_cv2_parity.py +++ /dev/null @@ -1,73 +0,0 @@ -"""Verify the numpy DLT/RANSAC PnP against the EXACT upstream solver. - -FreeSplatter uses cv2.solvePnPRansac(reprojErr=5, SOLVEPNP_SQPNP). This compares -our dependency-free reference backend to that exact call on synthetic scenes with -known ground truth -- both must recover the true pose AND agree with each other. -Needs cv2 (nix develop, after opencv was added to the flake). - - nix develop -c python3 pose/check_cv2_parity.py -""" -import numpy as np -import pnp - -RNG = np.random.default_rng(7) -fails = [] - - -def rot_angle(R): - return np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1))) - - -def rand_small_pose(rng): - ax = rng.normal(size=3) - ax /= np.linalg.norm(ax) - ang = np.radians(rng.uniform(3, 20)) - Kx = np.array([[0, -ax[2], ax[1]], [ax[2], 0, -ax[0]], [-ax[1], ax[0], 0]]) - R = np.eye(3) + np.sin(ang) * Kx + (1 - np.cos(ang)) * Kx @ Kx - t = rng.uniform(-0.6, 0.6, size=3) - return R, t - - -def run(label, with_outliers): - f = 400.0 - K = np.array([[f, 0, 256.0], [0, f, 256.0], [0, 0, 1.0]]) - worst = {"np_R": 0, "np_t": 0, "cv_R": 0, "cv_t": 0, "agree_R": 0, "agree_t": 0} - for _ in range(10): - Pw = RNG.uniform([-2, -2, 4], [2, 2, 8], size=(500, 3)) - R, t = rand_small_pose(RNG) - Xc = (R @ Pw.T).T + t - if np.any(Xc[:, 2] <= 0): - continue - px = (K @ Xc.T).T - px = px[:, :2] / px[:, 2:3] - if with_outliers: - oi = RNG.choice(len(px), int(0.2 * len(px)), replace=False) - px[oi] += RNG.uniform(-120, 120, size=(len(oi), 2)) - - c2w_np, _ = pnp.solve_pnp_numpy(Pw, px, K, thresh_px=4.0, iters=300) - c2w_cv, _ = pnp.solve_pnp_cv2(Pw, px, K) - w_np, w_cv = np.linalg.inv(c2w_np), np.linalg.inv(c2w_cv) - worst["np_R"] = max(worst["np_R"], rot_angle(w_np[:3, :3] @ R.T)) - worst["np_t"] = max(worst["np_t"], np.linalg.norm(w_np[:3, 3] - t)) - worst["cv_R"] = max(worst["cv_R"], rot_angle(w_cv[:3, :3] @ R.T)) - worst["cv_t"] = max(worst["cv_t"], np.linalg.norm(w_cv[:3, 3] - t)) - worst["agree_R"] = max(worst["agree_R"], rot_angle(w_np[:3, :3] @ w_cv[:3, :3].T)) - worst["agree_t"] = max(worst["agree_t"], np.linalg.norm(w_np[:3, 3] - w_cv[:3, 3])) - - tolR, tolt = (0.5, 0.05) if with_outliers else (1e-3, 1e-4) - print(f"{label}: worst-of-10 " - f"numpy(R={worst['np_R']:.2e}deg t={worst['np_t']:.2e}) " - f"cv2(R={worst['cv_R']:.2e}deg t={worst['cv_t']:.2e}) " - f"agree(R={worst['agree_R']:.2e}deg t={worst['agree_t']:.2e})") - ok = (worst["np_R"] < tolR and worst["cv_R"] < tolR and - worst["agree_R"] < tolR and worst["agree_t"] < tolt) - if not ok: - fails.append(label) - return ok - - -print("PnP backend parity: numpy DLT/RANSAC vs upstream cv2.solvePnPRansac") -run("clean ", with_outliers=False) -run("outliers", with_outliers=True) -print("PARITY OK" if not fails else f"PARITY FAILED: {fails}") -raise SystemExit(1 if fails else 0) diff --git a/pose/check_upstream_parity.py b/pose/check_upstream_parity.py deleted file mode 100644 index 6fceb65..0000000 --- a/pose/check_upstream_parity.py +++ /dev/null @@ -1,154 +0,0 @@ -"""Verify our WHOLE pose orchestration against upstream, on REAL engine output. - -`check_cv2_parity.py` already pinned our solver wrapper to cv2.solvePnPRansac on -synthetic ground truth. This goes one level up: it drives the *entire* upstream -`estimate_poses` recipe (FreeSplatter scene path: use_first_focal, opacity mask -for PnP, no focal mask, pnp_iter=10 -- see .cache/upstream/{model,runner}.py) -using the vendored-verbatim upstream kernels in `_upstream.py`, and compares it -to our `focal.py`/`pnp.py` on the gaussians the C++ engine actually produced. - -Three levels, each isolating one stage so a divergence points at one file: - 1. focal kernel -- focal.estimate_focal vs _upstream.estimate_focal_np - 2. PnP solver -- pnp.solve_pnp_cv2 vs _upstream.fast_pnp (shared focal) - 3. full pipeline -- pnp.estimate_poses vs the upstream recipe replica -Plus the scene baseline-rescale (runner.run_freesplatter_scene) for reference. - - nix develop -c python3 pose/check_upstream_parity.py A_scn.f32 # scene (live case) - -Defaults to the scratchpad dumps if no path is given. Opacity in the engine -output is ALREADY sigmoid-activated, so mask = (opacity > thr) reproduces -upstream's sigmoid(raw) > thr exactly. -""" -import os -import sys -import numpy as np -import cv2 - -import focal -import pnp -import _upstream as up - -C, OPACITY = 23, 15 -THR = 0.05 -SEED = 0 # cv2.solvePnPRansac draws from the shared global cv::theRNG(); - # reseed identically before each call so RANSAC picks the SAME - # minimal samples -> isolates algorithmic parity from RNG state. -SCRATCH = os.environ.get("EMP_DIR", "").rstrip("/") -_fails = [] - - -def rot_angle(R): - return float(np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1)))) - - -def load(path, H=512, W=512): - a = np.fromfile(path, np.float32) - N = a.size // (H * W * C) - a = a.reshape(N, H, W, C) - pts = [a[i, ..., 0:3].astype(np.float64) for i in range(N)] # (H,W,3) each - op = [a[i, ..., OPACITY].astype(np.float64) for i in range(N)] # (H,W) each - return N, H, W, pts, op - - -def check(name, cond, detail=""): - tag = "PASS" if cond else "FAIL" - print(f" [{tag}] {name}" + (f" ({detail})" if detail else "")) - if not cond: - _fails.append(name) - - -def upstream_estimate_poses(pts, op, H, W, pnp_iter=10): - """Replicate FreeSplatter scene estimate_poses with vendored kernels. - - use_first_focal=True, masks=None: focal from view0 over ALL pixels; per-view - PnP masked by opacity>THR. Returns (c2ws list, focal).""" - pp = np.array([W / 2.0, H / 2.0]) - f = up.estimate_focal_np(pts[0], pp=pp, mask=None) # view0, all pixels - c2ws = [] - cv2.setRNGSeed(SEED) - for i in range(len(pts)): - mask = op[i] > THR # (H,W) bool - res = up.fast_pnp(pts[i], mask, focal=f, niter_PnP=pnp_iter, - k_dtype=np.float64) # matched precision (our wrapper is f64) - c2ws.append(res[1]) - return c2ws, f - - -def main(): - path = sys.argv[1] if len(sys.argv) > 1 else os.path.join(SCRATCH, "A_scn.f32") - if not os.path.isabs(path) and SCRATCH and not os.path.exists(path): - path = os.path.join(SCRATCH, path) - N, H, W, pts, op = load(path) - pp = np.array([W / 2.0, H / 2.0]) - print(f"=== upstream-parity on {os.path.basename(path)} ({N} views) ===") - for i in range(N): - print(f" view{i}: valid(opacity>{THR}) = {(op[i] > THR).mean()*100:.1f}%") - - # ---- Level 1: focal kernel (view0, all pixels) ---- - print("\n[1] focal kernel focal.estimate_focal vs upstream.estimate_focal_np") - grid = pnp.pixel_grid(H, W) # our pixel convention - f_up = up.estimate_focal_np(pts[0], pp=pp, mask=None) - f_ours = focal.estimate_focal(pts[0].reshape(-1, 3), grid, pp) - rel = abs(f_ours - f_up) / f_up - check("focal matches upstream (all pixels, view0)", rel < 1e-6, - f"ours {f_ours:.4f} vs up {f_up:.4f} rel {rel:.2e}") - - # ---- Level 2: PnP solver, shared upstream focal ---- - print("\n[2] PnP solver pnp.solve_pnp_cv2 vs upstream.fast_pnp (focal fixed)") - K = np.array([[f_up, 0, pp[0]], [0, f_up, pp[1]], [0, 0, 1.0]]) - for i in range(N): - mask = op[i] > THR - Xw = pts[i].reshape(-1, 3)[mask.ravel()] - px = grid[mask.ravel()] - cv2.setRNGSeed(SEED) - c2w_up = up.fast_pnp(pts[i], mask, focal=f_up, niter_PnP=10, - k_dtype=np.float64)[1] # matched precision - cv2.setRNGSeed(SEED) - c2w_ours, _ = pnp.solve_pnp_cv2(Xw, px, K, iters=10, reproj=5.0) - dR = rot_angle(c2w_ours[:3, :3] @ c2w_up[:3, :3].T) - dt = float(np.linalg.norm(c2w_ours[:3, 3] - c2w_up[:3, 3])) - check(f"view{i} cam2world matches upstream", dR < 1e-3 and dt < 1e-4, - f"dR {dR:.2e} deg dt {dt:.2e}") - # diagnostic: upstream's native float32 K (precision-only effect) - cv2.setRNGSeed(SEED) - c2w_f32 = up.fast_pnp(pts[i], mask, focal=f_up, niter_PnP=10)[1] - dR32 = rot_angle(c2w_ours[:3, :3] @ c2w_f32[:3, :3].T) - print(f" (upstream native float32 K: dR {dR32:.2e} deg -- " - f"RANSAC inlier-boundary precision, not orchestration)") - - # ---- Level 3: full pipeline ---- - print("\n[3] full pipeline pnp.estimate_poses vs upstream recipe") - up_c2ws, f_up2 = upstream_estimate_poses(pts, op, H, W, pnp_iter=10) - cv2.setRNGSeed(SEED) - out = pnp.estimate_poses(pts, op, opacity_threshold=THR, backend="cv2", - normalize=False, cam_frame_points=pts) - our_c2ws, f_our2 = out["cam2world"], out["focal"] - relf = abs(f_our2 - f_up2) / f_up2 - check("pipeline focal matches", relf < 1e-6, f"ours {f_our2:.4f} vs up {f_up2:.4f}") - # anchor-invariant relative pose view0->view1 (handles any re-anchor convention) - def rel_pose(c): - return np.linalg.inv(c[0]) @ c[1] - Rup, Rours = rel_pose(up_c2ws), rel_pose(our_c2ws) - dR = rot_angle(Rours[:3, :3] @ Rup[:3, :3].T) - # compare translation DIRECTION (scale/anchor independent) - tup, tour = Rup[:3, 3], Rours[:3, 3] - cos_t = float(tup @ tour / (np.linalg.norm(tup) * np.linalg.norm(tour) + 1e-12)) - check("relative-pose rotation matches", dR < 1e-2, f"dR {dR:.2e} deg") - check("relative-pose translation direction matches", cos_t > 1 - 1e-6, - f"cos {cos_t:.8f}") - - # ---- scene baseline rescale (runner), reference only ---- - baseline = float(np.linalg.norm(up_c2ws[1][:3, 3] - up_c2ws[0][:3, 3])) + 1e-2 - print(f"\n[ref] scene baseline rescale: 1/baseline = {1.0/baseline:.4f} " - f"(runner applies this to camera centers; our estimate_poses returns raw " - f"poses, normalize=False)") - - print() - if _fails: - print(f"UPSTREAM PARITY FAILED ({len(_fails)}): " + ", ".join(_fails)) - raise SystemExit(1) - print("UPSTREAM PARITY OK") - - -if __name__ == "__main__": - main() diff --git a/pose/empirical.py b/pose/empirical.py deleted file mode 100644 index bd91498..0000000 --- a/pose/empirical.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Empirical cross-run test on REAL engine output. - -A photo that appears in two separate inferences (an overlapping pair sharing it) -is reconstructed twice, in two different coordinate systems. Its per-pixel -gaussian centers from the two runs are the SAME physical points described two -ways -- exact correspondences. Fitting a similarity between them and reading the -residual ladder answers the open question on real data: is the cross-run mismatch -a clean uniform-scale similarity, or does it carry nonlinear warp? - -Usage: - nix develop -c python3 pose/empirical.py A.f32 viewA B.f32 viewB [label] - -A.f32/B.f32 are raw engine outputs [N,H,W,23]; viewA/viewB pick the shared photo -within each (e.g. it's view 1 of pair (000,004) and view 0 of pair (004,013)). -""" -import sys -import numpy as np -import align - -C = 23 -OPACITY = 15 - - -def load_view(path, view, H=512, W=512): - a = np.fromfile(path, np.float32) - N = a.size // (H * W * C) - a = a.reshape(N, H, W, C) - g = a[view] - return g[..., 0:3].reshape(-1, 3).astype(np.float64), g[..., OPACITY].reshape(-1) - - -def main(): - pA, vA, pB, vB = sys.argv[1], int(sys.argv[2]), sys.argv[3], int(sys.argv[4]) - label = sys.argv[5] if len(sys.argv) > 5 else "" - XA, oA = load_view(pA, vA) - XB, oB = load_view(pB, vB) - - print(f"=== {label} ===") - print(f"opacity A: [{oA.min():.3f},{oA.max():.3f}] mean {oA.mean():.3f} | " - f"B: [{oB.min():.3f},{oB.max():.3f}] mean {oB.mean():.3f}") - - thr = 0.05 - mask = (oA > thr) & (oB > thr) - A, B = XA[mask], XB[mask] - print(f"valid (opacity>{thr} in both): {mask.sum()}/{mask.size} " - f"({100*mask.mean():.1f}%)") - if mask.sum() < 100: - print("too few valid correspondences"); return - - scene = align._rms(A - A.mean(0)) - # robust fit B -> A; threshold = 2% of scene extent - s, R, t, inl = align.fit_similarity_ransac(B, A, thresh=0.02 * scene, iters=400) - print(f"scene extent (rms): {scene:.4f}") - print(f"robust similarity B->A: scale={s:.4f} inliers={inl.sum()}/{len(A)} " - f"({100*inl.mean():.1f}%)") - - # how cross-run-consistent is the WHOLE valid set under the robust fit? - res_all = np.linalg.norm(align.apply_sim((s, R, t), B) - A, axis=1) - print("cross-run consistency (all valid pixels, under robust similarity):") - for q in (0.01, 0.02, 0.05, 0.10): - print(f" within {100*q:4.0f}% of scene extent: " - f"{100*(res_all < q*scene).mean():5.1f}% of pixels") - - # residual ladder on the inlier (well-corresponded) surface - L = align.diagnose(B[inl], A[inl]) - sc = L["scene"] - print("residual ladder (inliers, RMS | % of scene extent):") - for k in ("rigid", "similarity", "affine"): - print(f" {k:11s} {L[k]:.5f} {100*L[k]/sc:6.2f}%") - print(f" recovered scale {L['scale']:.4f}") - print(f" depth_corr {L['depth_corr']:+.3f} (structured={L['structured']})") - print(f" VERDICT {L['verdict']}") - - # interpretation - drop_scale = (L["rigid"] - L["similarity"]) / sc - drop_aff = (L["similarity"] - L["affine"]) / sc - print(f"interpretation: rigid->similarity drop {100*drop_scale:.2f}% " - f"(=how much was uniform scale); similarity->affine drop " - f"{100*drop_aff:.2f}% (=non-uniform warp).") - - -if __name__ == "__main__": - main() diff --git a/pose/find_loop.py b/pose/find_loop.py deleted file mode 100644 index ab732cf..0000000 --- a/pose/find_loop.py +++ /dev/null @@ -1,70 +0,0 @@ -"""Search RealEstate10K poses for a LOOPING clip -- camera wanders away from its -start then returns near it (with a similar viewing direction, so the first and last -frames overlap). That overlap is the loop-closure constraint we need to test drift -correction. Pose-only: no video downloads until a candidate is chosen. - -A good loop: large max distance from the start (real motion) but small end-to-start -distance (returns), and small first<->last rotation (same view => the closing pair -(f_0, f_n) is a valid stereo pair the engine can reconstruct). - - nix develop -c python3 pose/find_loop.py [split] [limit] [--live N] -""" -import sys -import numpy as np - -import re10k_control as rc -import re10k_fetch as rf - - -def score(fr): - C = np.array([f["c2w"][:3, 3] for f in fr]) - d = np.linalg.norm(C - C[0], axis=1) - maxd = d.max() - if maxd < 1e-2: - return None - rotend = rc.rot_deg(rc.rel_pose(fr[0]["c2w"], fr[-1]["c2w"])) - return {"n": len(fr), "maxd": float(maxd), "endd": float(d[-1]), - "return_ratio": float(d[-1] / maxd), "rotend": rotend, - "loopiness": float(maxd / (d[-1] + 0.05 * maxd))} - - -def search(split="test", limit=None, min_frames=80, - max_return_ratio=0.25, max_rotend=45.0): - rows = [] - clips = rc.list_clips(split) - if limit: - clips = clips[:limit] - for cp in clips: - try: - url, fr = rc.parse_clip(cp) - except Exception: - continue - if len(fr) < min_frames: - continue - s = score(fr) - if not s or s["return_ratio"] > max_return_ratio or s["rotend"] > max_rotend: - continue - s["clip"], s["id"] = cp, rc.clip_video_id(url) - rows.append(s) - rows.sort(key=lambda r: r["loopiness"], reverse=True) - return rows - - -if __name__ == "__main__": - split = sys.argv[1] if len(sys.argv) > 1 and not sys.argv[1].startswith("-") else "test" - limit = int(sys.argv[2]) if len(sys.argv) > 2 and not sys.argv[2].startswith("-") else None - n_live = int(sys.argv[sys.argv.index("--live") + 1]) if "--live" in sys.argv else 0 - rows = search(split, limit) - print(f"{len(rows)} loop candidates (split={split}, limit={limit})") - print(f" {'loopiness':>9} {'n':>4} {'maxd':>6} {'ret%':>5} {'rotEnd':>6} id") - shown = 0 - for r in rows[:30]: - live = "" - if shown < n_live: - try: - live = " LIVE" if rf.is_live(rc.parse_clip(r["clip"])[0]) else " dead" - except Exception: - live = " ?" - shown += 1 - print(f" {r['loopiness']:>9.1f} {r['n']:>4} {r['maxd']:>6.2f} " - f"{100*r['return_ratio']:>5.0f} {r['rotend']:>6.1f} {r['id']}{live}") diff --git a/pose/focal.py b/pose/focal.py deleted file mode 100644 index 1e2ac27..0000000 --- a/pose/focal.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Shared focal-length estimation (Weiszfeld), faithful to FreeSplatter. - -Upstream (freesplatter/models/model.py::estimate_focal, via DUSt3R) assumes a -single shared focal across views, centered principal point, square pixels. Per -view it solves - - f* = argmin_f sum_ij || (i',j') - f * (X_ij0/X_ij2, X_ij1/X_ij2) || - -where (i',j') are pixel coords relative to the principal point and X is that -view's camera-frame point map (gaussian xyz). The robust minimizer is found by -the Weiszfeld algorithm (iteratively reweighted least squares, ~L1). Per-view -focals are then averaged into one shared value. - -Pure numpy -- no cv2 needed for this stage. -""" -import numpy as np - - -def estimate_focal(pts3d_cam, pixels, pp, weiszfeld_iters=10): - """Weiszfeld focal from camera-frame points and their pixels. - - pts3d_cam : (...,3) points in the view's OWN camera frame (Z>0 in front). - pixels : (...,2) pixel coordinates, same leading shape. - pp : (2,) principal point (image center). - """ - X = np.asarray(pts3d_cam, float).reshape(-1, 3) - P = np.asarray(pixels, float).reshape(-1, 2) - np.asarray(pp, float) - # ray directions; Z==0 -> 0 exactly as upstream estimate_focal (nan_to_num) - u = np.nan_to_num(X[:, :2] / X[:, 2:3], posinf=0.0, neginf=0.0) # (N,2) - - pu = (P * u).sum(1) # per-point p . u - uu = (u * u).sum(1) # per-point ||u||^2 - f = pu.sum() / uu.sum() # least-squares init - for _ in range(weiszfeld_iters): - r = np.linalg.norm(P - f * u, axis=1) - w = 1.0 / np.maximum(r, 1e-8) - f = (w * pu).sum() / (w * uu).sum() - return float(f) - - -def estimate_shared_focal(views, pp, weiszfeld_iters=10): - """Average per-view focals into one shared focal. - - views: list of (pts3d_cam, pixels) pairs. Mirrors upstream's - `focals = focals.mean().repeat(N)`. - """ - fs = [estimate_focal(p3, px, pp, weiszfeld_iters) for (p3, px) in views] - return float(np.mean(fs)), fs diff --git a/pose/fuse.py b/pose/fuse.py deleted file mode 100644 index df7b1f7..0000000 --- a/pose/fuse.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Consensus fusion -- remove the partner-dependent floaters (the edge noise). - -Each per-pixel gaussian is conditioned on its partner view, so occlusion-edge / -depth-ambiguous points land differently per run = floaters. A REAL surface point, -though, is reconstructed by several overlapping frames and they agree in the global -frame. So: voxelize the accumulated cloud at the consistency scale and keep only -voxels corroborated by >= K DISTINCT frames, averaging the agreeing predictions -(which also denoises the surface). Singletons / scattered floaters are dropped. - -This is the direct answer to "does accumulation remove the edge noise?" -- yes, once -you gate on cross-frame consensus. Reuses cached engine dumps from accumulate.py -(no new inference). - - nix develop -c python3 pose/fuse.py CLIP FRAME_DIR --cache DIR \\ - --stride 20 --count 13 --voxel 0.02 --k 2 --ply-raw raw.ply --ply-fused fused.ply -""" -import os -import argparse -import numpy as np - -import align - -THR = 0.05 -C0 = 0.28209479177387814 - - -def load(dump): - a = np.fromfile(dump, np.float32).reshape(2, 512, 512, 23) - pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)] - op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)] - rgb = [np.clip(a[v, ..., 3:6] * C0 + 0.5, 0, 1) for v in (0, 1)] - return pts, op, rgb - - -def fit_sim(X, Y): - scene = align._rms(Y - Y.mean(0)) - s, R, t, _ = align.fit_similarity_ransac(X, Y, thresh=0.02 * scene, iters=300) - return (s, R, t) - - -def write_ply(path, xyz, rgb): - n = len(xyz) - hdr = ("ply\nformat binary_little_endian 1.0\n" - f"element vertex {n}\nproperty float x\nproperty float y\nproperty float z\n" - "property uchar red\nproperty uchar green\nproperty uchar blue\nend_header\n") - rec = np.zeros(n, dtype=[("x", " global (same as accumulate.py) - T = [align.identity()] - for k in range(1, a.count): - XA = runs[k]["pts"][0].reshape(-1, 3); oA = runs[k]["op"][0].reshape(-1) - XB = runs[k-1]["pts"][1].reshape(-1, 3); oB = runs[k-1]["op"][1].reshape(-1) - m = (oA > THR) & (oB > THR) - T.append(align.compose(T[k-1], fit_sim(XA[m], XB[m]))) - - # accumulate every frame's gaussians, tagged with a source-frame id - XYZ, RGB, FR = [], [], [] - def add(run, view, Tk, fid): - op = runs[run]["op"][view].reshape(-1) - keep = op > THR - XYZ.append(align.apply_sim(Tk, runs[run]["pts"][view].reshape(-1, 3)[keep])) - RGB.append(runs[run]["rgb"][view].reshape(-1, 3)[keep]) - FR.append(np.full(int(keep.sum()), fid, np.int64)) - add(0, 0, T[0], 0) - for k in range(a.count): - add(k, 1, T[k], k + 1) - XYZ = np.concatenate(XYZ); RGB = np.concatenate(RGB); FR = np.concatenate(FR) - - # voxel multi-frame consensus - ext = align._rms(XYZ - XYZ.mean(0)) - v = a.voxel * ext - ijk = np.floor((XYZ - XYZ.min(0)) / v).astype(np.int64) - uniq, inv = np.unique(ijk, axis=0, return_inverse=True) - G = len(uniq) - # distinct source-frames per voxel - pairs = np.unique(np.stack([inv, FR], 1), axis=0) - support = np.bincount(pairs[:, 0], minlength=G) - cnt = np.bincount(inv, minlength=G).astype(float) - mean = np.stack([np.bincount(inv, XYZ[:, c], minlength=G) / cnt for c in range(3)], 1) - mrgb = np.stack([np.bincount(inv, RGB[:, c], minlength=G) / cnt for c in range(3)], 1) - keep = support >= a.k - - print(f"=== consensus fusion voxel={a.voxel:.3f}*extent K>={a.k} ===") - print(f"raw points: {len(XYZ):,} voxels: {G:,}") - hist = np.bincount(support)[:6] - print("voxels by distinct-frame support: " + - " ".join(f"{i}f:{hist[i] if i < len(hist) else 0:,}" for i in range(1, 6)) + - f" 6+f:{(support >= 6).sum():,}") - print(f"FUSED points (>= {a.k} frames): {keep.sum():,} " - f"({100*keep.sum()/G:.0f}% of voxels, {100*(1-keep.sum()/len(XYZ)):.0f}% point reduction)") - pkeep = keep[inv] # per-point consensus mask - print(f"per-point: kept {pkeep.sum():,} floaters dropped {(~pkeep).sum():,} " - f"({100*(~pkeep).mean():.0f}%)") - if a.ply_raw: - write_ply(a.ply_raw, XYZ, RGB); print(f" raw -> {a.ply_raw}") - if a.ply_fused: - write_ply(a.ply_fused, mean[keep], mrgb[keep]); print(f" fused -> {a.ply_fused}") - if a.ply_kept: - write_ply(a.ply_kept, XYZ[pkeep], RGB[pkeep]); print(f" kept -> {a.ply_kept}") - if a.ply_floaters: - write_ply(a.ply_floaters, XYZ[~pkeep], RGB[~pkeep]); print(f" floaters -> {a.ply_floaters}") - - -if __name__ == "__main__": - main() diff --git a/pose/loop_closure.py b/pose/loop_closure.py deleted file mode 100644 index 961af61..0000000 --- a/pose/loop_closure.py +++ /dev/null @@ -1,150 +0,0 @@ -"""Loop-closure drift correction on a LOOPING re10k clip (see find_loop.py). - -accumulate.py chains Sim(3) open-loop and the drift grows to the endpoint (~11% -ATE). When the camera RETURNS near its start, frame 0 and frame n overlap -- a -loop-closure constraint. Here we: - 1. chain open-loop -> per-frame global poses P_k, recovered trajectory, ATE vs GT. - 2. measure the loop -> run the closing pair (f_0, f_n), scale-aligned to run 0, - giving f_n's pose via the loop (P_n_loop), INDEPENDENT of the chain. - 3. the discrepancy D (P_n_loop = D . P_n) IS the accumulated drift (scale/rot/trans). - 4. distribute D over the chain (even Sim(3) relaxation, D^(k/n)) and show the - trajectory ATE and endpoint error collapse. - -Closes the drift story accumulate.py left open; exercises align's chaining on a -real loop. Poses are 4x4 similarity matrices [[sR,t],[0,1]] (compose = matmul). - - FS_DEVICE=cpu nix develop -c python3 pose/loop_closure.py CLIP FRAME_DIR \\ - --stride 20 --count 13 --cache DIR -""" -import os -import argparse -import numpy as np - -import re10k_control as rc -import re10k_fetch as rf -import re10k_experiment as ex -import pnp -import align - -THR = 0.05 -simmat = align.sim_matrix # Sim(3) 4x4 -matfrac = align.sim_frac_power # M^f for even loop-closure relaxation - - -def fit_sim_M(X, Y): - scene = align._rms(Y - Y.mean(0)) - s, R, t, inl = align.fit_similarity_ransac(X, Y, thresh=0.02 * scene, iters=300) - return simmat(s, R, t), inl - - -def decompose(M): - A = M[:3, :3] - s = float(np.linalg.det(A) ** (1.0 / 3.0)) - R = A / s - ang = float(np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1)))) - return s, ang, M[:3, 3] - - -def load(dump): - a = np.fromfile(dump, np.float32).reshape(2, 512, 512, 23) - pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)] - op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)] - return pts, op - - -def ate(C, gt): - sA, RA, tA = align.fit_similarity(C, gt) - Cal = align.apply_sim((sA, RA, tA), C) - return align._rms(Cal - gt), np.linalg.norm(Cal - gt, axis=1) - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("clip"); ap.add_argument("frame_dir") - ap.add_argument("--start", type=int, default=0) - ap.add_argument("--stride", type=int, default=20) - ap.add_argument("--count", type=int, default=13) - ap.add_argument("--cache", default=os.environ.get("SCRATCH", "/tmp")) - a = ap.parse_args() - - url, frames = rc.parse_clip(a.clip) - idx = [a.start + k * a.stride for k in range(a.count + 1)] - os.makedirs(a.cache, exist_ok=True) - fp = lambda i: os.path.join(a.frame_dir, f"f{i:04d}.png") - if any(not os.path.exists(fp(i)) for i in idx): - rf.fetch(a.clip, a.frame_dir, indices=idx) - print(f"=== loop closure: {os.path.basename(a.clip)} frames {idx[0]}..{idx[-1]} " - f"stride {a.stride} device={ex.DEVICE} ===\n{url}") - - # --- run / load chain pairs --- - runs = [] - for k in range(a.count): - dump = os.path.join(a.cache, f"pair_{idx[k]}_{idx[k+1]}.f32") - if not os.path.exists(dump): - ex.run_engine(fp(idx[k]), fp(idx[k + 1]), dump) - pts, op = load(dump) - c2w = pnp.estimate_poses(pts, op, normalize=False)["cam2world"] - runs.append({"pts": pts, "op": op, "c2w": c2w}) - - # --- open-loop chain: T[k] maps run k -> global --- - T = [np.eye(4)]; chain_diag = [] - for k in range(1, a.count): - XA = runs[k]["pts"][0].reshape(-1, 3); oA = runs[k]["op"][0].reshape(-1) - XB = runs[k-1]["pts"][1].reshape(-1, 3); oB = runs[k-1]["op"][1].reshape(-1) - m = (oA > THR) & (oB > THR) - S, inlk = fit_sim_M(XA[m], XB[m]); T.append(T[k-1] @ S) - sk, _, _ = decompose(S) - chain_diag.append((sk, 100 * inlk.mean(), 100 * m.mean())) - P = [T[k] @ runs[k]["c2w"][0] for k in range(a.count)] - P.append(T[a.count-1] @ runs[a.count-1]["c2w"][1]) # f_n from last run's view1 - centers = np.array([p[:3, 3] for p in P]) - gt = np.array([frames[i]["c2w"][:3, 3] for i in idx]) - ext = align._rms(gt - gt.mean(0)) - - # --- loop measurement: closing pair (f_0, f_n) --- - cdump = os.path.join(a.cache, f"close_{idx[0]}_{idx[-1]}.f32") - if not os.path.exists(cdump): - ex.run_engine(fp(idx[0]), fp(idx[-1]), cdump) - cpts, cop = load(cdump) - cc2w = pnp.estimate_poses(cpts, cop, normalize=False)["cam2world"] - # scale/pose-align the closing run's f_0 (view0) to run 0's f_0 (view0, == global) - XA = cpts[0].reshape(-1, 3); oA = cop[0].reshape(-1) - XB = runs[0]["pts"][0].reshape(-1, 3); oB = runs[0]["op"][0].reshape(-1) - m = (oA > THR) & (oB > THR) - close_valid = 100 * m.mean() - G, inl = fit_sim_M(XA[m], XB[m]) - P_n_loop = G @ cc2w[1] # f_n via the loop, in global - - # --- drift D (P_n_loop = D . P_n) and even distribution --- - D = P_n_loop @ np.linalg.inv(P[-1]) - s, ang, t = decompose(D) - n = a.count - centers_c = np.array([(matfrac(D, k / n) @ np.append(centers[k], 1.0))[:3] - for k in range(len(centers))]) - - # --- report --- - ate0, perr0 = ate(centers, gt) - ate1, perr1 = ate(centers_c, gt) - # diagnostics: is the open-loop chain a clean drift, and does it even return? - sA, RA, tA = align.fit_similarity(centers, gt) - pnl_al = align.apply_sim((sA, RA, tA), P_n_loop[:3, 3]) - print("\nchain per-link scale/inlier%/valid%: " + - " ".join(f"{c[0]:.2f}/{c[1]:.0f}/{c[2]:.0f}" for c in chain_diag)) - print(f"return-to-start dist: recovered {np.linalg.norm(centers[-1]-centers[0]):.3f} " - f"GT {np.linalg.norm(gt[-1]-gt[0]):.3f} (GT extent {ext:.3f})") - print(f"loop measurement P_n_loop endpoint err vs GT: " - f"{100*np.linalg.norm(pnl_al-gt[-1])/ext:.0f}% of extent") - print(f"\nclosing pair (f_{idx[0]}, f_{idx[-1]}): valid%={close_valid:.1f} " - f"inliers={100*inl.mean():.0f}%") - print(f"LOOP-CLOSURE ERROR (open-loop drift at the loop point):") - print(f" scale {s:.3f} ({100*abs(np.log(s)):.1f}% log) rotation {ang:.1f} deg " - f"translation {np.linalg.norm(t):.3f} ({100*np.linalg.norm(t)/ext:.0f}% of extent)") - print(f"\ntrajectory ATE vs GT (after global Sim(3) align):") - print(f" open-loop : {100*ate0/ext:5.1f}% of extent endpoint {100*perr0[-1]/ext:.0f}%") - print(f" closed : {100*ate1/ext:5.1f}% of extent endpoint {100*perr1[-1]/ext:.0f}%") - print(f" per-frame % (open ): " + " ".join(f"{100*e/ext:.0f}" for e in perr0)) - print(f" per-frame % (closed): " + " ".join(f"{100*e/ext:.0f}" for e in perr1)) - - -if __name__ == "__main__": - main() diff --git a/pose/pnp.py b/pose/pnp.py deleted file mode 100644 index 85f3dd8..0000000 --- a/pose/pnp.py +++ /dev/null @@ -1,245 +0,0 @@ -"""PnP camera recovery from the engine's per-pixel gaussians. - -Mirrors FreeSplatter's estimate_poses (freesplatter/models/model.py, which calls -the DUSt3R-derived fast_pnp in utils/recon_util.py): - - * correspondences : every pixel -> its predicted 3D gaussian center X (channels - 0:3) <-> its 2D pixel coordinate. - * validity mask : opacity > 0.05 (drops floaters before PnP). - * focal : Weiszfeld per view, averaged into one shared focal (focal.py). - * pose : cv2.solvePnPRansac(reprojectionError=5, SOLVEPNP_SQPNP, - iterationsCount=10); world2cam, then cam2world = inv(.); - OpenCV convention; view 0 comes out ~identity by construction - (gaussians are predicted in view 0's frame), no re-anchor. - * scene scale : upstream's `estimate_poses` does NOT normalize; the runner - (run_freesplatter_scene) rescales camera centers so the - view0->view1 baseline is 1.0 (s = 1/(||t1-t0||+1e-2)). See - `rescale_to_baseline`. (An earlier 1/mean||pts|| guess was - wrong -- verified against .cache/upstream/{model,runner}.py.) - -Two solver backends, selected by `backend`: - - "cv2" : the exact upstream call (needs opencv; available in the CUDA - container or after adding opencv to the nix shell). Use this for - numerical parity against the original. - - "numpy": a dependency-free DLT + RANSAC reference solver, so the pipeline and - its golden tests run anywhere numpy is present. Verified against - analytic ground truth in test_pose.py. - - "auto" : cv2 if importable, else numpy. - -Gaussian channel layout (scene, 23 ch): xyz[0:3] SH[3:15] opacity[15] scale[16:19] -rotation[19:23]; opacity is assumed already activated (sigmoid) in [0,1]. -""" -import numpy as np - -OPACITY_CHANNEL = 15 - - -# --- numpy reference PnP (known intrinsics) -------------------------------- - -def _dlt(Xw, xn): - """Direct Linear Transform for world2cam [R|t] from normalized image coords. - - Xw : (N,3) world points. xn : (N,2) normalized coords ( K^-1 [u v 1] ). - Returns the raw 3x4 matrix p (defined up to sign/scale); decode with _decode. - """ - N = len(Xw) - A = np.zeros((2 * N, 12)) - Xh = np.hstack([Xw, np.ones((N, 1))]) # (N,4) - A[0::2, 0:4] = Xh - A[0::2, 8:12] = -xn[:, 0:1] * Xh - A[1::2, 4:8] = Xh - A[1::2, 8:12] = -xn[:, 1:2] * Xh - _, _, Vt = np.linalg.svd(A) - return Vt[-1].reshape(3, 4) - - -def _decode(p, Xw, xn): - """Turn a DLT matrix into a proper (R,t) world2cam, resolving sign by - reprojection error and cheirality (points must sit in front of the camera).""" - best = None - for sgn in (1.0, -1.0): - M = sgn * p[:, :3] - tau = sgn * p[:, 3] - U, Sv, Vt = np.linalg.svd(M) - d = np.linalg.det(U @ Vt) - R = U @ np.diag([1, 1, d]) @ Vt # nearest proper rotation - lam = Sv.mean() # M ~= lam * R - if lam < 1e-12: - continue - t = tau / lam - Xc = (R @ Xw.T).T + t - if np.any(Xc[:, 2] <= 0): - err = np.inf - else: - proj = Xc[:, :2] / Xc[:, 2:3] - err = float(np.sqrt(((proj - xn) ** 2).sum(1)).mean()) - if best is None or err < best[0]: - best = (err, R, t) - if best is None: - raise np.linalg.LinAlgError("PnP decode failed (all-behind camera)") - return best[1], best[2], best[0] - - -def solve_pnp_numpy(Xw, pixels, K, thresh_px=5.0, iters=100, seed=0): - """RANSAC DLT PnP. Returns (cam2world 4x4, inlier_mask).""" - Kinv = np.linalg.inv(K) - uv1 = np.hstack([pixels, np.ones((len(pixels), 1))]) - xn = (Kinv @ uv1.T).T[:, :2] # normalized image coords - thresh = thresh_px / K[0, 0] # px tol -> normalized tol - - rng = np.random.default_rng(seed) - n = len(Xw) - best = np.zeros(n, bool) - for _ in range(iters): - idx = rng.choice(n, 6, replace=False) - try: - R, t, _ = _decode(_dlt(Xw[idx], xn[idx]), Xw[idx], xn[idx]) - except np.linalg.LinAlgError: - continue - Xc = (R @ Xw.T).T + t - good = Xc[:, 2] > 0 - proj = np.full((n, 2), 1e9) - proj[good] = Xc[good, :2] / Xc[good, 2:3] - inl = np.linalg.norm(proj - xn, axis=1) < thresh - if inl.sum() > best.sum(): - best = inl - if best.sum() < 6: - best = np.ones(n, bool) - R, t, _ = _decode(_dlt(Xw[best], xn[best]), Xw[best], xn[best]) - world2cam = np.eye(4) - world2cam[:3, :3] = R - world2cam[:3, 3] = t - return np.linalg.inv(world2cam), best - - -def solve_pnp_cv2(Xw, pixels, K, iters=20, reproj=5.0): - """Exact upstream call (needs cv2). Returns (cam2world, inlier_mask).""" - import cv2 - ok, rvec, tvec, inliers = cv2.solvePnPRansac( - Xw.astype(np.float64), pixels.astype(np.float64), K.astype(np.float64), - None, iterationsCount=iters, reprojectionError=reproj, - flags=cv2.SOLVEPNP_SQPNP) - if not ok: - raise RuntimeError("cv2.solvePnPRansac failed") - R, _ = cv2.Rodrigues(rvec) - world2cam = np.eye(4) - world2cam[:3, :3] = R - world2cam[:3, 3] = tvec[:, 0] - mask = np.zeros(len(Xw), bool) - if inliers is not None: - mask[inliers[:, 0]] = True - return np.linalg.inv(world2cam), mask - - -def _have_cv2(): - try: - import cv2 # noqa: F401 - return True - except Exception: - return False - - -def solve_pnp(Xw, pixels, K, backend="auto", **kw): - if backend == "auto": - backend = "cv2" if _have_cv2() else "numpy" - if backend == "cv2": - return solve_pnp_cv2(Xw, pixels, K, **kw) - return solve_pnp_numpy(Xw, pixels, K, **kw) - - -# --- orchestration --------------------------------------------------------- - -def pixel_grid(H, W): - """(H*W, 2) INTEGER pixel coords [x=col, y=row], row-major -- matches upstream - xy_grid / np.mgrid[:W,:H].T exactly (NO half-pixel offset). Row-major so it - lines up with a C-order (H,W,3) gaussian map flattened by reshape(-1,3).""" - xs, ys = np.meshgrid(np.arange(W), np.arange(H)) # both (H,W): xs=col, ys=row - return np.stack([xs.ravel(), ys.ravel()], axis=1).astype(float) - - -def rescale_to_baseline(cam2world): - """Scene normalization from runner.run_freesplatter_scene: rescale camera - centers so the view0->view(last) baseline is 1.0. Returns (list, scale_factor). - - Mirrors upstream exactly: baseline = ||t_last - t_0|| + 1e-2; s = 1/baseline; - every camera translation *= s (rotations untouched).""" - t0 = cam2world[0][:3, 3] - baseline = float(np.linalg.norm(cam2world[-1][:3, 3] - t0)) + 1e-2 - s = 1.0 / baseline - out = [] - for c in cam2world: - c = c.copy() - c[:3, 3] *= s - out.append(c) - return out, s - - -def estimate_poses(points, opacities=None, focal=None, pp=None, - opacity_threshold=0.05, backend="auto", cam_frame_points=None, - use_first_focal=True, focal_use_opacity_mask=False, - pnp_iter=10, normalize=False): - """Faithful port of FreeSplatter's scene estimate_poses (.cache/upstream/model.py) - plus the runner's optional baseline rescale. - - Defaults mirror the scene recipe: focal from view 0 over ALL pixels - (use_first_focal=True, masks=None), per-view PnP masked by opacity>thr, - pnp_iter=10, cam2world=inv(world2cam). estimate_poses returns the RAW poses - (view 0 comes out ~identity because the gaussians live in view 0's frame -- no - explicit re-anchor, matching upstream). Set normalize=True for the runner's - 1/baseline camera rescale (the scene viewer normalization). - - points : list of (H,W,3) gaussian-center maps (view 0 frame). - opacities : list of (H,W) activated opacities for the PnP mask (or None). - cam_frame_points: per-view points in each view's OWN frame for focal. With - use_first_focal only view 0 is used, and view 0's frame IS the - world frame, so points[0] suffices; pass None to use `points`. - focal_use_opacity_mask: scene=False (all pixels, faithful); object would pass an - alpha mask -- approximated here by the opacity mask if True. - Returns dict: cam2world (list of 4x4), focal, scale. - """ - from focal import estimate_focal - - backend_cv2 = backend == "cv2" or (backend == "auto" and _have_cv2()) - - N = len(points) - H, W = points[0].shape[:2] - if pp is None: - pp = np.array([W / 2.0, H / 2.0]) - grid = pixel_grid(H, W) - - masks = [] - for i in range(N): - if opacities is None: - masks.append(np.ones(H * W, bool)) - else: - masks.append(opacities[i].ravel() > opacity_threshold) - - # focal: mirror estimate_focals -- view 0 only (use_first_focal), all pixels - # unless an opacity mask is explicitly requested. - if focal is None: - cfp = cam_frame_points if cam_frame_points is not None else points - n_focal = 1 if use_first_focal else N - fs = [] - for i in range(n_focal): - m = masks[i] if (focal_use_opacity_mask and opacities is not None) \ - else np.ones(H * W, bool) - fs.append(estimate_focal(cfp[i].reshape(-1, 3)[m], grid[m], pp)) - focal = float(np.mean(fs)) - - K = np.array([[focal, 0, pp[0]], [0, focal, pp[1]], [0, 0, 1.0]]) - - cam2world = [] - for i in range(N): - Xw = points[i].reshape(-1, 3)[masks[i]] - px = grid[masks[i]] - if backend_cv2: - c2w, _ = solve_pnp_cv2(Xw, px, K, iters=pnp_iter, reproj=5.0) - else: - c2w, _ = solve_pnp_numpy(Xw, px, K) - cam2world.append(c2w) - - scale = 1.0 - if normalize: - cam2world, scale = rescale_to_baseline(cam2world) - - return {"cam2world": cam2world, "focal": focal, "scale": scale} diff --git a/pose/re10k_control.py b/pose/re10k_control.py deleted file mode 100644 index fe389ea..0000000 --- a/pose/re10k_control.py +++ /dev/null @@ -1,105 +0,0 @@ -"""RealEstate10K loader -- the dense, GT-posed, IN-DISTRIBUTION control. - -FreeSplatter-scene is trained on RealEstate10K-style wide-FOV walkthroughs, so -(unlike Tanks-and-Temples, which is out of distribution) the model is known-good -here. Each clip .txt is: - - line 1 : YouTube URL - line k : timestamp(us) fx fy cx cy 0 0 m00 m01 m02 m03 m10 ... m23 - ^int ^normalized intrinsics ^3x4 world->camera, row-major - -Intrinsics are resolution-independent (fx normalized by width, fy by height). -Frames come from the YouTube video at the given timestamps (yt-dlp + ffmpeg). - -Camera convention: the pose is world->camera extrinsic P=[R|t] (KP maps a world -point to the image), OpenCV-style. c2w = inv([[R,t],[0,0,0,1]]). We VALIDATE this -convention empirically by checking that GT relative rotation agrees with the pose -FreeSplatter recovers on the same pair (the model is accurate in-distribution). - -GT focal after our 512x512 preprocessing (center-crop to height, resize): because -fy is normalized by height and we crop to the full height then scale to 512, - focal_512 = fy * 512 (independent of source resolution). - -Pure numpy for parsing; frame extraction shells out to yt-dlp/ffmpeg. -""" -import os -import subprocess -import numpy as np - -HERE = os.path.dirname(os.path.abspath(__file__)) -RE10K = os.environ.get("RE10K_ROOT", os.path.join(HERE, "..", ".cache", "re10k", "RealEstate10K")) - - -def parse_clip(path): - """Return (url, frames). frames: list of dicts {ts, fx,fy,cx,cy, w2c(3x4), c2w(4x4)}.""" - with open(path) as f: - lines = [ln.strip() for ln in f if ln.strip()] - url = lines[0] - frames = [] - for ln in lines[1:]: - v = ln.split() - ts = int(v[0]) - fx, fy, cx, cy = (float(x) for x in v[1:5]) - # v[5], v[6] are zeros (distortion); v[7:19] are the 3x4 world->cam - P = np.array([float(x) for x in v[7:19]], float).reshape(3, 4) - w2c = np.eye(4) - w2c[:3, :4] = P - c2w = np.linalg.inv(w2c) - frames.append({"ts": ts, "fx": fx, "fy": fy, "cx": cx, "cy": cy, - "w2c": w2c, "c2w": c2w}) - return url, frames - - -def gt_focal_512(frame): - """GT focal in the engine's 512x512 input (center-crop to height + resize).""" - return frame["fy"] * 512.0 - - -def rel_pose(c2w_a, c2w_b): - return np.linalg.inv(c2w_a) @ c2w_b - - -def rot_deg(R): - return float(np.degrees(np.arccos(np.clip((np.trace(R[:3, :3]) - 1) / 2, -1, 1)))) - - -def list_clips(split="test"): - d = os.path.join(RE10K, split) - return sorted(os.path.join(d, f) for f in os.listdir(d) if f.endswith(".txt")) - - -def clip_video_id(url): - """YouTube id from the clip header URL.""" - for key in ("watch?v=", "youtu.be/", "/embed/"): - if key in url: - return url.split(key)[1][:11] - return url.rsplit("/", 1)[-1][:11] - - -def baseline_report(clip_path): - url, fr = parse_clip(clip_path) - n = len(fr) - cen = np.array([f["c2w"][:3, 3] for f in fr]) - span = float(np.linalg.norm(cen - cen.mean(0), axis=1).mean()) - print(f"=== {os.path.basename(clip_path)} ({n} frames) {url} ===") - print(f"video id: {clip_video_id(url)} duration: " - f"{(fr[-1]['ts']-fr[0]['ts'])/1e6:.1f}s GT focal(512)~{gt_focal_512(fr[0]):.1f}") - print(f"camera-center spread (mean dist to centroid): {span:.4f}") - print(f" {'stride':>6} {'pairs':>6} {'rot(deg) med':>13} {'p10..p90':>16} " - f"{'baseline med':>13}") - for s in (1, 2, 5, 10, 20, 40): - rots, bl = [], [] - for i in range(0, n - s): - T = rel_pose(fr[i]["c2w"], fr[i + s]["c2w"]) - rots.append(rot_deg(T)); bl.append(np.linalg.norm(T[:3, 3])) - if not rots: - continue - rots, bl = np.array(rots), np.array(bl) - print(f" {s:>6} {len(rots):>6} {np.median(rots):>13.2f} " - f"{np.percentile(rots,10):>7.2f}..{np.percentile(rots,90):<7.2f} " - f"{np.median(bl):>13.4f}") - - -if __name__ == "__main__": - import sys - baseline_report(sys.argv[1]) diff --git a/pose/re10k_crossrun.py b/pose/re10k_crossrun.py deleted file mode 100644 index 422c1d5..0000000 --- a/pose/re10k_crossrun.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Cross-run consistency on IN-DISTRIBUTION re10k -- the accumulation question. - -A shared frame m is reconstructed by two overlapping pairs: - run A = (m-s, m) -> frame m is view 1 (gaussians in frame (m-s)'s world) - run B = (m, m+s) -> frame m is view 0 (gaussians in frame m's world) -Frame m's per-pixel gaussian centers from A and B are the SAME physical points in -two coordinate systems -- exact correspondences. Fit a robust similarity B->A and -read the residual ladder + how many pixels actually agree. High agreement => clean -accumulation; low => floaters dominate and we need consensus fusion. - -This mirrors empirical.py but on data the model handles well (re10k), and sweeps -the baseline stride s so we can see whether a sweet spot maximizes consistency. - - FS_DEVICE=cpu nix develop -c python3 pose/re10k_crossrun.py CLIP.txt FRAME_DIR 120 20,40,80 -""" -import os -import sys -import numpy as np - -import re10k_control as rc -import re10k_experiment as ex # run_engine -import align - -SCRATCH = os.environ.get("SCRATCH", "/tmp") -THR = 0.05 - - -def crossrun(clip, frame_dir, m, strides): - _, frames = rc.parse_clip(clip) - print(f"=== cross-run consistency: {os.path.basename(clip)} shared frame #{m} ===") - print(f" {'stride':>6} {'GTrot(i-j)':>10} {'valid%':>6} {'scale':>6} " - f"{'<1%':>5} {'<2%':>5} {'<5%':>5} {'<10%':>6} {'rigid%':>7} {'sim%':>6} " - f"{'aff%':>6} {'verdict':>20}") - fp = lambda i: os.path.join(frame_dir, f"f{i:04d}.png") - for s in strides: - i, j = m - s, m + s - if not all(os.path.exists(fp(k)) for k in (i, m, j)): - print(f" {s:>6} (missing a frame among {i},{m},{j})"); continue - ptsA, opA = ex.run_engine(fp(i), fp(m), os.path.join(SCRATCH, f"xrA_{m}_{s}.f32")) - ptsB, opB = ex.run_engine(fp(m), fp(j), os.path.join(SCRATCH, f"xrB_{m}_{s}.f32")) - XA, oA = ptsA[1].reshape(-1, 3), opA[1].reshape(-1) # m as view1 of A - XB, oB = ptsB[0].reshape(-1, 3), opB[0].reshape(-1) # m as view0 of B - mask = (oA > THR) & (oB > THR) - A, B = XA[mask], XB[mask] - if mask.sum() < 200: - print(f" {s:>6} (too few valid: {mask.sum()})"); continue - scene = align._rms(A - A.mean(0)) - sc, R, t, inl = align.fit_similarity_ransac(B, A, thresh=0.02 * scene, iters=400) - res = np.linalg.norm(align.apply_sim((sc, R, t), B) - A, axis=1) - within = {q: 100 * (res < q * scene).mean() for q in (0.01, 0.02, 0.05, 0.10)} - L = align.diagnose(B[inl], A[inl]) - gtrot = rc.rot_deg(rc.rel_pose(frames[i]["c2w"], frames[j]["c2w"])) - print(f" {s:>6} {gtrot:>10.2f} {100*mask.mean():>6.1f} {sc:>6.3f} " - f"{within[0.01]:>5.0f} {within[0.02]:>5.0f} {within[0.05]:>5.0f} " - f"{within[0.10]:>6.0f} {100*L['rigid']/L['scene']:>7.1f} " - f"{100*L['similarity']/L['scene']:>6.1f} {100*L['affine']/L['scene']:>6.1f} " - f"{L['verdict']:>20}") - print("\n accumulation\n" - "is clean; low => most per-pixel gaussians are partner-dependent floaters " - "(need consensus fusion).") - - -if __name__ == "__main__": - clip, frame_dir = sys.argv[1], sys.argv[2] - m = int(sys.argv[3]) if len(sys.argv) > 3 else 120 - strides = tuple(int(x) for x in sys.argv[4].split(",")) if len(sys.argv) > 4 \ - else (20, 40, 80) - crossrun(clip, frame_dir, m, strides) diff --git a/pose/re10k_experiment.py b/pose/re10k_experiment.py deleted file mode 100644 index a2712d3..0000000 --- a/pose/re10k_experiment.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Run FreeSplatter-scene on RealEstate10K pairs (IN-distribution) and check the -recovered relative pose + focal against ground truth, vs frame stride (baseline). - -Unlike Tanks-and-Temples (out of distribution -> garbage), the model is known-good -here, so this is a real control: low poseErr confirms our PnP against an INDEPENDENT -GT (re10k poses), and agreement validates the re10k camera convention. - -Frames must already be fetched into frame_dir as fNNNN.png (re10k_fetch.py); both -anchor and anchor+stride must be among the fetched indices. - - FS_DEVICE=cpu nix develop -c python3 pose/re10k_experiment.py CLIP.txt FRAME_DIR 40 20,40,80,160 -""" -import os -import sys -import subprocess -import numpy as np - -import re10k_control as rc -import pnp - -HERE = os.path.dirname(os.path.abspath(__file__)) -GGUF = os.environ.get("FS_GGUF", os.path.join(HERE, "..", ".cache", "freesplatter-scene-f16.gguf")) -CLI = os.environ.get("FS_CLI", os.path.join(HERE, "..", "build", "release", "bin", "free_splatter-cli")) -DEVICE = os.environ.get("FS_DEVICE", "cpu") -SCRATCH = os.environ.get("SCRATCH", "/tmp") - - -def run_engine(img0, img1, out): - r = subprocess.run([CLI, "--device", DEVICE, "--out", out, GGUF, img0, img1], - capture_output=True, text=True) - if r.returncode != 0: - raise RuntimeError(f"CLI failed: {r.stderr[-500:]}") - a = np.fromfile(out, np.float32).reshape(2, 512, 512, 23) - pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)] - op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)] - return pts, op - - -def cos(a, b): - return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12)) - - -def experiment(clip, frame_dir, anchor, strides): - url, frames = rc.parse_clip(clip) - f_gt = rc.gt_focal_512(frames[0]) - print(f"=== {os.path.basename(clip)} anchor #{anchor} GT focal(512)={f_gt:.1f} " - f"device={DEVICE} ===\n{url}") - print(f" {'stride':>6} {'GTrot':>6} {'valid%':>6} {'poseErr':>8} " - f"{'transDir':>8} {'scaleRat':>8} {'focal':>7} {'fErr%':>6}") - for s in strides: - j = anchor + s - img0 = os.path.join(frame_dir, f"f{anchor:04d}.png") - img1 = os.path.join(frame_dir, f"f{j:04d}.png") - if not (os.path.exists(img0) and os.path.exists(img1)): - print(f" {s:>6} (missing frame {anchor} or {j})"); continue - out = os.path.join(SCRATCH, f"re_{anchor}_{s}.f32") - pts, op = run_engine(img0, img1, out) - res = pnp.estimate_poses(pts, op, normalize=False) - c2w, f_est = res["cam2world"], res["focal"] - rel = np.linalg.inv(c2w[0]) @ c2w[1] - G = rc.rel_pose(frames[anchor]["c2w"], frames[j]["c2w"]) - poseErr = rc.rot_deg(rel[:3, :3] @ G[:3, :3].T) - tdir = cos(rel[:3, 3], G[:3, 3]) - srat = np.linalg.norm(rel[:3, 3]) / (np.linalg.norm(G[:3, 3]) + 1e-12) - validp = min((op[0] > 0.05).mean(), (op[1] > 0.05).mean()) * 100 - ferr = abs(f_est - f_gt) / f_gt * 100 - print(f" {s:>6} {rc.rot_deg(G):>6.2f} {validp:>6.1f} {poseErr:>8.2f} " - f"{tdir:>8.4f} {srat:>8.3f} {f_est:>7.1f} {ferr:>6.1f}") - - -if __name__ == "__main__": - clip, frame_dir = sys.argv[1], sys.argv[2] - anchor = int(sys.argv[3]) if len(sys.argv) > 3 else 40 - strides = tuple(int(x) for x in sys.argv[4].split(",")) if len(sys.argv) > 4 \ - else (20, 40, 80, 160) - experiment(clip, frame_dir, anchor, strides) diff --git a/pose/re10k_fetch.py b/pose/re10k_fetch.py deleted file mode 100644 index 94b1770..0000000 --- a/pose/re10k_fetch.py +++ /dev/null @@ -1,84 +0,0 @@ -"""Fetch a RealEstate10K clip's frames via yt-dlp + ffmpeg -- the standard re10k -workflow, since the dataset ships only poses + YouTube URLs (frames can't be -redistributed). Gives us a dense, in-distribution, GT-posed sequence to drive the -control. Frames are written as fNNNN.png (NNNN = pose-line index) so they pair 1:1 -with re10k_control.parse_clip()'s frames. - - nix develop -c python3 pose/re10k_fetch.py CLIP.txt OUT_DIR [step] - -yt-dlp comes from `nix run nixpkgs#yt-dlp`; ffmpeg is on PATH. Dead/blocked videos -raise -- use find_live_clip to skip them. -""" -import os -import subprocess -import re10k_control as rc - - -def _ytdlp(*a): - return ["nix", "run", "nixpkgs#yt-dlp", "--", *a] - - -def is_live(url, timeout=60): - r = subprocess.run(_ytdlp("--no-warnings", "--simulate", "--get-duration", url), - capture_output=True, text=True, timeout=timeout) - return r.returncode == 0 - - -def download_video(url, out_dir, height=720, timeout=900): - os.makedirs(out_dir, exist_ok=True) - vid = os.path.join(out_dir, "video.mp4") - if os.path.exists(vid) and os.path.getsize(vid) > 0: - return vid - fmt = (f"bv*[height<={height}][ext=mp4]/bv*[height<={height}]/" - f"b[height<={height}]/best") - r = subprocess.run(_ytdlp("-f", fmt, "--no-warnings", "--no-playlist", - "-o", vid, url), - capture_output=True, text=True, timeout=timeout) - if not (os.path.exists(vid) and os.path.getsize(vid) > 0): - raise RuntimeError("yt-dlp failed:\n" + r.stderr[-600:]) - return vid - - -def extract_frames(vid, frames, indices, out_dir): - imgs = {} - for i in indices: - ts = frames[i]["ts"] / 1e6 # absolute video time (s) - p = os.path.join(out_dir, f"f{i:04d}.png") - if not os.path.exists(p): - subprocess.run(["ffmpeg", "-nostdin", "-loglevel", "error", - "-ss", f"{ts:.6f}", "-i", vid, "-frames:v", "1", - "-q:v", "2", "-y", p], check=True, timeout=120) - imgs[i] = p - return imgs - - -def fetch(clip_path, out_dir, indices=None, step=10, height=720): - url, frames = rc.parse_clip(clip_path) - if indices is None: - indices = list(range(0, len(frames), step)) - vid = download_video(url, out_dir, height) - imgs = extract_frames(vid, frames, indices, out_dir) - return url, frames, imgs - - -def find_live_clip(split="test", start=0, limit=300, min_frames=120): - """First clip in [start, start+limit) that has >= min_frames AND a live video.""" - for cp in rc.list_clips(split)[start:start + limit]: - _, fr = rc.parse_clip(cp) - if len(fr) < min_frames: - continue - try: - if is_live(rc.parse_clip(cp)[0]): - return cp - except subprocess.TimeoutExpired: - continue - return None - - -if __name__ == "__main__": - import sys - clip, out_dir = sys.argv[1], sys.argv[2] - step = int(sys.argv[3]) if len(sys.argv) > 3 else 10 - url, frames, imgs = fetch(clip, out_dir, step=step) - print(f"got {len(imgs)} frames from {url}") - print("indices:", sorted(imgs)[:6], "...", sorted(imgs)[-3:]) diff --git a/pose/render_ply.py b/pose/render_ply.py deleted file mode 100644 index 1cde6c7..0000000 --- a/pose/render_ply.py +++ /dev/null @@ -1,54 +0,0 @@ -"""Quick pinhole projection of a colored PLY to a PNG (z-buffered point splat) -- -a visual coherence check for the accumulated cloud, no 3D viewer needed. - - nix develop -c python3 pose/render_ply.py world.ply out.png [focal] [radius] -""" -import sys -import numpy as np -import cv2 - - -def read_ply(path): - with open(path, "rb") as f: - n = 0 - while True: - l = f.readline() - if l.startswith(b"element vertex"): - n = int(l.split()[-1]) - if l.strip() == b"end_header": - break - rec = np.frombuffer(f.read(n * 15), - dtype=[("x", " 1e-3 - Xc, col = Xc[good], rgb[good] - u = focal * Xc[:, 0] / Xc[:, 2] + W / 2 - v = focal * Xc[:, 1] / Xc[:, 2] + H / 2 - ui, vi = np.round(u).astype(int), np.round(v).astype(int) - img = np.zeros((H, W, 3), np.uint8) - order = np.argsort(-Xc[:, 2]) # far first; near overwrites - ui, vi, col = ui[order], vi[order], col[order] - for dy in range(-radius, radius + 1): - for dx in range(-radius, radius + 1): - uu, vv = ui + dx, vi + dy - inb = (uu >= 0) & (uu < W) & (vv >= 0) & (vv < H) - img[vv[inb], uu[inb]] = col[inb] - return img - - -if __name__ == "__main__": - ply, out = sys.argv[1], sys.argv[2] - focal = float(sys.argv[3]) if len(sys.argv) > 3 else 274.0 - radius = int(sys.argv[4]) if len(sys.argv) > 4 else 1 - xyz, rgb = read_ply(ply) - img = render(xyz, rgb, focal, radius=radius) - cv2.imwrite(out, cv2.cvtColor(img, cv2.COLOR_RGB2BGR)) - print(f"rendered {len(xyz):,} pts -> {out} ({(img.any(2)).mean()*100:.0f}% filled)") diff --git a/pose/test_pose.py b/pose/test_pose.py deleted file mode 100644 index 3ad0c80..0000000 --- a/pose/test_pose.py +++ /dev/null @@ -1,258 +0,0 @@ -"""Asset-free golden tests for the pose / coordinate-normalization prototype. - -No model, no fixtures, no cv2: synthetic geometry with KNOWN ground truth, the -way CLAUDE.md wants new ops verified ("pinned to hand-computed references... a -wrong op should fail with zero fixtures"). Run: - - nix develop -c python3 pose/test_pose.py # from repo root - cd pose && nix develop -c python3 test_pose.py - -Each test prints PASS/FAIL with the numbers it checked; exit code is nonzero on -any failure. -""" -import numpy as np - -import align -import focal -import pnp - -RNG = np.random.default_rng(12345) -_fails = [] - - -def check(name, cond, detail: object = ""): - tag = "PASS" if cond else "FAIL" - print(f" [{tag}] {name}" + (f" ({detail})" if detail else "")) - if not cond: - _fails.append(name) - - -def rand_rotation(rng): - Q, _ = np.linalg.qr(rng.normal(size=(3, 3))) - if np.linalg.det(Q) < 0: - Q[:, 0] = -Q[:, 0] - return Q - - -def rot_angle(R): - return np.degrees(np.arccos(np.clip((np.trace(R) - 1) / 2, -1, 1))) - - -# =========================================================================== -# Coordinate-system normalization (the scale question) -# =========================================================================== - -def test_similarity_roundtrip(): - """Recover a KNOWN (s,R,t) exactly -- the fundamental correctness golden.""" - print("test_similarity_roundtrip") - X = RNG.normal(size=(200, 3)) * np.array([3, 2, 5]) - s_t, R_t, t_t = 1.7, rand_rotation(RNG), np.array([4.0, -1.0, 2.0]) - Y = align.apply_sim((s_t, R_t, t_t), X) - s, R, t = align.fit_similarity(X, Y) - check("scale", abs(s - s_t) < 1e-9, f"{s:.12f} vs {s_t}") - check("rotation", rot_angle(R @ R_t.T) < 1e-7, f"{rot_angle(R @ R_t.T):.2e} deg") - check("translation", np.linalg.norm(t - t_t) < 1e-7) - check("residual~0", align._rms(align.apply_sim((s, R, t), X) - Y) < 1e-9) - - -def test_scale_detection(): - """B differs from A by PURE uniform scale: rigid fails, similarity nails it. - This is the detector for 'you need the scale DoF' -- the expected cross-run case.""" - print("test_scale_detection") - X = RNG.normal(size=(300, 3)) * 2.0 - s_t = 2.5 - Y = align.apply_sim((s_t, rand_rotation(RNG), np.array([1.0, 2.0, -3.0])), X) - L = align.diagnose(X, Y) - check("recovered scale", abs(L["scale"] - s_t) < 1e-8, f"{L['scale']:.10f}") - check("rigid residual large", L["rigid"] / L["scene"] > 0.1, f"{L['rigid']/L['scene']:.3f}") - check("similarity residual ~0", L["similarity"] / L["scene"] < 1e-9) - check("verdict needs_scale", L["verdict"] == "needs_scale", L["verdict"]) - - -def test_nonlinear_detection(): - """Affine and genuinely-nonlinear warps are distinguished from pure scale.""" - print("test_nonlinear_detection") - X = RNG.normal(size=(400, 3)) * 2.0 + np.array([0, 0, 6.0]) - - # (a) anisotropic (affine) warp: stretch axes unequally -> not a similarity, - # but affine fits exactly. - A = np.diag([1.0, 1.6, 0.7]) @ rand_rotation(RNG) - Yaff = (A @ X.T).T + np.array([1.0, 0.0, 2.0]) - L = align.diagnose(X, Yaff) - check("affine: similarity fails", L["similarity"] / L["scene"] > 1e-3, - f"{L['similarity']/L['scene']:.3f}") - check("affine: affine fits", L["affine"] / L["scene"] < 1e-9) - check("affine: verdict needs_affine", L["verdict"] == "needs_affine", L["verdict"]) - - # (b) nonlinear warp: depth-quadratic stretch (focal-error fingerprint) -> - # even affine cannot remove it, and the residual correlates with depth. - Z = X[:, 2] - Ynl = X.copy() - Ynl[:, 0] *= (1.0 + 0.15 * (Z - Z.mean()) ** 2 / Z.var()) - L2 = align.diagnose(X, Ynl) - check("nonlinear: affine fails", L2["affine"] / L2["scene"] > 1e-3, - f"{L2['affine']/L2['scene']:.3f}") - check("nonlinear: verdict nonlinear", L2["verdict"] == "nonlinear", L2["verdict"]) - check("nonlinear: depth-correlated residual", L2["structured"], - f"corr={L2['depth_corr']:.2f}") - - -def test_outlier_robustness(): - """RANSAC similarity survives 30% gross floaters in the overlap.""" - print("test_outlier_robustness") - n = 400 - X = RNG.normal(size=(n, 3)) * 2.0 - s_t, R_t, t_t = 1.3, rand_rotation(RNG), np.array([0.5, -2.0, 1.0]) - Y = align.apply_sim((s_t, R_t, t_t), X) - n_out = int(0.3 * n) - out_idx = RNG.choice(n, n_out, replace=False) - Y[out_idx] += RNG.normal(size=(n_out, 3)) * 10.0 # floaters - s, R, t, inl = align.fit_similarity_ransac(X, Y, thresh=0.05) - check("scale recovered", abs(s - s_t) < 1e-3, f"{s:.6f} vs {s_t}") - check("rotation recovered", rot_angle(R @ R_t.T) < 0.1) - check("translation recovered", np.linalg.norm(t - t_t) < 1e-2) - check("outliers excluded", (~inl[out_idx]).mean() > 0.9, - f"{(~inl[out_idx]).mean()*100:.0f}% of outliers rejected") - - -def test_loop_correction(): - """Sim(3) even-distribution loop closure recovers a loop from uniform drift.""" - print("test_loop_correction") - R = rand_rotation(RNG) - M = align.sim_matrix(1.07, R, np.array([0.3, -0.2, 0.1])) - check("M^0 == I", np.allclose(align.sim_frac_power(M, 0.0), np.eye(4), atol=1e-9)) - check("M^1 == M", np.allclose(align.sim_frac_power(M, 1.0), M, atol=1e-9)) - check("(M^.5)^2 == M", - np.allclose(align.sim_frac_power(M, .5) @ align.sim_frac_power(M, .5), M, atol=1e-9)) - # distributing a known accumulated drift D^(k/n) cancels it to ~0 - n = 12 - D = align.sim_matrix(1.15, R, np.array([0.4, -0.2, 0.1])) - clean = np.array([[np.cos(t), np.sin(t), 0.1 * t] - for t in np.linspace(0, 2 * np.pi, n + 1)]) - drift = np.array([(align.sim_frac_power(np.linalg.inv(D), k / n) @ - np.append(clean[k], 1))[:3] for k in range(n + 1)]) - corr = np.array([(align.sim_frac_power(D, k / n) @ - np.append(drift[k], 1))[:3] for k in range(n + 1)]) - check("drift present before", align._rms(drift - clean) > 0.1) - check("recovers clean loop", align._rms(corr - clean) < 1e-9, - f"ATE {align._rms(corr - clean):.1e}") - - -def test_loop_closure(): - """A closed loop of consistent links ~= identity; injected scale drift shows.""" - print("test_loop_closure") - links = [] - for _ in range(4): - links.append((RNG.uniform(0.8, 1.2), rand_rotation(RNG), RNG.normal(size=3))) - # close the loop exactly: last link = inverse of the composed first three - T = align.identity() - for Ti in links[:3]: - T = align.compose(Ti, T) - links_closed = links[:3] + [align.invert(T)] - e = align.loop_closure_error(links_closed) - check("closed loop ~ identity", e["scale_err"] < 1e-9 and e["rot_deg"] < 1e-6 - and e["trans"] < 1e-9, f"scale_err={e['scale_err']:.1e}") - # now inject a 12% scale error into one link - bad = links_closed[:] - bad[0] = (bad[0][0] * 1.12, bad[0][1], bad[0][2]) - e2 = align.loop_closure_error(bad) - check("scale drift detected", abs(e2["scale_err"] - np.log(1.12)) < 1e-9, - f"scale_err={e2['scale_err']:.4f} (expected {np.log(1.12):.4f})") - - -# =========================================================================== -# PnP (verified against analytic ground truth, numpy backend) -# =========================================================================== - -def make_intrinsics(f=400.0, H=512, W=512): - pp = np.array([W / 2.0, H / 2.0]) - K = np.array([[f, 0, pp[0]], [0, f, pp[1]], [0, 0, 1.0]]) - return K, pp, f, H, W - - -def project(Pw, R, t, K): - Xc = (R @ Pw.T).T + t - proj = (K @ Xc.T).T - return proj[:, :2] / proj[:, 2:3], Xc[:, 2] - - -def test_focal_recovery(): - """Weiszfeld recovers a known focal from exact (point, pixel) pairs.""" - print("test_focal_recovery") - K, pp, f, H, W = make_intrinsics(f=437.0) - Xc = RNG.uniform([-2, -2, 4], [2, 2, 9], size=(500, 3)) # camera-frame, Z>0 - px = f * Xc[:, :2] / Xc[:, 2:3] + pp - f_est = focal.estimate_focal(Xc, px, pp) - check("exact focal", abs(f_est - f) < 1e-6, f"{f_est:.9f} vs {f}") - px_noisy = px + RNG.normal(size=px.shape) * 0.5 - f_n = focal.estimate_focal(Xc, px_noisy, pp) - check("focal under noise", abs(f_n - f) / f < 0.02, f"{f_n:.2f} vs {f}") - - -def test_pnp_recovery(): - """numpy DLT+RANSAC PnP recovers known per-view extrinsics (cheirality ok).""" - print("test_pnp_recovery") - K, pp, f, H, W = make_intrinsics() - Pw = RNG.uniform([-2, -2, 4], [2, 2, 8], size=(500, 3)) # world == view-0 cam frame - worst_R, worst_t = 0.0, 0.0 - for _ in range(8): - # small motion so all points stay in front of the moved camera - ax = RNG.normal(size=3) - ax /= np.linalg.norm(ax) - ang = np.radians(RNG.uniform(3, 20)) - K_ = np.array([[0, -ax[2], ax[1]], [ax[2], 0, -ax[0]], [-ax[1], ax[0], 0]]) - R = np.eye(3) + np.sin(ang) * K_ + (1 - np.cos(ang)) * K_ @ K_ - t = RNG.uniform(-0.6, 0.6, size=3) - px, z = project(Pw, R, t, K) - if np.any(z <= 0): - continue - c2w, inl = pnp.solve_pnp_numpy(Pw, px, K) - w2c = np.linalg.inv(c2w) - worst_R = max(worst_R, rot_angle(w2c[:3, :3] @ R.T)) - worst_t = max(worst_t, np.linalg.norm(w2c[:3, 3] - t)) - check("rotation recovered", worst_R < 1e-4, f"worst {worst_R:.2e} deg") - check("translation recovered", worst_t < 1e-5, f"worst {worst_t:.2e}") - - -def test_pnp_outliers(): - """PnP RANSAC tolerates corrupted correspondences (floaters).""" - print("test_pnp_outliers") - K, pp, f, H, W = make_intrinsics() - Pw = RNG.uniform([-2, -2, 4], [2, 2, 8], size=(600, 3)) - R = rand_rotation(RNG) - # keep rotation modest so points stay in front - R = np.eye(3) + 0.15 * (R - R.T) - R, _ = np.linalg.qr(R) - t = np.array([0.3, -0.2, 0.4]) - px, z = project(Pw, R, t, K) - keep = z > 0 - Pw, px = Pw[keep], px[keep] - n_out = int(0.25 * len(px)) - oi = RNG.choice(len(px), n_out, replace=False) - px[oi] += RNG.uniform(-150, 150, size=(n_out, 2)) # corrupt pixels - c2w, inl = pnp.solve_pnp_numpy(Pw, px, K, thresh_px=2.0, iters=300) - w2c = np.linalg.inv(c2w) - check("rotation under outliers", rot_angle(w2c[:3, :3] @ R.T) < 0.2, - f"{rot_angle(w2c[:3, :3] @ R.T):.3f} deg") - check("translation under outliers", np.linalg.norm(w2c[:3, 3] - t) < 0.02) - check("outliers rejected", (~inl[oi]).mean() > 0.85, - f"{(~inl[oi]).mean()*100:.0f}% rejected") - - -def main(): - tests = [ - test_similarity_roundtrip, test_scale_detection, test_nonlinear_detection, - test_outlier_robustness, test_loop_correction, test_loop_closure, - test_focal_recovery, test_pnp_recovery, test_pnp_outliers, - ] - for t in tests: - t() - print() - if _fails: - print(f"FAILED ({len(_fails)}): " + ", ".join(_fails)) - raise SystemExit(1) - print("ALL POSE TESTS OK") - - -if __name__ == "__main__": - main() diff --git a/pose/tt_control.py b/pose/tt_control.py deleted file mode 100644 index fdd402f..0000000 --- a/pose/tt_control.py +++ /dev/null @@ -1,124 +0,0 @@ -"""Tanks-and-Temples (NSVF release) loader -- the dense, GT-posed control. - -A known-good, ground-truth-posed sequence so that when the live FreeSplatter path -misbehaves we can tell whether it's the data/model or our code. NSVF layout per -scene (.cache/tt/TanksAndTemple//): - rgb/{split}_{idx}_{origframe}.png 1920x1080 - pose/{split}_{idx}_{origframe}.txt 4x4 camera-to-world, OpenGL convention - intrinsics.txt 4x4 (fx 0 cx 0 / 0 fy cy 0 / ...) -split 0 = train, 1 = test (held-out orbit views). Frames sorted by origframe give -true capture order. - -What this gives the control: - * GT relative pose between any two frames -> check our PnP relative pose. - * GT focal after 512x512 preprocessing -> check FreeSplatter's recovered focal. - * GT baseline / parallax per frame-stride -> drive the baseline sweep. - -OpenGL->OpenCV: negate the camera y,z axes (columns 1,2 of the c2w rotation), -matching the runner's `c2ws[:, :, 1:3] *= -1`. cam2world, OpenCV (x right, y down, -z forward) -- the same convention our pnp.estimate_poses returns. - -Usage: - nix develop -c python3 pose/tt_control.py [Scene] # default Truck -""" -import os -import re -import sys -import numpy as np - -TT_ROOT = os.environ.get( - "TT_ROOT", - os.path.join(os.path.dirname(__file__), "..", ".cache", "tt", "TanksAndTemple")) - -_GL2CV = np.diag([1.0, -1.0, -1.0, 1.0]) # negate camera y,z axes (right-mul) - - -def gl_to_cv(c2w): - """OpenGL camera-to-world -> OpenCV camera-to-world.""" - return c2w @ _GL2CV - - -def load_intrinsics(scene_dir): - M = np.loadtxt(os.path.join(scene_dir, "intrinsics.txt")) - return {"fx": float(M[0, 0]), "fy": float(M[1, 1]), - "cx": float(M[0, 2]), "cy": float(M[1, 2])} - - -def _origframe(name): - # {split}_{idx}_{origframe}.ext -> int(origframe) - return int(re.split(r"[_.]", name)[2]) - - -def load_scene(scene, split=None): - """Return (frames, intr). frames: list of dicts {name, idx, orig, img, c2w(OpenCV)} - sorted by capture order. split=0/1 to filter train/test, None for all.""" - scene_dir = scene if os.path.isdir(scene) else os.path.join(TT_ROOT, scene) - intr = load_intrinsics(scene_dir) - pose_dir, rgb_dir = os.path.join(scene_dir, "pose"), os.path.join(scene_dir, "rgb") - frames = [] - for pf in os.listdir(pose_dir): - if not pf.endswith(".txt"): - continue - sp = int(pf.split("_")[0]) - if split is not None and sp != split: - continue - c2w_gl = np.loadtxt(os.path.join(pose_dir, pf)) - img = os.path.join(rgb_dir, pf[:-4] + ".png") - frames.append({"name": pf[:-4], "split": sp, "orig": _origframe(pf), - "img": img, "c2w": gl_to_cv(c2w_gl)}) - frames.sort(key=lambda f: f["orig"]) - return frames, intr - - -def rel_pose(c2w_a, c2w_b): - """Camera b expressed in camera a's frame: inv(c2w_a) @ c2w_b (OpenCV).""" - return np.linalg.inv(c2w_a) @ c2w_b - - -def rot_deg(R): - return float(np.degrees(np.arccos(np.clip((np.trace(R[:3, :3]) - 1) / 2, -1, 1)))) - - -def scene_radius(frames): - """Mean camera-center distance to the centroid of all camera centers.""" - cen = np.array([f["c2w"][:3, 3] for f in frames]) - return float(np.linalg.norm(cen - cen.mean(0), axis=1).mean()), cen.mean(0) - - -def baseline_report(scene="Truck"): - frames, intr = load_scene(scene) - n = len(frames) - radius, _ = scene_radius(frames) - f512 = intr["fx"] * 512.0 / 1080.0 # after center-crop 1080 + resize 512 - print(f"=== {scene}: {n} frames " - f"(train {sum(f['split']==0 for f in frames)}, " - f"test {sum(f['split']==1 for f in frames)}) ===") - print(f"intrinsics 1920x1080: fx={intr['fx']:.1f} fy={intr['fy']:.1f} " - f"cx={intr['cx']:.1f} cy={intr['cy']:.1f}") - print(f"GT focal after 512x512 preprocess (center-crop 1080 -> resize): " - f"{f512:.1f} px (pp=256)") - print(f"orbit radius (mean cam dist to center): {radius:.4f}") - print("\nframe-stride sweep -- viewpoint change between frame i and i+stride:") - print(f" {'stride':>6} {'pairs':>6} {'rot(deg) med':>13} {'p10..p90':>16} " - f"{'baseline/r med':>15} {'parallax(deg)':>14}") - for s in (1, 2, 3, 5, 10, 20, 40): - rots, bl = [], [] - for i in range(0, n - s): - T = rel_pose(frames[i]["c2w"], frames[i + s]["c2w"]) - rots.append(rot_deg(T)) - bl.append(np.linalg.norm(T[:3, 3])) - if not rots: - continue - rots, bl = np.array(rots), np.array(bl) - blr = bl / radius - # small-angle parallax ~ baseline / radius (rad) -> deg - par = np.degrees(np.median(blr)) - print(f" {s:>6} {len(rots):>6} {np.median(rots):>13.2f} " - f"{np.percentile(rots,10):>7.2f}..{np.percentile(rots,90):<7.2f} " - f"{np.median(blr):>15.3f} {par:>14.2f}") - print("\nUse this to pick pair strides for the sweet-spot sweep: too small a " - "stride -> ~0 parallax (degenerate depth); too large -> low overlap.") - - -if __name__ == "__main__": - baseline_report(sys.argv[1] if len(sys.argv) > 1 else "Truck") diff --git a/pose/tt_experiment.py b/pose/tt_experiment.py deleted file mode 100644 index 8f9449a..0000000 --- a/pose/tt_experiment.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Run FreeSplatter-scene on Tanks-and-Temples pairs; check recovered pose + focal -against GROUND TRUTH as a function of frame stride (baseline). The control: GT -poses are known-good, so error here is OUR pipeline or the model being out of -distribution -- not unknown data. - -Metrics per pair (view0 -> view1): - GTrot : GT relative rotation (deg) = the viewpoint change we asked for. - valid% : min over views of opacity>0.05 fraction (PnP support). - poseErr : angle between our recovered relative rotation and GT (deg) -- the - scale-free "is the pose right" check. - transDir : cos angle between our relative-translation direction and GT (1=perfect; - FreeSplatter's metric scale is arbitrary, so only direction is checked). - scaleRat : ||our t|| / ||GT t|| -- the monocular scale FreeSplatter chose vs metric. - focalErr%: |recovered focal - GT focal(512)| / GT. - - FS_DEVICE=cpu nix develop -c python3 pose/tt_experiment.py Truck 120 2,5,10,20 -""" -import os -import sys -import subprocess -import numpy as np - -import tt_control as tt -import pnp - -HERE = os.path.dirname(os.path.abspath(__file__)) -GGUF = os.environ.get("FS_GGUF", os.path.join(HERE, "..", ".cache", "freesplatter-scene-f16.gguf")) -CLI = os.environ.get("FS_CLI", os.path.join(HERE, "..", "build", "release", "bin", "free_splatter-cli")) -DEVICE = os.environ.get("FS_DEVICE", "cpu") -SCRATCH = os.environ.get("SCRATCH", "/tmp") - - -def run_engine(img0, img1, out): - r = subprocess.run([CLI, "--device", DEVICE, "--out", out, GGUF, img0, img1], - capture_output=True, text=True) - if r.returncode != 0: - raise RuntimeError(f"CLI failed: {r.stderr[-500:]}") - a = np.fromfile(out, np.float32).reshape(2, 512, 512, 23) - pts = [a[v, ..., 0:3].astype(np.float64) for v in (0, 1)] - op = [a[v, ..., 15].astype(np.float64) for v in (0, 1)] - return pts, op - - -def cos(a, b): - return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12)) - - -def experiment(scene="Truck", anchor=120, strides=(2, 5, 10, 20)): - frames, intr = tt.load_scene(scene) - f_gt = intr["fx"] * 512.0 / 1080.0 - print(f"=== {scene} anchor #{anchor} ({frames[anchor]['name']}) " - f"GT focal(512)={f_gt:.1f} device={DEVICE} ===") - print(f" {'stride':>6} {'GTrot':>6} {'valid%':>6} {'poseErr':>8} " - f"{'transDir':>8} {'scaleRat':>8} {'focal':>7} {'fErr%':>6}") - for s in strides: - j = anchor + s - if j >= len(frames): - continue - out = os.path.join(SCRATCH, f"tt_{scene}_{anchor}_{s}.f32") - pts, op = run_engine(frames[anchor]["img"], frames[j]["img"], out) - res = pnp.estimate_poses(pts, op, normalize=False) # scene recipe - c2w, f_est = res["cam2world"], res["focal"] - rel = np.linalg.inv(c2w[0]) @ c2w[1] - G = tt.rel_pose(frames[anchor]["c2w"], frames[j]["c2w"]) - poseErr = tt.rot_deg(rel[:3, :3] @ G[:3, :3].T) - tdir = cos(rel[:3, 3], G[:3, 3]) - srat = np.linalg.norm(rel[:3, 3]) / (np.linalg.norm(G[:3, 3]) + 1e-12) - validp = min((op[0] > 0.05).mean(), (op[1] > 0.05).mean()) * 100 - ferr = abs(f_est - f_gt) / f_gt * 100 - print(f" {s:>6} {tt.rot_deg(G):>6.1f} {validp:>6.1f} {poseErr:>8.2f} " - f"{tdir:>8.4f} {srat:>8.3f} {f_est:>7.1f} {ferr:>6.1f}") - - -if __name__ == "__main__": - scene = sys.argv[1] if len(sys.argv) > 1 else "Truck" - anchor = int(sys.argv[2]) if len(sys.argv) > 2 else 120 - strides = tuple(int(x) for x in sys.argv[3].split(",")) if len(sys.argv) > 3 \ - else (2, 5, 10, 20) - experiment(scene, anchor, strides) diff --git a/src/linalg.h b/src/linalg.h index 5705621..a510471 100644 --- a/src/linalg.h +++ b/src/linalg.h @@ -6,8 +6,9 @@ // C++ with no extra runtime deps beyond ggml) — every routine here reduces to one // primitive: a symmetric cyclic-Jacobi eigensolver. SVD of a 3x3 is built as the // eigendecomposition of MᵀM; the DLT nullspace is the smallest eigenvector of -// AᵀA. This mirrors the dependency-free numpy reference solver in pose/pnp.py, -// which is itself verified bit-for-bit (~1e-7) against cv2 in check_cv2_parity.py. +// AᵀA. This mirrored the dependency-free numpy reference solver of the (now +// removed; see git history) pose/ prototype, itself verified bit-for-bit (~1e-7) +// against cv2. // // Everything is f64, row-major, and header-only. Sizes are tiny (<=12) so the // O(n^3) Jacobi sweeps are negligible next to the engine forward pass. diff --git a/src/pose.cpp b/src/pose.cpp index 01ea13e..78cc1b3 100644 --- a/src/pose.cpp +++ b/src/pose.cpp @@ -1,9 +1,13 @@ -// pose.cpp — C++ port of the pose/ prototype (focal.py + align.py + pnp.py). +// pose.cpp — C++ port of the (now removed) pose/ research prototype: focal + +// Umeyama/Sim(3) align + robust PnP + sliding-window accumulation / loop closure +// / consensus fusion. See git history for the Python prototype it was validated +// against, layer by layer. // -// Faithful to FreeSplatter's scene recipe; the dependency-free numpy reference -// solver is what ships here (verified ~1e-7 vs cv2, bit-exact vs upstream -// estimate_poses on real engine output). All linear algebra goes through the -// self-contained Jacobi eigensolver in linalg.h — no Eigen, no OpenCV. +// Faithful to FreeSplatter's scene recipe; the dependency-free solver is what +// ships here (the prototype's numpy reference was verified ~1e-7 vs cv2 and +// bit-exact vs upstream estimate_poses on real engine output). All linear algebra +// goes through the self-contained Jacobi eigensolver in linalg.h — no Eigen, no +// OpenCV. #include "pose.h" #include diff --git a/src/pose.h b/src/pose.h index dbe1a23..7641d1a 100644 --- a/src/pose.h +++ b/src/pose.h @@ -1,14 +1,14 @@ // pose.h — downstream camera-pose recovery + cross-run alignment (C++ port). // -// This is the C++ port of the proven `pose/` research prototype (focal.py + -// align.py + pnp.py): given the engine's per-pixel gaussians it recovers each +// This is the C++ port of the (now removed; see git history) proven `pose/` +// research prototype: given the engine's per-pixel gaussians it recovers each // view's camera (PnP), and aligns successive runs into one accumulating world // (Sim(3)). It is dependency-free — only the self-contained linalg.h — so it // ships from the CLI / C API with no Python and no Eigen/OpenCV (see CLAUDE.md). // // Faithful to FreeSplatter's scene recipe (estimate_poses -> DUSt3R fast_pnp): -// the numpy reference solver, verified ~1e-7 against cv2 and bit-exact to upstream -// estimate_poses on real engine output (pose/check_upstream_parity.py). +// the prototype's numpy reference solver was verified ~1e-7 against cv2 and +// bit-exact to upstream estimate_poses on real engine output. // // Conventions: f64 throughout; a similarity acts as x -> s*(R@x)+t; gaussian // channel layout (scene, 23ch) is xyz[0:3] SH[3:15] opacity[15] scale[16:19] @@ -71,7 +71,7 @@ LoopError loop_closure_error(const std::vector & links); Mat4 sim_frac_power(const Mat4 & M, double f); // ---- PnP ------------------------------------------------------------------- -// RANSAC DLT PnP with known intrinsics K (mirrors pose/pnp.py numpy backend). +// RANSAC DLT PnP with known intrinsics K (the prototype's numpy backend). // Xw is N*3 world points, pixels is N*2 (col,row). Returns cam2world (4x4) and // fills `inliers` (size N, 0/1). Kept as the asset-free golden-test reference; on // real scenes the DLT inherits the planar/mirror degeneracy — use solve_pnp. From 831a13db1157ccfde0ff32589363adf8c2994c40 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 16:10:48 +0100 Subject: [PATCH 15/33] fuzz: pose C-API + image-decode harnesses; fix a SIGFPE the fuzzer found Fuzz the untrusted user-facing surfaces opened by the pose work (GGUF stays trusted, not fuzzed): - fuzz_pose: the public pose C-API (free_splatter_estimate_poses + the accumulator add_pair/cloud/fuse/camera_path) on arbitrary float gaussian buffers (NaN/Inf/denormals) and fuzz-chosen geometry. - fuzz_decode: the image-FILE path (arbitrary bytes -> stb_image -> crop/resize -> CHW), the surface a user photo crosses in the CLI/demo. stb is third-party; per CLAUDE.md we fuzz the boundary and would guard rather than patch a stb-internal trip -- none seen (31k+ runs clean). Fixes found by fuzz_pose (our code, so fixed not guarded): - SIGFPE: fit_similarity_ransac sampled `% N` with N=0 (an image pair with no overlapping valid pixels). Guard N<3 (RANSAC's minimal sample): all-inlier, plain fit when N>=1, else identity. - Latent float-cast UB: consensus_fuse now skips non-finite cloud points and clamps the voxel-coordinate cast, so an Inf point can't make (int32)floor(NaN). All four fuzzers clean; asset-free ctest tier (incl. test_pose fusion goldens) still green under ASan/UBSan. Co-Authored-By: Claude Opus 4.8 (1M context) --- CMakeLists.txt | 9 ++++++- fuzz/README.md | 21 ++++++++++++--- fuzz/fuzz_decode.cpp | 39 ++++++++++++++++++++++++++++ fuzz/fuzz_pose.cpp | 62 ++++++++++++++++++++++++++++++++++++++++++++ src/pose.cpp | 35 +++++++++++++++++++++---- 5 files changed, 156 insertions(+), 10 deletions(-) create mode 100644 fuzz/fuzz_decode.cpp create mode 100644 fuzz/fuzz_pose.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ae59312..2e757af 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,12 +145,19 @@ if (FREE_SPLATTER_BUILD_TOOLS AND NOT EMSCRIPTEN) endif() if (FREE_SPLATTER_FUZZ) - foreach(target fuzz_image fuzz_options) + foreach(target fuzz_image fuzz_options fuzz_pose) add_executable(${target} fuzz/${target}.cpp) target_link_libraries(${target} PRIVATE free_splatter) target_include_directories(${target} PRIVATE src) target_link_options(${target} PRIVATE -fsanitize=fuzzer) # libFuzzer main endforeach() + # fuzz_decode also drives the vendored (third-party) stb decoder, so it needs + # the stb implementation TU + headers (built warning-quiet like the CLI). + add_executable(fuzz_decode fuzz/fuzz_decode.cpp tools/stb_impl.cpp) + target_link_libraries(fuzz_decode PRIVATE free_splatter) + target_include_directories(fuzz_decode PRIVATE src ${CMAKE_SOURCE_DIR}/third_party/stb) + target_link_options(fuzz_decode PRIVATE -fsanitize=fuzzer) + set_source_files_properties(tools/stb_impl.cpp PROPERTIES COMPILE_OPTIONS "-w") endif() # --- WASM (Emscripten) ------------------------------------------------------- diff --git a/fuzz/README.md b/fuzz/README.md index 8789e0e..e36f47d 100644 --- a/fuzz/README.md +++ b/fuzz/README.md @@ -6,8 +6,10 @@ libFuzzer harnesses for the **untrusted** input surfaces, built with ```sh nix develop .#fuzz cmake --preset fuzz && cmake --build --preset fuzz -mkdir -p corpus_img && ./build/fuzz/fuzz_image -max_total_time=60 corpus_img -mkdir -p corpus_opt && ./build/fuzz/fuzz_options -max_total_time=60 corpus_opt +mkdir -p corpus_img && ./build/fuzz/fuzz_image -max_total_time=60 corpus_img +mkdir -p corpus_opt && ./build/fuzz/fuzz_options -max_total_time=60 corpus_opt +mkdir -p corpus_pose && ./build/fuzz/fuzz_pose -max_total_time=60 corpus_pose +mkdir -p corpus_decode && ./build/fuzz/fuzz_decode -max_total_time=60 corpus_decode ``` ## Targets @@ -18,9 +20,20 @@ mkdir -p corpus_opt && ./build/fuzz/fuzz_options -max_total_time=60 corpus_opt crashes, reads OOB, or returns a populated buffer on a rejected input. - **`fuzz_options`** — the options-builder / device-string setters with arbitrary byte strings; asserts no crash and NULL-safe frees. +- **`fuzz_pose`** — the public pose C-API: `free_splatter_estimate_poses` and the + accumulator (`add_pair` → `cloud` / `fuse` / `camera_path`) driven with + arbitrary float gaussian buffers (NaN/Inf/denormals) and fuzz-chosen geometry. + Caught a real SIGFPE (the RANSAC sampler's `% N` with N=0 valid + correspondences) and motivated the non-finite guards in `consensus_fuse` + (the float→int voxel-coordinate cast). +- **`fuzz_decode`** — the untrusted image-FILE path: arbitrary bytes → + `stb_image` → center-crop + resize → CHW (the surface a user photo crosses in + the CLI / demo). stb is vendored THIRD-PARTY; per CLAUDE.md we fuzz the boundary + and would guard (not patch) any stb-internal sanitizer trip — none seen so far. ## Trust boundary (intentional) The **GGUF model file is TRUSTED** and is deliberately **not** fuzzed: it is our -own converter's output, loaded by us. Image inputs and the public C-API string -arguments are **untrusted** and fuzzed. Keep this asymmetry — see `CLAUDE.md`. +own converter's output, loaded by us. Image inputs (encoded files and decoded +pixels) and the public C-API arguments are **untrusted** and fuzzed. Keep this +asymmetry — see `CLAUDE.md`. diff --git a/fuzz/fuzz_decode.cpp b/fuzz/fuzz_decode.cpp new file mode 100644 index 0000000..dcb0a5c --- /dev/null +++ b/fuzz/fuzz_decode.cpp @@ -0,0 +1,39 @@ +// libFuzzer harness for the UNTRUSTED image-FILE decode path: arbitrary bytes -> +// stb_image -> the CLI's center-crop + resize -> [0,1] CHW. This is the actual +// surface a user photo crosses in `free_splatter-cli` / the demo. stb_image is a +// vendored THIRD-PARTY decoder; per CLAUDE.md we fuzz the boundary and, where stb +// itself trips a sanitizer on malformed input, GUARD it rather than patch stb. +#include "stb_image.h" +#include "stb_image_resize2.h" + +#include +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) { + int w = 0, h = 0, n = 0; + // Decode arbitrary bytes as an image, forcing RGB (the CLI's contract). + unsigned char * px = stbi_load_from_memory(data, (int) size, &w, &h, &n, 3); + if (!px) return 0; // rejected -> nothing to do + + // Guard against a decoded size that would overflow the crop/resize math (stb + // can report large dims on crafted headers); the CLI works on sane photos. + if ((int64_t) w * h > 64ll * 1024 * 1024 || w <= 0 || h <= 0) { stbi_image_free(px); return 0; } + + // center-crop to a square, then resize to a small size (mirror the CLI). + const int s = w < h ? w : h, left = (w - s) / 2, top = (h - s) / 2; + std::vector sq((size_t) s * s * 3); + for (int y = 0; y < s; y++) + std::memcpy(&sq[(size_t) y * s * 3], &px[((size_t)(top + y) * w + left) * 3], (size_t) s * 3); + stbi_image_free(px); + + const int outsz = 32; + std::vector rz((size_t) outsz * outsz * 3); + stbir_resize_uint8_linear(sq.data(), s, s, 0, rz.data(), outsz, outsz, 0, STBIR_RGB); + + std::vector chw((size_t) 3 * outsz * outsz); + for (int c = 0; c < 3; c++) + for (int i = 0; i < outsz * outsz; i++) + chw[(size_t) c * outsz * outsz + i] = rz[(size_t) i * 3 + c] / 255.0f; + return 0; +} diff --git a/fuzz/fuzz_pose.cpp b/fuzz/fuzz_pose.cpp new file mode 100644 index 0000000..4d737c5 --- /dev/null +++ b/fuzz/fuzz_pose.cpp @@ -0,0 +1,62 @@ +// libFuzzer harness for the public pose C-API surface (free_splatter_estimate_poses +// + the accumulator). These now take caller-supplied float buffers across the ABI, +// so a binding that feeds NaN/Inf/degenerate geometry must not crash, read OOB, or +// trip UBSan (e.g. a non-finite float -> int voxel-coord cast). The gaussian buffer +// is engine output, but the C-API is a public boundary; keep it robust. +#include "free_splatter.h" + +#include +#include +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) { + if (size < 8) return 0; + + // small, fuzz-driven geometry so many inputs are explored cheaply + const int gc = 23; + const int H = 4 + (data[0] % 5) * 4; // 4,8,12,16,20 + const int W = 4 + (data[1] % 5) * 4; + const int P = H * W; + + auto fill = [&](std::vector & buf, size_t off) { + // reinterpret the fuzz bytes as floats (NaN/Inf/denormals all appear), + // tiling deterministically to fill the whole byte range. + uint8_t * b = (uint8_t *) buf.data(); + const size_t nbytes = buf.size() * sizeof(float); + for (size_t i = 0; i < nbytes; i++) b[i] = data[(off + i) % size]; + }; + + // 1) estimate_poses on a 2-view buffer + { + std::vector g((size_t) 2 * P * gc); + fill(g, 2); + std::vector c2w((size_t) 2 * 16); + float focal = 0; + free_splatter_estimate_poses(g.data(), 2, H, W, gc, 0.05f, c2w.data(), &focal); + } + + // 2) accumulator: add a few pairs, then cloud / fuse / camera_path + free_splatter_accumulator * acc = free_splatter_accumulator_new(H, W, 0.05f); + if (acc) { + const int npairs = 1 + (data[2] % 3); // 1..3 pairs + std::vector g((size_t) 2 * P * gc); + for (int k = 0; k < npairs; k++) { + fill(g, (size_t) 3 + k); + free_splatter_accumulator_add_pair(acc, g.data(), gc); + } + free_splatter_point * cloud = nullptr; size_t nc = 0; + if (free_splatter_accumulator_cloud(acc, &cloud, &nc) == 0) free_splatter_buf_free(cloud); + + const float voxel = 0.005f + (data[3] % 16) * 0.01f; // 0.005..0.155 + const int k = 1 + (data[4] % 4); // 1..4 + free_splatter_point * fused = nullptr; size_t nf = 0; + if (free_splatter_accumulator_fuse(acc, voxel, k, &fused, &nf) == 0) free_splatter_buf_free(fused); + + float * path = nullptr; int32_t nfr = 0; + if (free_splatter_accumulator_camera_path(acc, &path, &nfr) == 0) free_splatter_buf_free(path); + + free_splatter_accumulator_free(acc); + } + return 0; +} diff --git a/src/pose.cpp b/src/pose.cpp index 78cc1b3..f71a8ac 100644 --- a/src/pose.cpp +++ b/src/pose.cpp @@ -218,6 +218,14 @@ Sim3 fit_similarity(const double * X, const double * Y, int N, bool with_scale) Sim3 fit_similarity_ransac(const double * X, const double * Y, int N, double thresh, int iters, std::vector & inliers, bool with_scale, uint64_t seed) { + // RANSAC's minimal sample is 3 pairs; with fewer there is nothing to sample + // (and the sampler would divide by N). Degenerate but valid at the public + // boundary (e.g. an image pair with no overlapping valid pixels): take all + // points as inliers, and a plain fit when there is at least one. + if (N < 3) { + inliers.assign(N, 1); + return (N >= 1) ? fit_similarity(X, Y, N, with_scale) : sim_identity(); + } Rng rng(seed); std::vector best(N, 0); int best_cnt = 0; @@ -962,19 +970,30 @@ FuseStats consensus_fuse(const std::vector & cloud, double voxel_fra st.raw_points = (int64_t) cloud.size(); if (cloud.empty()) return st; + // A non-finite point (NaN/Inf — possible since the cloud is engine output and + // this is a public boundary) would poison the extent and, worse, make the + // float->int voxel-coord cast UB. Skip them throughout; fuse only finite points. + auto finite = [](const AccumPoint & p) { + return std::isfinite(p.x) && std::isfinite(p.y) && std::isfinite(p.z); + }; // extent = rms distance to centroid; voxel = voxel_frac * extent; coords from min. double mean[3] = {0,0,0}, lo[3] = {1e300,1e300,1e300}; + int64_t nf = 0; for (const auto & p : cloud) { + if (!finite(p)) continue; mean[0]+=p.x; mean[1]+=p.y; mean[2]+=p.z; lo[0]=std::min(lo[0],(double)p.x); lo[1]=std::min(lo[1],(double)p.y); lo[2]=std::min(lo[2],(double)p.z); + nf++; } - for (int c = 0; c < 3; c++) mean[c] /= cloud.size(); + if (nf == 0) return st; + for (int c = 0; c < 3; c++) mean[c] /= nf; double ss = 0; for (const auto & p : cloud) { + if (!finite(p)) continue; const double dx=p.x-mean[0], dy=p.y-mean[1], dz=p.z-mean[2]; ss += dx*dx + dy*dy + dz*dz; } - const double ext = std::sqrt(ss / cloud.size()); + const double ext = std::sqrt(ss / nf); const double v = voxel_frac * ext; if (!(v > 0)) return st; @@ -988,10 +1007,16 @@ FuseStats consensus_fuse(const std::vector & cloud, double voxel_fra std::unordered_map grid; grid.reserve(cloud.size() / 2 + 1); + auto vcoord = [&](double x, double lov) -> int32_t { + const double c = std::floor((x - lov) / v); // finite (x finite, v>0) + // clamp to int32 so an extreme-but-finite coordinate can't overflow the cast + if (c < -2147483640.0) return -2147483640; + if (c > 2147483640.0) return 2147483640; + return (int32_t) c; + }; for (const auto & p : cloud) { - Key key{ (int32_t) std::floor((p.x - lo[0]) / v), - (int32_t) std::floor((p.y - lo[1]) / v), - (int32_t) std::floor((p.z - lo[2]) / v) }; + if (!finite(p)) continue; + Key key{ vcoord(p.x, lo[0]), vcoord(p.y, lo[1]), vcoord(p.z, lo[2]) }; Vox & vx = grid[key]; vx.sx+=p.x; vx.sy+=p.y; vx.sz+=p.z; vx.sr+=p.r; vx.sg+=p.g; vx.sb+=p.b; vx.cnt++; bool seen = false; From 9850bf32c8cb21aa00c4c2708ba5c289da172ed8 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 16:24:37 +0100 Subject: [PATCH 16/33] demo: accumulating-reconstruction viewer (cloud grows as photos are added) A web demo of the live idea: feed a photo stream, watch one world assemble. - web/accumulate.html: forks the index.html EWA splat renderer to play a SEQUENCE of clouds (the reconstruction from 2 photos, then 3, then 4, ...), with the input photos accumulating in the TOP-RIGHT filmstrip (newest highlighted) as each is folded in. Auto-advances + gentle auto-orbit; ?start=/auto=/ms=/spin= deep-link params. Seeds an unsorted draw order on each step so the cloud paints immediately (the depth-sort worker then refines it). - scripts/make_accumulate_demo.sh: one engine pass over the frames via `free_splatter-cli --accumulate` -> acc_2.splat..acc_N.splat + input thumbnails + manifest.json + the viewer, a self-contained servable dir. - CLI --splat-scale default 0.0015 -> 0.006 of extent (point clouds read as surfaces, not grains); documented in web/README.md. Verified end-to-end on a RealEstate10K clip (8 frames -> 7 steps): headless chromium/SwiftShader renders a coherent dining-room reconstruction that grows 259,702 -> 413,517 splats across the steps, filmstrip populating as designed. Co-Authored-By: Claude Opus 4.8 (1M context) --- scripts/make_accumulate_demo.sh | 53 +++++ tools/free_splatter-cli.cpp | 2 +- web/README.md | 22 ++ web/accumulate.html | 351 ++++++++++++++++++++++++++++++++ 4 files changed, 427 insertions(+), 1 deletion(-) create mode 100755 scripts/make_accumulate_demo.sh create mode 100644 web/accumulate.html diff --git a/scripts/make_accumulate_demo.sh b/scripts/make_accumulate_demo.sh new file mode 100755 index 0000000..4e6ce94 --- /dev/null +++ b/scripts/make_accumulate_demo.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Bake the accumulating-reconstruction demo: run the engine over a stream of +# photos, fold them into one growing world (free_splatter-cli --accumulate), and +# lay out a self-contained web demo — the cloud reconstructed from 2 photos, then +# 3, then 4, …, plus the input thumbnails and the accumulate.html viewer. +# +# MODEL=.cache/freesplatter-scene-f16.gguf \ +# scripts/make_accumulate_demo.sh OUT_DIR FRAME0 FRAME1 FRAME2 ... +# +# Then serve it: cd OUT_DIR && python3 -m http.server 8080 (open index.html) +set -euo pipefail +cd "$(dirname "$0")/.." +ROOT=$(pwd) +MODEL=${MODEL:-.cache/freesplatter-scene-f16.gguf} +MAXSPLATS=${MAXSPLATS:-500000} +DEVICE=${DEVICE:-cpu} +CLI=${CLI:-$ROOT/build/release/bin/free_splatter-cli} + +[ "$#" -ge 3 ] || { echo "usage: $0 OUT_DIR FRAME0 FRAME1 [FRAME2 ...] (>=2 frames)" >&2; exit 1; } +OUT=$1; shift +FRAMES=("$@") +mkdir -p "$OUT" + +# 1) input thumbnails: center-crop square + resize, mirroring the engine's own +# preprocessing so the strip shows exactly the view that was reconstructed. +for i in "${!FRAMES[@]}"; do + ffmpeg -y -loglevel error -i "${FRAMES[$i]}" \ + -vf "crop='min(iw,ih)':'min(iw,ih)',scale=360:360" "$OUT/view_$i.jpg" +done + +# 2) accumulate: one engine pass over the stream writes acc_2.splat, acc_3.splat, +# ... (the cloud after each photo is folded in). +"$CLI" --device "$DEVICE" --accumulate --splat-prefix "$OUT/acc" \ + --max-splats "$MAXSPLATS" "$MODEL" "${FRAMES[@]}" + +# 3) manifest.json: one step per added photo (acc_n.splat + the n thumbnails, +# the last being the one just added). +n=${#FRAMES[@]} +{ + echo '{ "steps": [' + for ((k=2;k<=n;k++)); do + imgs="" + for ((j=0;j "$OUT/manifest.json" + +# 4) the viewer, self-contained next to its assets +cp "$ROOT/web/accumulate.html" "$OUT/index.html" +echo "demo baked -> $OUT" +echo "serve: (cd $OUT && python3 -m http.server 8080) then open http://localhost:8080/" diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp index 683db95..0ad110e 100644 --- a/tools/free_splatter-cli.cpp +++ b/tools/free_splatter-cli.cpp @@ -151,7 +151,7 @@ int main(int argc, char ** argv) { float opac_thr = 5e-3f; long max_splats = 0; bool accumulate = false, fuse = false; - float voxel = 0.02f, splat_scale = 0.0015f; + float voxel = 0.02f, splat_scale = 0.006f; // splat radius as a fraction of cloud extent int fuse_k = 2; std::vector inputs; diff --git a/web/README.md b/web/README.md index 7ca0ec3..094e258 100644 --- a/web/README.md +++ b/web/README.md @@ -24,6 +24,28 @@ python3 -m http.server -d web 8000 # then open http://localhost:8000 Load a different file with `?splat=other.splat`. +## Accumulating-reconstruction demo (`accumulate.html`) + +A second viewer that plays the **growing world** the pose pipeline builds: the +cloud reconstructed from 2 photos, then 3, then 4, … as each new view is folded +in (`free_splatter-cli --accumulate` → cross-run Sim(3) alignment). The input +photos appear one-per-step in the top-right filmstrip, newest highlighted; the +WebGL renderer is the same EWA splatting as `index.html`. + +```sh +# bake the demo (engine pass over a photo stream -> a self-contained demo dir) +nix develop -c scripts/make_accumulate_demo.sh .cache/demo/accumulate \ + frames/f0000.png frames/f0020.png frames/f0040.png frames/f0060.png ... + +# serve it (the bake copies accumulate.html in as index.html) +python3 -m http.server -d .cache/demo/accumulate 8080 # open http://localhost:8080 +``` + +URL params: `?ms=2600` step interval, `?start=N` begin at step N, `?auto=0` no +auto-advance, `?spin=0` no auto-orbit. The cloud shown is the raw accumulated +cloud; `--fuse` also writes a consensus-fused `acc_fused.splat` (the edge-haze +floaters removed). + ## Sanity-checking the data separately The `.splat` uses the antimatter15 layout, so you can verify the *data* (apart diff --git a/web/accumulate.html b/web/accumulate.html new file mode 100644 index 0000000..2fe6b39 --- /dev/null +++ b/web/accumulate.html @@ -0,0 +1,351 @@ + + + + + + +free-splatter.cpp — accumulating reconstruction + + + + +
+

accumulating reconstruction

+
free-splatter.cpp — one world from a growing photo stream.
+
+
photos folded in0
+
splats0
+
fps
+
starting…
+
+
+
+
drag to orbit · scroll to zoom · the cloud grows as each photo is added
+ + + + From ee5b02c3c2a30df148ae5565bfa12e85a45d90d4 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 18:33:37 +0100 Subject: [PATCH 17/33] pose: carry gaussian scale + rotation through accumulation (render as splats) The accumulated cloud kept only position+color, so the demo rendered as an isotropic point cloud. A similarity x->s*(R@x)+t scales a gaussian's covariance by s^2 and rotates it by R, so the shape transforms cleanly: carry it. - AccumPoint / free_splatter_point gain scale[3] + rotation quaternion (w,x,y,z). - Accumulator::add_pair de-interleaves the engine's scale (ch16:19) and rotation (ch19:23); add_view sets scale_world = T.s * scale_local and q_world = quat(T.R) * q_local (new mat3_to_quat / quat_mul / quat_normalize helpers, with a zero/NaN-quaternion -> identity guard). - consensus_fuse averages the scale and keeps a representative orientation. - write_cloud_splat emits the real anisotropic scale + rotation (OpenCV->OpenGL remap, same as the single-run write_splat); --splat-scale is now a radius multiplier (default 1.0), not an isotropic fraction. Verified: anisotropic scale spans the engine's [1e-4,0.02] (mean axis-ratio ~29) with per-gaussian orientations; golden test_pose green; fuzz_pose clean (the identity-quaternion guard keeps the new quat math UB-free on garbage input). Co-Authored-By: Claude Opus 4.8 (1M context) --- include/free_splatter.h | 8 +++-- src/free_splatter.cpp | 2 ++ src/pose.cpp | 59 +++++++++++++++++++++++++++++++++---- src/pose.h | 9 ++++-- tests/test_pose.cpp | 8 +++-- tools/free_splatter-cli.cpp | 36 ++++++++++------------ 6 files changed, 91 insertions(+), 31 deletions(-) diff --git a/include/free_splatter.h b/include/free_splatter.h index 2913c94..f486d56 100644 --- a/include/free_splatter.h +++ b/include/free_splatter.h @@ -93,11 +93,15 @@ FREE_SPLATTER_API int free_splatter_estimate_poses( int32_t gaussian_channels, float opacity_threshold, float * cam2world_out, float * focal_out); -// One accumulated point in the global frame: position, color in [0,1], and the -// source frame it came from (for consensus fusion). +// One accumulated 3D gaussian in the global frame: position, color in [0,1], the +// anisotropic scale and rotation quaternion (w,x,y,z) — transformed through the +// cross-run Sim(3) so the cloud renders as oriented splats — and the source frame +// it came from (for consensus fusion). typedef struct { float x, y, z; float r, g, b; + float sx, sy, sz; + float qw, qx, qy, qz; int32_t frame; } free_splatter_point; diff --git a/src/free_splatter.cpp b/src/free_splatter.cpp index 53fd94f..9dade9d 100644 --- a/src/free_splatter.cpp +++ b/src/free_splatter.cpp @@ -191,6 +191,8 @@ int emit_points(const std::vector & src, for (size_t i = 0; i < src.size(); i++) { (*out)[i].x = src[i].x; (*out)[i].y = src[i].y; (*out)[i].z = src[i].z; (*out)[i].r = src[i].r; (*out)[i].g = src[i].g; (*out)[i].b = src[i].b; + (*out)[i].sx = src[i].sx; (*out)[i].sy = src[i].sy; (*out)[i].sz = src[i].sz; + (*out)[i].qw = src[i].qw; (*out)[i].qx = src[i].qx; (*out)[i].qy = src[i].qy; (*out)[i].qz = src[i].qz; (*out)[i].frame = src[i].frame; } *n_out = src.size(); diff --git a/src/pose.cpp b/src/pose.cpp index f71a8ac..6d94f71 100644 --- a/src/pose.cpp +++ b/src/pose.cpp @@ -854,6 +854,38 @@ PoseResult estimate_poses(const std::vector & points, namespace { inline float clamp01(float v) { return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v); } const double SH_C0 = 0.28209479177387814; // SH degree-0 basis (DC -> rgb) + +// quaternion (w,x,y,z) from a rotation matrix (Shepperd's method) +std::array mat3_to_quat(const Mat3 & R) { + const double tr = R(0,0) + R(1,1) + R(2,2); + double w,x,y,z; + if (tr > 0) { + double s = std::sqrt(tr + 1.0) * 2; + w = 0.25*s; x = (R(2,1)-R(1,2))/s; y = (R(0,2)-R(2,0))/s; z = (R(1,0)-R(0,1))/s; + } else if (R(0,0) > R(1,1) && R(0,0) > R(2,2)) { + double s = std::sqrt(1.0 + R(0,0) - R(1,1) - R(2,2)) * 2; + w = (R(2,1)-R(1,2))/s; x = 0.25*s; y = (R(0,1)+R(1,0))/s; z = (R(0,2)+R(2,0))/s; + } else if (R(1,1) > R(2,2)) { + double s = std::sqrt(1.0 + R(1,1) - R(0,0) - R(2,2)) * 2; + w = (R(0,2)-R(2,0))/s; x = (R(0,1)+R(1,0))/s; y = 0.25*s; z = (R(1,2)+R(2,1))/s; + } else { + double s = std::sqrt(1.0 + R(2,2) - R(0,0) - R(1,1)) * 2; + w = (R(1,0)-R(0,1))/s; x = (R(0,2)+R(2,0))/s; y = (R(1,2)+R(2,1))/s; z = 0.25*s; + } + return {w,x,y,z}; +} +// Hamilton product (w,x,y,z): compose rotation a then... q = a ⊗ b applies b first. +std::array quat_mul(const std::array & a, const std::array & b) { + return { a[0]*b[0] - a[1]*b[1] - a[2]*b[2] - a[3]*b[3], + a[0]*b[1] + a[1]*b[0] + a[2]*b[3] - a[3]*b[2], + a[0]*b[2] - a[1]*b[3] + a[2]*b[0] + a[3]*b[1], + a[0]*b[3] + a[1]*b[2] - a[2]*b[1] + a[3]*b[0] }; +} +std::array quat_normalize(std::array q) { + const double n = std::sqrt(q[0]*q[0]+q[1]*q[1]+q[2]*q[2]+q[3]*q[3]); + if (n > 1e-12) for (auto & c : q) c /= n; else q = {1,0,0,0}; + return q; +} } // namespace Accumulator::Accumulator(int H, int W, double opacity_threshold, @@ -862,8 +894,12 @@ Accumulator::Accumulator(int H, int W, double opacity_threshold, riters_(ransac_iters), seed_(seed), T_(sim_identity()), final_cam_(mat4_identity()) {} void Accumulator::add_view(const float * pts, const float * op, const float * rgb, - const Sim3 & T, int frame) { + const float * scl, const float * rot, const Sim3 & T, int frame) { const int P = H_ * W_; + // A similarity x -> s*(R@x)+t scales each gaussian's covariance by s^2 and + // rotates it by R: so the world-frame axis lengths are s*scale and the world + // rotation is qT ⊗ q_local. Compute qT (from T.R) once for the whole view. + const std::array qT = mat3_to_quat(T.R); for (int i = 0; i < P; i++) { if (op[i] <= thr_) continue; Vec3 x = { pts[3*i], pts[3*i+1], pts[3*i+2] }; @@ -871,6 +907,10 @@ void Accumulator::add_view(const float * pts, const float * op, const float * rg AccumPoint p; p.x = (float) w[0]; p.y = (float) w[1]; p.z = (float) w[2]; p.r = rgb[3*i]; p.g = rgb[3*i+1]; p.b = rgb[3*i+2]; + p.sx = (float) (T.s * scl[3*i]); p.sy = (float) (T.s * scl[3*i+1]); p.sz = (float) (T.s * scl[3*i+2]); + std::array ql = quat_normalize({ rot[4*i], rot[4*i+1], rot[4*i+2], rot[4*i+3] }); + std::array qw = quat_normalize(quat_mul(qT, ql)); + p.qw = (float) qw[0]; p.qx = (float) qw[1]; p.qy = (float) qw[2]; p.qz = (float) qw[3]; p.frame = frame; cloud_.push_back(p); } @@ -881,6 +921,7 @@ ChainLink Accumulator::add_pair(const float * g, int gc, double focal) { // de-interleave the two views: points (ch 0:3), opacity (ch 15, already // sigmoid-activated), rgb = clip(SH_DC * C0 + 0.5, 0, 1) (ch 3:6). std::vector pts0(3*P), pts1(3*P), op0(P), op1(P), rgb0(3*P), rgb1(3*P); + std::vector scl0(3*P), scl1(3*P), rot0(4*P), rot1(4*P); for (int i = 0; i < P; i++) { const float * a0 = g + (size_t) i * gc; const float * a1 = g + (size_t) (P + i) * gc; @@ -890,6 +931,9 @@ ChainLink Accumulator::add_pair(const float * g, int gc, double focal) { rgb0[3*i+c] = clamp01((float) (a0[3+c] * SH_C0 + 0.5)); rgb1[3*i+c] = clamp01((float) (a1[3+c] * SH_C0 + 0.5)); } + // gaussian shape: scale ch16:19, rotation quaternion (w,x,y,z) ch19:23 + for (int c = 0; c < 3; c++) { scl0[3*i+c] = a0[16+c]; scl1[3*i+c] = a1[16+c]; } + for (int c = 0; c < 4; c++) { rot0[4*i+c] = a0[19+c]; rot1[4*i+c] = a1[19+c]; } } // recover this pair's cameras (view-0 frame, use_first_focal) @@ -904,8 +948,8 @@ ChainLink Accumulator::add_pair(const float * g, int gc, double focal) { link.sim = sim_identity(); link.global = T_; link.scale = 1.0; link.inlier_frac = 1.0; link.valid_frac = 1.0; link.resid_frac = 0.0; - add_view(pts0.data(), op0.data(), rgb0.data(), T_, next_frame_++); // f_0 - add_view(pts1.data(), op1.data(), rgb1.data(), T_, next_frame_++); // f_1 + add_view(pts0.data(), op0.data(), rgb0.data(), scl0.data(), rot0.data(), T_, next_frame_++); // f_0 + add_view(pts1.data(), op1.data(), rgb1.data(), scl1.data(), rot1.data(), T_, next_frame_++); // f_1 } else { // fit Sim(3) from this run's view 0 (the shared frame, run-k coords) to the // previous run's view 1 (same frame, run-(k-1) coords). Mask = valid in both. @@ -940,7 +984,7 @@ ChainLink Accumulator::add_pair(const float * g, int gc, double focal) { link.valid_frac = (double) n / P; link.resid_frac = (scene > 0 && n_inl > 0) ? rms3(rA.data(), n_inl) / scene : 0.0; - add_view(pts1.data(), op1.data(), rgb1.data(), T_, next_frame_++); // f_{k+1} + add_view(pts1.data(), op1.data(), rgb1.data(), scl1.data(), rot1.data(), T_, next_frame_++); // f_{k+1} } cams_.push_back(mat4_mul(sim_matrix(T_), c2w0)); // frame f_k camera (view 0) @@ -997,7 +1041,8 @@ FuseStats consensus_fuse(const std::vector & cloud, double voxel_fra const double v = voxel_frac * ext; if (!(v > 0)) return st; - struct Vox { double sx=0,sy=0,sz=0,sr=0,sg=0,sb=0; int64_t cnt=0; std::vector frames; }; + struct Vox { double sx=0,sy=0,sz=0,sr=0,sg=0,sb=0; double ssx=0,ssy=0,ssz=0; int64_t cnt=0; + float q[4]={1,0,0,0}; bool has_q=false; std::vector frames; }; struct Key { int32_t i,j,k; bool operator==(const Key&o) const { return i==o.i&&j==o.j&&k==o.k; } }; struct KeyHash { size_t operator()(const Key&q) const { uint64_t h = (uint64_t)(uint32_t)q.i * 0x9E3779B97F4A7C15ULL; @@ -1019,6 +1064,8 @@ FuseStats consensus_fuse(const std::vector & cloud, double voxel_fra Key key{ vcoord(p.x, lo[0]), vcoord(p.y, lo[1]), vcoord(p.z, lo[2]) }; Vox & vx = grid[key]; vx.sx+=p.x; vx.sy+=p.y; vx.sz+=p.z; vx.sr+=p.r; vx.sg+=p.g; vx.sb+=p.b; vx.cnt++; + vx.ssx+=p.sx; vx.ssy+=p.sy; vx.ssz+=p.sz; + if (!vx.has_q) { vx.q[0]=p.qw; vx.q[1]=p.qx; vx.q[2]=p.qy; vx.q[3]=p.qz; vx.has_q=true; } // representative orientation bool seen = false; for (int32_t f : vx.frames) if (f == p.frame) { seen = true; break; } if (!seen) vx.frames.push_back(p.frame); @@ -1033,6 +1080,8 @@ FuseStats consensus_fuse(const std::vector & cloud, double voxel_fra AccumPoint p; p.x = (float) (vx.sx / vx.cnt); p.y = (float) (vx.sy / vx.cnt); p.z = (float) (vx.sz / vx.cnt); p.r = (float) (vx.sr / vx.cnt); p.g = (float) (vx.sg / vx.cnt); p.b = (float) (vx.sb / vx.cnt); + p.sx = (float) (vx.ssx / vx.cnt); p.sy = (float) (vx.ssy / vx.cnt); p.sz = (float) (vx.ssz / vx.cnt); + p.qw = vx.q[0]; p.qx = vx.q[1]; p.qy = vx.q[2]; p.qz = vx.q[3]; p.frame = (int32_t) vx.frames.size(); // support count (informational) fused.push_back(p); } diff --git a/src/pose.h b/src/pose.h index 7641d1a..d4d96fc 100644 --- a/src/pose.h +++ b/src/pose.h @@ -111,11 +111,15 @@ PoseResult estimate_poses(const std::vector & points, bool normalize = false, uint64_t seed = 0); // ---- accumulation: chain successive runs into one world (mirrors accumulate.py) -// One accumulated point in the global frame, tagged with the source frame it came -// from (consensus fusion needs the frame id). rgb is in [0,1]. +// One accumulated 3D gaussian in the global frame, tagged with the source frame +// it came from (consensus fusion needs the frame id). Carries the full anisotropic +// shape (scale + rotation quaternion w,x,y,z), transformed through the cross-run +// Sim(3), so the cloud renders as oriented splats, not isotropic points. rgb [0,1]. struct AccumPoint { float x, y, z; float r, g, b; + float sx, sy, sz; // anisotropic gaussian scale (global frame) + float qw, qx, qy, qz; // gaussian rotation quaternion (w,x,y,z), global frame int32_t frame; }; @@ -156,6 +160,7 @@ class Accumulator { private: void add_view(const float * pts, const float * op, const float * rgb, + const float * scl, const float * rot, // P*3 scale, P*4 quat (w,x,y,z) const Sim3 & T, int frame); int H_, W_; diff --git a/tests/test_pose.cpp b/tests/test_pose.cpp index 53e5ec9..405409e 100644 --- a/tests/test_pose.cpp +++ b/tests/test_pose.cpp @@ -527,11 +527,15 @@ static void test_consensus_fuse() { std::vector cloud; // 5 surface points, each corroborated by frames {0,1,2} (tight cluster -> one // voxel, support 3); 20 isolated single-frame floaters far apart (support 1). + // AccumPoint: {xyz, rgb, scale[3], quat(w,x,y,z), frame}; scale/quat are + // unused by fusion's counts here, set to a unit gaussian / identity rotation. for (int loc = 0; loc < 5; loc++) for (int fr = 0; fr < 3; fr++) - cloud.push_back({ (float)(2.0*loc) + 0.001f*fr, 0.0f, 0.0f, 0.5f, 0.5f, 0.5f, fr }); + cloud.push_back({ (float)(2.0*loc) + 0.001f*fr, 0.0f, 0.0f, 0.5f, 0.5f, 0.5f, + 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, fr }); for (int j = 0; j < 20; j++) - cloud.push_back({ 40.0f + 2.0f*j, 0.0f, 0.0f, 0.2f, 0.2f, 0.2f, j % 4 }); + cloud.push_back({ 40.0f + 2.0f*j, 0.0f, 0.0f, 0.2f, 0.2f, 0.2f, + 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, j % 4 }); std::vector fused; FuseStats st = consensus_fuse(cloud, /*voxel_frac=*/0.02, /*k=*/2, fused); diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp index 0ad110e..2d64f38 100644 --- a/tools/free_splatter-cli.cpp +++ b/tools/free_splatter-cli.cpp @@ -93,32 +93,28 @@ static bool write_splat(const float * g, size_t n, int gc, float opac_thr, return true; } -// Write an accumulated point cloud (xyz + rgb, no per-point covariance) as an -// antimatter15 .splat, each point a small isotropic gaussian. Caps to max_splats -// by stride-subsampling. scale_frac sets the splat radius as a fraction of the -// cloud's rms extent. +// Write an accumulated gaussian cloud as an antimatter15 .splat, emitting each +// point's true anisotropic scale + rotation (carried through the Sim(3) by the +// accumulator) so it renders as oriented splats, not isotropic dots. scale_mult +// scales the radii (1.0 = as-predicted). Caps to max_splats by stride-subsampling. static bool write_cloud_splat(const free_splatter_point * pts, size_t n, size_t max_splats, - float scale_frac, const char * path) { - // rms extent about the centroid -> per-point isotropic scale - double m[3] = {0,0,0}; - for (size_t i = 0; i < n; i++) { m[0]+=pts[i].x; m[1]+=pts[i].y; m[2]+=pts[i].z; } - if (n) { m[0]/=n; m[1]/=n; m[2]/=n; } - double ss = 0; - for (size_t i = 0; i < n; i++) { const double dx=pts[i].x-m[0], dy=pts[i].y-m[1], dz=pts[i].z-m[2]; ss += dx*dx+dy*dy+dz*dz; } - const float ext = (float) std::sqrt(ss / (n ? n : 1)); - const float scale = std::max(scale_frac * ext, 1e-6f); - + float scale_mult, const char * path) { const size_t stride = (max_splats > 0 && n > max_splats) ? (n + max_splats - 1) / max_splats : 1; std::ofstream f(path, std::ios::binary); if (!f) { std::fprintf(stderr, "cannot write %s\n", path); return false; } auto u8 = [](float v) -> unsigned char { float t = v < 0 ? 0 : v > 255 ? 255 : v; return (unsigned char) t; }; size_t written = 0; for (size_t i = 0; i < n; i += stride) { - // OpenCV (y down, z forward) -> viewer OpenGL (y up): negate y,z. - float pos[3] = { pts[i].x, -pts[i].y, -pts[i].z }; - float sc[3] = { scale, scale, scale }; - unsigned char rgba[4] = { u8(pts[i].r*255.0f), u8(pts[i].g*255.0f), u8(pts[i].b*255.0f), 255 }; - unsigned char rot[4] = { 255, 128, 128, 128 }; // identity quaternion (w,x,y,z) + const free_splatter_point & p = pts[i]; + // OpenCV (y down, z forward) -> viewer OpenGL (y up): negate y,z; quaternion + // (w,x,y,z) -> (-x, w, -z, y) (the same remap as the single-run write_splat). + float pos[3] = { p.x, -p.y, -p.z }; + float sc[3] = { scale_mult*p.sx, scale_mult*p.sy, scale_mult*p.sz }; + unsigned char rgba[4] = { u8(p.r*255.0f), u8(p.g*255.0f), u8(p.b*255.0f), 255 }; + float q[4] = { -p.qx, p.qw, -p.qz, p.qy }; + float nrm = std::sqrt(q[0]*q[0]+q[1]*q[1]+q[2]*q[2]+q[3]*q[3]) + 1e-12f; + unsigned char rot[4]; + for (int c = 0; c < 4; c++) rot[c] = u8(q[c]/nrm * 128.0f + 128.0f); f.write((const char *) pos, 12); f.write((const char *) sc, 12); f.write((const char *) rgba, 4); @@ -151,7 +147,7 @@ int main(int argc, char ** argv) { float opac_thr = 5e-3f; long max_splats = 0; bool accumulate = false, fuse = false; - float voxel = 0.02f, splat_scale = 0.006f; // splat radius as a fraction of cloud extent + float voxel = 0.02f, splat_scale = 1.0f; // multiplier on the predicted gaussian radii int fuse_k = 2; std::vector inputs; From 546046ce79be0ff6eba7bc50b763827309b76990 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 20:58:03 +0100 Subject: [PATCH 18/33] demo: add consensus-fused step + accumulate from cached .f32 pair dumps - free_splatter-cli --accumulate now also accepts pre-computed [2,H,W,gc] .f32 pair dumps (each one pair) instead of images, skipping the engine -- for fast re-bakes and fusion sweeps off cached runs. Verified byte-identical acc_8 to the image path on the same frames, and it runs in ~4s vs ~2min. - make_accumulate_demo.sh passes --fuse and appends a final consensus-fused step to the manifest; the viewer shows a step's optional "label" (so the demo ends on "consensus-fused -- single-view floaters removed", 413k -> 149k splats, the edge-haze floaters gone). Co-Authored-By: Claude Opus 4.8 (1M context) --- scripts/make_accumulate_demo.sh | 15 ++++--- tools/free_splatter-cli.cpp | 69 ++++++++++++++++++++++----------- web/accumulate.html | 3 +- 3 files changed, 58 insertions(+), 29 deletions(-) diff --git a/scripts/make_accumulate_demo.sh b/scripts/make_accumulate_demo.sh index 4e6ce94..4ff775f 100755 --- a/scripts/make_accumulate_demo.sh +++ b/scripts/make_accumulate_demo.sh @@ -29,21 +29,24 @@ for i in "${!FRAMES[@]}"; do done # 2) accumulate: one engine pass over the stream writes acc_2.splat, acc_3.splat, -# ... (the cloud after each photo is folded in). -"$CLI" --device "$DEVICE" --accumulate --splat-prefix "$OUT/acc" \ +# ... (the cloud after each photo is folded in) + acc_fused.splat (the +# consensus surface: voxels seen by >= K frames, single-view floaters removed). +"$CLI" --device "$DEVICE" --accumulate --fuse --splat-prefix "$OUT/acc" \ --max-splats "$MAXSPLATS" "$MODEL" "${FRAMES[@]}" -# 3) manifest.json: one step per added photo (acc_n.splat + the n thumbnails, -# the last being the one just added). +# 3) manifest.json: one step per added photo (acc_n.splat + the n thumbnails, the +# last being the one just added), then a final consensus-fused step. n=${#FRAMES[@]} +allimgs="" +for ((j=0;j "$OUT/manifest.json" diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp index 2d64f38..045d537 100644 --- a/tools/free_splatter-cli.cpp +++ b/tools/free_splatter-cli.cpp @@ -187,35 +187,60 @@ int main(int argc, char ** argv) { geo.image_width, geo.image_height, geo.in_channels, geo.gaussian_channels); if (inputs.empty()) { free_splatter_free(ctx); return 0; } - // ---- accumulate mode: chain consecutive image pairs into one world -------- + // ---- accumulate mode: chain a photo stream into one world ----------------- + // Two input forms: a stream of IMAGES (the engine runs on each consecutive + // pair), or pre-computed `.f32` pair dumps (each a [2,H,W,gc] engine output — + // the engine is skipped, for fast re-bakes / fusion sweeps off cached runs). if (accumulate) { - if (inputs.size() < 2) { std::fprintf(stderr, "--accumulate needs >=2 images\n"); free_splatter_free(ctx); return 2; } - const int sz = geo.image_width; - // decode every frame once (CHW per view) + const int sz = geo.image_width, gc = geo.gaussian_channels; + const bool dump_mode = std::all_of(inputs.begin(), inputs.end(), + [](const std::string & s){ return ends_with(s, ".f32"); }); + if (!dump_mode && inputs.size() < 2) { + std::fprintf(stderr, "--accumulate needs >=2 images (or N .f32 pair dumps)\n"); free_splatter_free(ctx); return 2; } + const size_t npairs = dump_mode ? inputs.size() : inputs.size() - 1; + const int64_t per_view = (int64_t) geo.in_channels * sz * sz; + const size_t pair_floats = (size_t) 2 * sz * sz * gc; + + // image mode: decode every frame once (CHW per view) std::vector> frames; - for (const std::string & p : inputs) { - std::vector img; - if (!load_image_chw(p.c_str(), sz, img)) { free_splatter_free(ctx); return 1; } - frames.push_back(std::move(img)); - } + if (!dump_mode) + for (const std::string & p : inputs) { + std::vector img; + if (!load_image_chw(p.c_str(), sz, img)) { free_splatter_free(ctx); return 1; } + frames.push_back(std::move(img)); + } + free_splatter_accumulator * acc = free_splatter_accumulator_new(sz, sz, opac_thr); if (!acc) { std::fprintf(stderr, "accumulator alloc failed\n"); free_splatter_free(ctx); return 1; } - const int64_t per_view = (int64_t) geo.in_channels * sz * sz; - for (size_t k = 0; k + 1 < frames.size(); k++) { - std::vector pair(2 * per_view); - std::memcpy(&pair[0], frames[k].data(), per_view * sizeof(float)); - std::memcpy(&pair[per_view], frames[k+1].data(), per_view * sizeof(float)); - float * g = nullptr; size_t ng = 0; - if (free_splatter_run(ctx, pair.data(), 2, sz, sz, &g, &ng) != 0) { - std::fprintf(stderr, "run pair %zu failed: %s\n", k, free_splatter_last_error(ctx)); - free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; + for (size_t k = 0; k < npairs; k++) { + const float * g = nullptr; + std::vector dumpbuf; + float * runout = nullptr; + if (dump_mode) { + std::ifstream f(inputs[k], std::ios::binary | std::ios::ate); + if (!f) { std::fprintf(stderr, "cannot open %s\n", inputs[k].c_str()); free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; } + const std::streamsize bytes = f.tellg(); f.seekg(0); + if ((size_t)(bytes / sizeof(float)) != pair_floats) { + std::fprintf(stderr, "dump %s: %zu floats, expected 2*%d*%d*%d\n", inputs[k].c_str(), + (size_t)(bytes/sizeof(float)), sz, sz, gc); + free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; } + dumpbuf.resize(pair_floats); f.read((char *) dumpbuf.data(), bytes); + g = dumpbuf.data(); + } else { + std::vector pair(2 * per_view); + std::memcpy(&pair[0], frames[k].data(), per_view * sizeof(float)); + std::memcpy(&pair[per_view], frames[k+1].data(), per_view * sizeof(float)); + size_t ng = 0; + if (free_splatter_run(ctx, pair.data(), 2, sz, sz, &runout, &ng) != 0) { + std::fprintf(stderr, "run pair %zu failed: %s\n", k, free_splatter_last_error(ctx)); + free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; } + g = runout; } - free_splatter_accumulator_add_pair(acc, g, geo.gaussian_channels); - free_splatter_buf_free(g); + free_splatter_accumulator_add_pair(acc, g, gc); + if (runout) free_splatter_buf_free(runout); const int nframes = free_splatter_accumulator_frame_count(acc); - std::printf("pair %zu (%s, %s) -> %d frames\n", k, - inputs[k].c_str(), inputs[k+1].c_str(), nframes); + std::printf("pair %zu -> %d frames\n", k, nframes); if (splat_prefix) { free_splatter_point * cloud = nullptr; size_t nc = 0; free_splatter_accumulator_cloud(acc, &cloud, &nc); diff --git a/web/accumulate.html b/web/accumulate.html index 2fe6b39..3a74918 100644 --- a/web/accumulate.html +++ b/web/accumulate.html @@ -325,7 +325,8 @@

accumulating reconstruction

const bytes = new Uint8Array(await resp.arrayBuffer()); setSplat(bytes); if(!framed) frameToData(1.25); // frame once; camera stays as it grows - setStatus(`${step.n} photos · ${count.toLocaleString()} splats`); + setStatus(step.label ? `${step.label} · ${count.toLocaleString()} splats` + : `${step.n} photos · ${count.toLocaleString()} splats`); } catch(err){ setStatus(err.message, true); console.error(err); } } function schedule(){ clearTimeout(timer); if(!playing) return; From 9fef9c907c700f008ad8e9115841b250a95d477f Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sat, 27 Jun 2026 21:08:22 +0100 Subject: [PATCH 19/33] demo: make the consensus-fused step unmistakable + defeat stale caching The fused step (8th) showed the same "8 photos" panel as acc_8, so it read as a duplicate. Now a labelled step shows a prominent top-center banner ("consensus- fused -- single-view floaters removed -- N splats"), the stat reads "fused (8)", and manifest/splat fetches use cache:"no-store" so an updated demo dir is never served stale. Co-Authored-By: Claude Opus 4.8 (1M context) --- web/accumulate.html | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/web/accumulate.html b/web/accumulate.html index 3a74918..a3c4003 100644 --- a/web/accumulate.html +++ b/web/accumulate.html @@ -55,6 +55,14 @@ #strip .thumb.new .tag { background: rgba(91,140,255,.92); color: #fff; } #hint { position: fixed; bottom: 14px; left: 50%; transform: translateX(-50%); z-index: 10; color: #6b7280; font-size: 11.5px; background: rgba(16,19,24,.7); padding: 6px 12px; border-radius: 8px; } + /* prominent banner shown only on a labelled (consensus-fused) step */ + #banner { position: fixed; top: 18px; left: 50%; transform: translateX(-50%) translateY(-12px); z-index: 11; + display: none; opacity: 0; transition: opacity .4s ease, transform .4s ease; + padding: 9px 18px; border-radius: 999px; font-size: 13.5px; font-weight: 600; letter-spacing: .2px; + color: #eafff0; background: linear-gradient(90deg, rgba(34,160,90,.95), rgba(46,120,210,.95)); + border: 1px solid rgba(255,255,255,.25); box-shadow: 0 8px 26px rgba(0,0,0,.5); } + #banner.show { display: block; opacity: 1; transform: translateX(-50%) translateY(0); } + #banner b { font-weight: 750; } @@ -70,6 +78,7 @@

accumulating reconstruction

+
drag to orbit · scroll to zoom · the cloud grows as each photo is added
diff --git a/server/web/index.html b/server/web/index.html index 2c0cb09..d2c0d79 100644 --- a/server/web/index.html +++ b/server/web/index.html @@ -73,6 +73,7 @@

Photos → 3D gaussian splat

Drop photos of the same scene from different angles — they must overlap. One feed-forward pass on the GPU (Vulkan).
+
📷 Drag photos here, or click to choose
JPG / PNG · overlapping views
From 1dd5b2f338bdf0ac177570b9e27a05716404e9a9 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Sun, 28 Jun 2026 12:01:10 +0100 Subject: [PATCH 32/33] server: / is a menu of the demo pages (one server, one port) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pages were already one server, but / was the reconstruct page and the baked accumulating demos were being served as separate static instances on other ports. Now / is a landing menu and everything is reachable from this one server: - server/web/index.html: new menu — cards link to /reconstruct.html (photos -> splat), /accumulate.html (accumulating scenes + video upload), /demo.html (storyboard). - the reconstruct page moves to /reconstruct.html (was index.html); each page gets a "‹ all demos" link back to the menu. - the standalone per-scene static servers are obsolete: those scenes are listed by /api/scenes and switched in /accumulate.html, so there's nothing left to run on a separate port. Verified on the single Vulkan server: /, /reconstruct.html, /accumulate.html, /demo.html, /api/scenes all 200; the menu renders three cards linking the pages. Co-Authored-By: Claude Opus 4.8 (1M context) --- server/README.md | 4 + server/main.go | 2 +- server/web/accumulate.html | 1 + server/web/demo.html | 1 + server/web/index.html | 581 +++--------------------------------- server/web/reconstruct.html | 556 ++++++++++++++++++++++++++++++++++ 6 files changed, 606 insertions(+), 539 deletions(-) create mode 100644 server/web/reconstruct.html diff --git a/server/README.md b/server/README.md index a66521c..ec424e4 100644 --- a/server/README.md +++ b/server/README.md @@ -9,6 +9,10 @@ This is a thin Go HTTP server that binds the `free_splatter` C API with engine — the numerical port, validation, and build details live in the repo root `CLAUDE.md`. +One server, one port. `/` is a **menu** linking to the demo pages, all served from +here: `/reconstruct.html` (photos → splat), `/accumulate.html` (accumulating scenes ++ video upload), `/demo.html` (storyboard). + ## Run ```sh diff --git a/server/main.go b/server/main.go index 7cc91ba..981f2a0 100644 --- a/server/main.go +++ b/server/main.go @@ -13,7 +13,7 @@ // POST /api/scene/from-video multipart "video" + "name" -> {job} (gated in-process accumulate) // GET /api/scene/status/{job} -> {state,total,done,kept,scene,error} // GET /scenes-assets/... -> a scene's manifest.json + .splat + thumbnails -// GET / , /accumulate.html -> embedded frontend (reconstruct page + scene viewer) +// GET / -> menu; /reconstruct.html, /accumulate.html, /demo.html are the pages package main import ( diff --git a/server/web/accumulate.html b/server/web/accumulate.html index e45c8e3..5b5365a 100644 --- a/server/web/accumulate.html +++ b/server/web/accumulate.html @@ -82,6 +82,7 @@
+ ‹ all demos

accumulating reconstruction

free-splatter.cpp — one world from a growing photo stream.
diff --git a/server/web/demo.html b/server/web/demo.html index f0549c2..ab78579 100644 --- a/server/web/demo.html +++ b/server/web/demo.html @@ -45,6 +45,7 @@
+ ‹ all demos

free-splatter.cpp · demo render

loading storyboard…
diff --git a/server/web/index.html b/server/web/index.html index d2c0d79..cc73877 100644 --- a/server/web/index.html +++ b/server/web/index.html @@ -1,555 +1,60 @@ -free-splatter.cpp — photos to gaussian splat +free-splatter.cpp — demos - - -
-
-

Photos → 3D gaussian splat

- -
Drop photos of the same scene from different angles — they must overlap. One feed-forward pass on the GPU (Vulkan).
- -
-
📷 Drag photos here, or click to choose
-
JPG / PNG · overlapping views
-
- -
- -
- +
+

free-splatter.cpp

+

FreeSplatter, in C++/GGML on the GPU — pick a demo.

-
- -
-

free-splatter.cpp

-
3D gaussian scene from your photos.
-
views
-
splats0
-
reconstruct
-
fps
-
render100%
- - -
- - -
drag to orbit · scroll to zoom · WASD/arrows to move
- - +
one server · Vulkan engine · WebGL render
diff --git a/server/web/reconstruct.html b/server/web/reconstruct.html new file mode 100644 index 0000000..9ab2a73 --- /dev/null +++ b/server/web/reconstruct.html @@ -0,0 +1,556 @@ + + + + + + +free-splatter.cpp — photos to gaussian splat + + + + + +
+
+ ‹ all demos +

Photos → 3D gaussian splat

+ +
Drop photos of the same scene from different angles — they must overlap. One feed-forward pass on the GPU (Vulkan).
+
+
📷 Drag photos here, or click to choose
+
JPG / PNG · overlapping views
+
+ +
+ +
+ +
+
+ +
+ ‹ all demos +

free-splatter.cpp

+
3D gaussian scene from your photos.
+
views
+
splats0
+
reconstruct
+
fps
+
render100%
+ + +
+
+
build-up100%
+ +
+ + +
+
+
+ +
drag to orbit · scroll to zoom · WASD/arrows to move
+ + + + From f045b948e8def8e664e276f98176042610f5f2f7 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Mon, 29 Jun 2026 09:14:00 +0100 Subject: [PATCH 33/33] Tree-merge accumulation + vibrance slider for the accumulate demo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a batch hierarchical (balanced binary tree) alternative to the linear Sim(3) accumulator, to cut the registration drift that smears the far end of a long photo stream. Adjacent submaps of `block` frames overlap by `overlap` frames and each merge fits one Sim(3) over all shared frames at once, so the fit is over-determined and per-frame depth noise averages out — drift compounds over ~log2(N) hops instead of N. block=2,overlap=1 is the plain overlap-by-one tree. - Engine (src/pose.{h,cpp}): tree_accumulate_overlap + consensus_fuse of an arbitrary frame-tagged cloud (best-frame mode de-ghosts the overlapping per-frame copies); shared fit_band_sim RANSAC registration primitive. - C API: free_splatter_tree_overlap (full merge or staged side-by-side via block/overlap/max_levels/layout_spacing/per_node_cap) + free_splatter_fuse_cloud. - CLI: --accumulate-tree (--tree-block/--tree-overlap, --fuse) and --tree-stages, which bakes one laid-out .splat per merge level + a consensus-fused final stage and a manifest the viewer steps through. - Viewer (accumulate.html): vibrance/saturation slider (?sat= override), ported from reconstruct.html; reframes per stage for tree-stages manifests. Co-Authored-By: Claude Opus 4.8 (1M context) --- include/free_splatter.h | 32 ++++++ server/web/accumulate.html | 23 +++- src/free_splatter.cpp | 62 +++++++++- src/pose.cpp | 220 ++++++++++++++++++++++++++++++++++++ src/pose.h | 40 +++++++ tools/free_splatter-cli.cpp | 104 ++++++++++++++++- 6 files changed, 477 insertions(+), 4 deletions(-) diff --git a/include/free_splatter.h b/include/free_splatter.h index d27c8f4..22ea889 100644 --- a/include/free_splatter.h +++ b/include/free_splatter.h @@ -171,6 +171,38 @@ FREE_SPLATTER_API int free_splatter_refine_cloud( FREE_SPLATTER_API int free_splatter_accumulator_refine( free_splatter_accumulator * acc, float voxel_frac, int32_t iters, float alpha); +// Consensus-fuse an arbitrary point cloud (e.g. a tree root) using each point's +// `frame` tag: keep voxels (size voxel_frac*extent) seen by >= k distinct frames, +// emitting per `mode` (0 averaged, 1 kept, 2 best-frame — dense AND de-ghosted). +// *out malloc'd (free with free_splatter_buf_free). Returns 0 on success. +FREE_SPLATTER_API int free_splatter_fuse_cloud( + const free_splatter_point * pts, size_t n, float voxel_frac, int32_t k, int32_t mode, + free_splatter_point ** out, size_t * n_out); + +// Hierarchical (balanced binary tree) accumulation — batch alternative to the +// linear accumulator. pairs[k] is one engine output [2*H*W*gc] (pair k = frames +// k,k+1; pair k's view-1 and pair k+1's view-0 are the same frame). Adjacent +// submaps of `block` frames overlap by `overlap` frames, and each merge fits its +// Sim(3) over all shared frames at once (over-determined -> averages per-frame +// noise) — so drift compounds over ~log2(N) hops, not N, and is spread evenly +// instead of dumped on the last frame. block=2,overlap=1 is the plain +// overlap-by-one (single shared boundary) tree. +// +// The remaining args are for visualising the merge side by side (pass max_levels=-1, +// layout_spacing=0, per_node_cap=0, n_nodes=NULL for a plain full merge to the root): +// max_levels stops after that many rounds (-1 = full), so 0 returns the independent +// base submaps, 1 the first merge round, …; step 0..D to watch the tree collapse to +// one scene. The remaining nodes are laid out side by side (layout_spacing: 0 = none, +// <0 = auto, >0 = explicit units) and each capped to its most important +// (opacity*radius) points (per_node_cap>0) so every scene renders at its own detail. +// *n_nodes (optional) gets the remaining node count (1 once fully merged). *out +// malloc'd as free_splatter_point[*n_out] (free with free_splatter_buf_free). 0 on success. +FREE_SPLATTER_API int free_splatter_tree_overlap( + const float * const * pairs, int32_t n_pairs, int32_t gaussian_channels, + int32_t height, int32_t width, float opacity_threshold, int32_t block, int32_t overlap, + int32_t max_levels, float layout_spacing, int32_t per_node_cap, + free_splatter_point ** out, size_t * n_out, int32_t * n_nodes); + #ifdef __cplusplus } #endif diff --git a/server/web/accumulate.html b/server/web/accumulate.html index 5b5365a..5b6692e 100644 --- a/server/web/accumulate.html +++ b/server/web/accumulate.html @@ -91,6 +91,9 @@

accumulating reconstruction

fps
starting…
+
@@ -184,13 +187,19 @@

accumulating reconstruction

}`; const FRAG = `#version 300 es precision highp float; +uniform float u_sat; // saturation/vibrance boost (1.0 = as-decoded) in vec4 v_color; in vec2 v_pos; out vec4 o; void main() { float p = -dot(v_pos, v_pos); if (p < -4.0) discard; float alpha = exp(p) * v_color.a; - o = vec4(v_color.rgb * alpha, alpha); + // SH-DC-only color is flat/desaturated; push each splat's color away from its + // luma before the premultiplied blend (the blend itself also greys overlaps). + vec3 rgb = v_color.rgb; + float luma = dot(rgb, vec3(0.2126, 0.7152, 0.0722)); + rgb = clamp(mix(vec3(luma), rgb, u_sat), 0.0, 1.0); + o = vec4(rgb * alpha, alpha); }`; function shader(type, src){ const s=gl.createShader(type); gl.shaderSource(s,src); gl.compileShader(s); if(!gl.getShaderParameter(s,gl.COMPILE_STATUS)) throw new Error(gl.getShaderInfoLog(s)); return s; } @@ -212,6 +221,7 @@

accumulating reconstruction

const U = { proj: gl.getUniformLocation(prog,"u_proj"), view: gl.getUniformLocation(prog,"u_view"), focal: gl.getUniformLocation(prog,"u_focal"), viewport: gl.getUniformLocation(prog,"u_viewport"), + sat: gl.getUniformLocation(prog,"u_sat"), }; gl.uniform1i(gl.getUniformLocation(prog,"u_tex"), 0); gl.disable(gl.DEPTH_TEST); @@ -312,6 +322,12 @@

accumulating reconstruction

function resize(){ const dpr=Math.min(devicePixelRatio,2); canvas.width=innerWidth*dpr; canvas.height=innerHeight*dpr; gl.viewport(0,0,canvas.width,canvas.height); } addEventListener("resize", resize); resize(); +// vibrance (saturation) — cosmetic, demo-layer only; ?sat= overrides so headless/ +// video renders match what you dialed in. Same control as reconstruct.html. +let saturation = parseFloat(_q.get("sat") ?? "1.45"); +{ const vib = $("vibrance"); vib.value = saturation; + vib.addEventListener("input", e => { saturation = +e.target.value; }); } + let frames=0, fpsT=performance.now(), lastT=performance.now(); function render(){ const now=performance.now(), dt=(now-lastT)/1000; lastT=now; @@ -326,6 +342,7 @@

accumulating reconstruction

gl.useProgram(prog); gl.uniformMatrix4fv(U.view,false,view); gl.uniformMatrix4fv(U.proj,false,proj); gl.uniform2f(U.focal,fx,fy); gl.uniform2f(U.viewport,canvas.width,canvas.height); + gl.uniform1f(U.sat, saturation); gl.bindBuffer(gl.ARRAY_BUFFER,quadBuf); gl.enableVertexAttribArray(aCorner); gl.vertexAttribPointer(aCorner,2,gl.FLOAT,false,0,0); if(sortedCount>0){ gl.bindBuffer(gl.ARRAY_BUFFER,idxBuf); gl.enableVertexAttribArray(aIndex); @@ -363,7 +380,9 @@

accumulating reconstruction

if(!resp.ok) throw new Error("HTTP "+resp.status+" for "+step.splat); const bytes = new Uint8Array(await resp.arrayBuffer()); setSplat(bytes); - if(!framed) frameToData(0.9); // frame once; camera stays as it grows + // frame once (camera stays put as the cloud grows); but a tree-stages manifest + // sets "reframe":true so each stage (very different extents) refits the view. + if(!framed || (manifest && manifest.reframe)) frameToData(0.9); if (step.label) { banner.innerHTML = `✓ ${step.label} · ${count.toLocaleString()} splats`; setStatus(`${step.label} · ${count.toLocaleString()} splats`); diff --git a/src/free_splatter.cpp b/src/free_splatter.cpp index ab7462a..14b8595 100644 --- a/src/free_splatter.cpp +++ b/src/free_splatter.cpp @@ -5,6 +5,8 @@ #include "options.h" #include "pose.h" +#include +#include #include #include #include @@ -261,7 +263,20 @@ int free_splatter_accumulator_add_pair(free_splatter_accumulator * acc, const fl int32_t gaussian_channels) { if (!acc || !gaussians || gaussian_channels < 16) return -1; acc->gc = gaussian_channels; - acc->acc.add_pair(gaussians, gaussian_channels); + const free_splatter::pose::ChainLink link = acc->acc.add_pair(gaussians, gaussian_channels); + if (std::getenv("FREE_SPLATTER_DEBUG_LINKS")) { + // Surface the per-link registration diagnostics + the COMPOUNDED global drift + // (open-loop chain T_k = T_{k-1}.sim, so per-link error accumulates). + const auto & g = link.global; + const double tn = std::sqrt(g.t[0]*g.t[0] + g.t[1]*g.t[1] + g.t[2]*g.t[2]); + double c = (g.R(0,0) + g.R(1,1) + g.R(2,2) - 1.0) * 0.5; + c = c > 1.0 ? 1.0 : (c < -1.0 ? -1.0 : c); + std::fprintf(stderr, + "[link] pair=%d local: s=%.4f inlier=%.3f valid=%.3f resid=%.4f | " + "global: s=%.4f rot=%.2fdeg |t|=%.4f\n", + acc->acc.pair_count(), link.sim.s, link.inlier_frac, link.valid_frac, + link.resid_frac, g.s, std::acos(c) * 57.29577951308232, tn); + } return 0; } @@ -306,6 +321,51 @@ int free_splatter_accumulator_refine(free_splatter_accumulator * acc, float voxe return 0; } +int free_splatter_fuse_cloud(const free_splatter_point * pts, size_t n, float voxel_frac, + int32_t k, int32_t mode, free_splatter_point ** out, size_t * n_out) { + if (out) *out = nullptr; + if (n_out) *n_out = 0; + if (!pts || !out || !n_out || k < 1 || !(voxel_frac > 0)) return -1; + std::vector cloud(n); + for (size_t i = 0; i < n; i++) { + const free_splatter_point & p = pts[i]; free_splatter::pose::AccumPoint & a = cloud[i]; + a.x = p.x; a.y = p.y; a.z = p.z; a.r = p.r; a.g = p.g; a.b = p.b; a.opacity = p.opacity; + a.sx = p.sx; a.sy = p.sy; a.sz = p.sz; a.qw = p.qw; a.qx = p.qx; a.qy = p.qy; a.qz = p.qz; + a.frame = p.frame; + } + std::vector fused; + free_splatter::pose::consensus_fuse(cloud, voxel_frac, k, fused, mode); + return emit_points(fused, out, n_out); +} + +int free_splatter_tree_overlap(const float * const * pairs, int32_t n_pairs, int32_t gaussian_channels, + int32_t height, int32_t width, float opacity_threshold, + int32_t block, int32_t overlap, + int32_t max_levels, float layout_spacing, int32_t per_node_cap, + free_splatter_point ** out, size_t * n_out, int32_t * n_nodes) { + if (out) *out = nullptr; + if (n_out) *n_out = 0; + if (n_nodes) *n_nodes = 0; + if (!pairs || n_pairs < 1 || gaussian_channels < 16 || height < 1 || width < 1 || !out || !n_out) + return -1; + for (int32_t i = 0; i < n_pairs; i++) if (!pairs[i]) return -1; + std::vector ps(pairs, pairs + n_pairs); + std::vector merges; + int nn = 0; + std::vector cloud = + free_splatter::pose::tree_accumulate_overlap(ps, height, width, gaussian_channels, opacity_threshold, + block, overlap, 0.02, 300, 0, &merges, + max_levels, layout_spacing, &nn, per_node_cap); + if (n_nodes) *n_nodes = nn; + if (std::getenv("FREE_SPLATTER_DEBUG_LINKS")) + for (const auto & m : merges) + std::fprintf(stderr, + "[tree] L%d range[%d..%d] sharedframes=%d s=%.4f inlier=%.3f valid=%.3f resid=%.4f\n", + m.level, m.lo_frame, m.hi_frame, m.shared_frame, + m.scale, m.inlier_frac, m.valid_frac, m.resid_frac); + return emit_points(cloud, out, n_out); +} + int free_splatter_accumulator_camera_path(const free_splatter_accumulator * acc, float ** out, int32_t * n_frames) { if (out) *out = nullptr; diff --git a/src/pose.cpp b/src/pose.cpp index 15a15d7..7ba152e 100644 --- a/src/pose.cpp +++ b/src/pose.cpp @@ -1078,6 +1078,226 @@ std::vector Accumulator::camera_path() const { return path; } +// ---- hierarchical tree accumulation --------------------------------------- + +// One gaussian (gc-channel engine layout) -> AccumPoint at identity (leaf frame). +static AccumPoint tree_leaf_point(const float * a, int frame) { + AccumPoint p; + p.x = a[0]; p.y = a[1]; p.z = a[2]; + p.r = clamp01((float) (a[3] * SH_C0 + 0.5)); + p.g = clamp01((float) (a[4] * SH_C0 + 0.5)); + p.b = clamp01((float) (a[5] * SH_C0 + 0.5)); + p.opacity = a[15]; + p.sx = a[16]; p.sy = a[17]; p.sz = a[18]; + std::array q = quat_normalize({ a[19], a[20], a[21], a[22] }); + p.qw = (float) q[0]; p.qx = (float) q[1]; p.qy = (float) q[2]; p.qz = (float) q[3]; + p.frame = frame; + return p; +} + +// Apply a Sim(3) to a whole cloud (position s*(R@x)+t, scale *s, orientation qS⊗q). +static void tree_apply_sim_cloud(std::vector & c, const Sim3 & S) { + const std::array qS = mat3_to_quat(S.R); + for (auto & p : c) { + Vec3 w = sim_apply(S, { p.x, p.y, p.z }); + p.x = (float) w[0]; p.y = (float) w[1]; p.z = (float) w[2]; + p.sx = (float) (S.s * p.sx); p.sy = (float) (S.s * p.sy); p.sz = (float) (S.s * p.sz); + std::array q = quat_normalize(quat_mul(qS, { p.qw, p.qx, p.qy, p.qz })); + p.qw = (float) q[0]; p.qx = (float) q[1]; p.qy = (float) q[2]; p.qz = (float) q[3]; + } +} +static void tree_apply_sim_pts(std::vector & pts, const Sim3 & S) { + for (size_t i = 0; i + 3 <= pts.size(); i += 3) { + Vec3 w = sim_apply(S, { pts[i], pts[i+1], pts[i+2] }); + pts[i] = (float) w[0]; pts[i+1] = (float) w[1]; pts[i+2] = (float) w[2]; + } +} + +// Robustly register one band of shared-frame correspondences (XA -> XB, both flat +// 3*n): RANSAC similarity at a scene-relative threshold, plus the inlier residual. +// The single registration primitive for both tree merges (caller derives the +// inlier/valid fractions, which it normalises differently per tree). +struct BandFit { Sim3 S; int n_inl; double scene; double resid; }; +static BandFit fit_band_sim(const std::vector & XA, const std::vector & XB, + double rthr, int riters, uint64_t seed) { + const int n = (int) (XA.size() / 3); + BandFit f{ sim_identity(), 0, 0.0, 0.0 }; + if (n < 3) return f; + double mB[3]; mean3(XB.data(), n, mB); + std::vector yc(3 * n); + for (int k = 0; k < n; k++) for (int c = 0; c < 3; c++) yc[3*k+c] = XB[3*k+c] - mB[c]; + f.scene = rms3(yc.data(), n); + std::vector inl; + f.S = fit_similarity_ransac(XA.data(), XB.data(), n, rthr * f.scene, riters, inl, true, seed); + std::vector rA; + for (int k = 0; k < n; k++) if (inl[k]) { + f.n_inl++; + Vec3 sa = sim_apply(f.S, { XA[3*k], XA[3*k+1], XA[3*k+2] }); + for (int c = 0; c < 3; c++) rA.push_back(sa[c] - XB[3*k+c]); + } + f.resid = (f.scene > 0 && f.n_inl > 0) ? rms3(rA.data(), f.n_inl) / f.scene : 0.0; + return f; +} + +// ---- multi-frame-overlap tree --------------------------------------------- + +// A submap covering a contiguous global frame range, carrying the per-pixel point +// arrays of every frame it spans (in the submap's local frame) so a neighbour can +// align on the whole band of frames they share, not just one. +struct ONode { + std::vector cloud; + int lo, hi; // inclusive global frame range + std::vector> fpts; // [f-lo] -> P*3 per-pixel points + std::vector> fmsk; // [f-lo] -> P opacity mask +}; + +// One pair (frames a, a+1) as a 2-frame submap (identity local frame). +static ONode opair(const float * g, int a, int P, int gc, double thr) { + ONode nd; nd.lo = a; nd.hi = a + 1; + nd.fpts.assign(2, {}); nd.fmsk.assign(2, {}); + nd.fpts[0].resize(3*P); nd.fpts[1].resize(3*P); + nd.fmsk[0].assign(P, 0); nd.fmsk[1].assign(P, 0); + nd.cloud.reserve(2*P); + for (int i = 0; i < P; i++) { + const float * a0 = g + (size_t) i * gc; + const float * a1 = g + (size_t) (P + i) * gc; + for (int c = 0; c < 3; c++) { nd.fpts[0][3*i+c] = a0[c]; nd.fpts[1][3*i+c] = a1[c]; } + nd.fmsk[0][i] = (a0[15] > thr) ? 1 : 0; + nd.fmsk[1][i] = (a1[15] > thr) ? 1 : 0; + if (a0[15] > thr) nd.cloud.push_back(tree_leaf_point(a0, a)); + if (a1[15] > thr) nd.cloud.push_back(tree_leaf_point(a1, a + 1)); + } + return nd; +} + +// Merge R into L's frame, fitting one Sim(3) over EVERY frame they share. Returns +// the combined submap; optionally records the merge diagnostics. +static ONode omerge(ONode & L, ONode & R, int P, double rthr, int riters, uint64_t seed, + std::vector * merges, int level) { + const int so = std::max(L.lo, R.lo), sh = std::min(L.hi, R.hi); // shared frame range + std::vector XA, XB; + for (int f = so; f <= sh; f++) { + const std::vector & ra = R.fpts[f - R.lo]; const std::vector & rm = R.fmsk[f - R.lo]; + const std::vector & la = L.fpts[f - L.lo]; const std::vector & lm = L.fmsk[f - L.lo]; + for (int i = 0; i < P; i++) if (rm[i] && lm[i]) { + XA.push_back(ra[3*i]); XA.push_back(ra[3*i+1]); XA.push_back(ra[3*i+2]); + XB.push_back(la[3*i]); XB.push_back(la[3*i+1]); XB.push_back(la[3*i+2]); + } + } + const int n = (int) (XA.size() / 3); + const int shared = sh - so + 1; + BandFit fit = fit_band_sim(XA, XB, rthr, riters, seed); + const Sim3 & S = fit.S; + if (merges) merges->push_back(TreeMerge{ level, std::min(L.lo,R.lo), std::max(L.hi,R.hi), shared, + S.s, (n > 0) ? (double) fit.n_inl / n : 0.0, (double) n / ((double) shared * P), fit.resid }); + + tree_apply_sim_cloud(R.cloud, S); + for (auto & a : R.fpts) tree_apply_sim_pts(a, S); + ONode m; m.lo = std::min(L.lo, R.lo); m.hi = std::max(L.hi, R.hi); + const int F = m.hi - m.lo + 1; + m.fpts.assign(F, {}); m.fmsk.assign(F, {}); + for (int f = L.lo; f <= L.hi; f++) { m.fpts[f-m.lo] = std::move(L.fpts[f-L.lo]); m.fmsk[f-m.lo] = std::move(L.fmsk[f-L.lo]); } + for (int f = R.lo; f <= R.hi; f++) if (m.fpts[f-m.lo].empty()) { m.fpts[f-m.lo] = std::move(R.fpts[f-R.lo]); m.fmsk[f-m.lo] = std::move(R.fmsk[f-R.lo]); } + m.cloud = std::move(L.cloud); + m.cloud.insert(m.cloud.end(), R.cloud.begin(), R.cloud.end()); + return m; +} + +std::vector tree_accumulate_overlap(const std::vector & pairs, + int H, int W, int gc, double thr, + int block, int overlap, + double rthr, int riters, uint64_t seed, + std::vector * merges, + int max_levels, double layout_spacing, + int * n_nodes_out, int per_node_cap) { + const int P = H * W; + const int M = (int) pairs.size(); + if (n_nodes_out) *n_nodes_out = 0; + if (M < 1) return {}; + const int nframes = M + 1; + overlap = std::max(1, overlap); + block = std::max(block, overlap + 1); + block = std::min(block, nframes); + overlap = std::min(overlap, block - 1); + const int step = std::max(1, block - overlap); + + // overlapping base submaps: each chains `block-1` consecutive pairs (single-frame + // internal links), adjacent submaps share `overlap` frames. + std::vector nodes; + for (int s = 0; s < M; s += step) { + const int w = std::min(block, nframes - s); // frames in this submap + if (w < 2) break; + ONode nd = opair(pairs[s], s, P, gc, thr); // frames s, s+1 + for (int p = s + 1; p <= s + w - 2; p++) { // fold the rest of the block + ONode nx = opair(pairs[p], p, P, gc, thr); + nd = omerge(nd, nx, P, rthr, riters, seed, nullptr, -1); + } + nodes.push_back(std::move(nd)); + if (s + block >= nframes) break; // last submap reached the end + } + + // median base-submap extent — the auto layout pitch for side-by-side stages. + // Only the auto-layout (layout_spacing<0, staged demo) path reads it, so the + // full-merge callers skip these two passes over every submap. + double leaf_extent = 1.0; + if (layout_spacing < 0.0) { + std::vector ex; + for (auto & nd : nodes) { + double m[3] = {0,0,0}; int64_t nf = 0; + for (auto & p : nd.cloud) if (std::isfinite(p.x) && std::isfinite(p.y) && std::isfinite(p.z)) { + m[0]+=p.x; m[1]+=p.y; m[2]+=p.z; nf++; } + if (nf == 0) continue; + for (int c = 0; c < 3; c++) m[c] /= nf; + double ss = 0; + for (auto & p : nd.cloud) if (std::isfinite(p.x) && std::isfinite(p.y) && std::isfinite(p.z)) { + double dx=p.x-m[0], dy=p.y-m[1], dz=p.z-m[2]; ss += dx*dx+dy*dy+dz*dz; } + ex.push_back(std::sqrt(ss / nf)); + } + if (!ex.empty()) { std::sort(ex.begin(), ex.end()); leaf_extent = ex[ex.size()/2]; } + } + + // balanced tree-merge of submaps; each merge fits over the overlap band. Stop + // after max_levels rounds (-1 = fully to the root) for the staged demo. + int level = 0; + while (nodes.size() > 1 && (max_levels < 0 || level < max_levels)) { + std::vector nxt; + nxt.reserve(nodes.size()/2 + 1); + for (size_t i = 0; i + 1 < nodes.size(); i += 2) + nxt.push_back(omerge(nodes[i], nodes[i+1], P, rthr, riters, seed, merges, level)); + if (nodes.size() % 2 == 1) nxt.push_back(std::move(nodes.back())); + nodes = std::move(nxt); level++; + } + if (n_nodes_out) *n_nodes_out = (int) nodes.size(); + if (nodes.empty()) return {}; + + // concatenate remaining nodes, laid out side by side (layout_spacing != 0) and + // capped per node (opacity*radius) — same as tree_accumulate. + const double sigma = (layout_spacing < 0.0) ? 4.0 * leaf_extent : layout_spacing; + const double gcentre = 0.5 * M; + auto importance = [](const AccumPoint & p) -> double { + const double vol = (double) std::max(p.sx,1e-9f) * std::max(p.sy,1e-9f) * std::max(p.sz,1e-9f); + return (double) std::max(p.opacity, 0.0f) * std::cbrt(vol); + }; + std::vector out; + size_t total = 0; + for (auto & nd : nodes) + total += (per_node_cap > 0) ? std::min((size_t) per_node_cap, nd.cloud.size()) : nd.cloud.size(); + out.reserve(total); + for (auto & nd : nodes) { + const double dx = (sigma != 0.0) ? (0.5 * (nd.lo + nd.hi) - gcentre) * sigma : 0.0; + if (per_node_cap > 0 && (int) nd.cloud.size() > per_node_cap) { // keep each scene's own detail + std::vector idx(nd.cloud.size()); + for (size_t i = 0; i < idx.size(); i++) idx[i] = i; + std::partial_sort(idx.begin(), idx.begin() + per_node_cap, idx.end(), + [&](size_t a, size_t b){ return importance(nd.cloud[a]) > importance(nd.cloud[b]); }); + for (int i = 0; i < per_node_cap; i++) { AccumPoint p = nd.cloud[idx[i]]; p.x = (float) (p.x + dx); out.push_back(p); } + } else { + for (const auto & src : nd.cloud) { AccumPoint p = src; p.x = (float) (p.x + dx); out.push_back(p); } + } + } + return out; +} + // ---- consensus fusion ----------------------------------------------------- FuseStats consensus_fuse(const std::vector & cloud, double voxel_frac, int k, diff --git a/src/pose.h b/src/pose.h index ba82a07..dd0e50a 100644 --- a/src/pose.h +++ b/src/pose.h @@ -206,6 +206,46 @@ class Accumulator { int next_frame_ = 0; }; +// ---- hierarchical tree accumulation (balanced binary merge) ---------------- +// Linear chaining composes N Sim(3)s in series, so registration drift compounds +// over N hops and piles onto the last frame (the far end smears). This instead +// builds short submaps and merges them pairwise up a balanced binary tree — so +// drift compounds over only ~log2(N) hops and is spread evenly rather than dumped +// on the end. +struct TreeMerge { // diagnostics per merge (defs as ChainLink) + int level, lo_frame, hi_frame, shared_frame; + double scale, inlier_frac, valid_frac, resid_frac; +}; + +// Multi-frame-overlap tree. pairs[k]: one engine output [2*H*W*gc] (pair k = frames +// k,k+1; pair k's view-1 and pair k+1's view-0 are the same physical frame). +// Adjacent submaps of `block` frames overlap by `overlap` frames and each merge fits +// its Sim(3) over ALL shared frames at once — the fit is over-determined, so the +// per-frame depth-inconsistency noise (the reason a single-frame alignment is only +// ~13% rigid-consistent) averages out and the compounded drift drops. Base submaps +// are short windows of `block` frames (block > overlap), chained internally by +// single-frame links, then merged in a balanced tree on their overlap bands. The +// degenerate block=2,overlap=1 is the plain overlap-by-one (single shared boundary) +// tree. Returns the merged root cloud in the first submap's frame; `merges` +// (optional) gets one entry per merge. +// +// For the staged side-by-side demo: max_levels caps the merge rounds (-1 = full): +// 0 returns the independent base submaps, 1 the first merge round, ... . The +// remaining (>1) nodes are shifted apart along X by their frame-range midpoint when +// layout_spacing != 0 (0 = none, <0 = auto 4x median submap extent, >0 = that +// spacing), each capped to its top per_node_cap points by opacity*radius (0 = +// uncapped) so every laid-out scene renders at its own detail. n_nodes_out +// (optional) reports how many nodes remain (1 once fully merged). +std::vector tree_accumulate_overlap(const std::vector & pairs, + int H, int W, int gaussian_channels, + double opacity_threshold = 0.05, + int block = 4, int overlap = 2, + double ransac_thresh_frac = 0.02, + int ransac_iters = 300, uint64_t seed = 0, + std::vector * merges = nullptr, + int max_levels = -1, double layout_spacing = 0.0, + int * n_nodes_out = nullptr, int per_node_cap = 0); + // ---- consensus fusion (mirrors fuse.py) ----------------------------------- struct FuseStats { int64_t raw_points, voxels, kept_voxels, points_kept, points_dropped; diff --git a/tools/free_splatter-cli.cpp b/tools/free_splatter-cli.cpp index af96083..36aee40 100644 --- a/tools/free_splatter-cli.cpp +++ b/tools/free_splatter-cli.cpp @@ -151,7 +151,8 @@ int main(int argc, char ** argv) { const char * model = nullptr, * splat_prefix = nullptr; float opac_thr = 5e-3f; long max_splats = 0; - bool accumulate = false, fuse = false, parallax = false; + bool accumulate = false, fuse = false, parallax = false, tree = false, tree_stages = false; + int tov_block = 4, tov_overlap = 2; // tree: submap width / shared frames (2/1 = overlap-by-one) int fuse_mode = 1; // 0 averaged, 1 kept, 2 best-frame float voxel = 0.02f, splat_scale = 1.0f; // multiplier on the predicted gaussian radii int fuse_k = 2; @@ -168,6 +169,10 @@ int main(int argc, char ** argv) { else if (a == "--opacity-threshold" && i+1> tree_pairs; + // emit the accumulated cloud after a pair is folded in (optional de-ghost) auto emit_step = [&](int nframes) { if (!splat_prefix) return; @@ -300,6 +309,7 @@ int main(int argc, char ** argv) { (size_t)(bytes/sizeof(float)), sz, sz, gc); free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; } std::vector dumpbuf(pair_floats); f.read((char *) dumpbuf.data(), bytes); + if (tree || tree_stages) { tree_pairs.push_back(std::move(dumpbuf)); std::printf("pair %zu collected\n", k); continue; } free_splatter_accumulator_add_pair(acc, dumpbuf.data(), gc); const int nframes = free_splatter_accumulator_frame_count(acc); std::printf("pair %zu -> %d frames\n", k, nframes); @@ -329,6 +339,12 @@ int main(int argc, char ** argv) { } std::printf("keep frame %zu: parallax %.1f deg (vs kept frame %zu)\n", j, px.tri_angle_deg, last_kept); } + if (tree || tree_stages) { + tree_pairs.emplace_back(runout, runout + ng); + free_splatter_buf_free(runout); + std::printf("pair (%zu,%zu) collected\n", last_kept, j); + last_kept = j; continue; + } free_splatter_accumulator_add_pair(acc, runout, gc); free_splatter_buf_free(runout); const int nframes = free_splatter_accumulator_frame_count(acc); @@ -338,6 +354,92 @@ int main(int argc, char ** argv) { } if (skipped) std::printf("gated: skipped %d low-parallax frame(s)\n", skipped); } + if (tree || tree_stages) { + std::vector ps; ps.reserve(tree_pairs.size()); + for (auto & v : tree_pairs) ps.push_back(v.data()); + if (ps.empty()) { std::fprintf(stderr, "tree: need >=1 pair\n"); + free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; } + const std::string pfx = splat_prefix ? splat_prefix : "stage"; + + if (tree_stages) { + // write one laid-out .splat per merge level (stage 0 = the independent + // leaf scenes side by side, ... then the single merged world, then a + // consensus-fused clean scene) + a manifest the viewer steps through. + std::string dir = ".", base = pfx; + const size_t slash = pfx.find_last_of('/'); + if (slash != std::string::npos) { dir = pfx.substr(0, slash); base = pfx.substr(slash + 1); } + // per-scene budget: each laid-out scene keeps its own detail (a single + // shared cap would starve every scene to ~1/N and fill it with floaters). + const int per_node = (max_splats > 0 && max_splats < 200000) ? (int) max_splats : 150000; + std::vector labels; + auto stage_path = [&](char * p, size_t cap) { std::snprintf(p, cap, "%s_%zu.splat", pfx.c_str(), labels.size()); }; + // multi-scene stages of the multi-frame-overlap tree: each laid-out + // scene gets its own per_node budget. + for (int L = 0; L <= 64; L++) { + free_splatter_point * sc = nullptr; size_t nsc = 0; int nnodes = 0; + if (free_splatter_tree_overlap(ps.data(), (int) ps.size(), gc, sz, sz, + opac_thr, tov_block, tov_overlap, L, -1.0f /*auto layout*/, per_node, &sc, &nsc, &nnodes) != 0) { + std::fprintf(stderr, "tree-stages failed at level %d\n", L); + free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; } + if (nnodes <= 1) { free_splatter_buf_free(sc); break; } // single scene: full budget below + char path[1024]; stage_path(path, sizeof path); + write_cloud_splat(sc, nsc, 0 /*already per-node capped*/, splat_scale, path); + free_splatter_buf_free(sc); + std::printf("stage %zu: %d scenes\n", labels.size(), nnodes); + labels.push_back(L == 0 ? (std::to_string(nnodes) + " independent scenes") + : ("merge " + std::to_string(L) + " — " + std::to_string(nnodes) + " scenes")); + } + // the single merged world, at full budget: first the raw union (the + // overlapping per-frame copies, ghosted), then the consensus-fused + // (best-frame) clean scene — same contrast as the linear demo's acc/fused. + free_splatter_point * root = nullptr; size_t nroot = 0; + if (free_splatter_tree_overlap(ps.data(), (int) ps.size(), gc, sz, sz, opac_thr, + tov_block, tov_overlap, -1, 0.0f, 0, &root, &nroot, nullptr) == 0) { + char path[1024]; stage_path(path, sizeof path); + write_cloud_splat(root, nroot, (size_t) max_splats, splat_scale, path); + std::printf("stage %zu: merged raw union (%zu pts)\n", labels.size(), nroot); + labels.push_back("merged — one scene (raw union)"); + free_splatter_point * fz = nullptr; size_t nfz = 0; + if (free_splatter_fuse_cloud(root, nroot, voxel, fuse_k, 2 /*best-frame*/, &fz, &nfz) == 0 && nfz > 0) { + char path2[1024]; stage_path(path2, sizeof path2); + write_cloud_splat(fz, nfz, (size_t) max_splats, splat_scale, path2); + std::printf("stage %zu: consensus-fused (%zu -> %zu pts)\n", labels.size(), nroot, nfz); + labels.push_back("consensus-fused — one clean scene"); + free_splatter_buf_free(fz); + } + free_splatter_buf_free(root); + } + std::ofstream mf(dir + "/manifest.json"); + mf << "{ \"reframe\": true, \"steps\": [\n"; + for (size_t L = 0; L < labels.size(); L++) + mf << " {\"splat\":\"" << base << "_" << L << ".splat\",\"images\":[],\"n\":" << (L + 1) + << ",\"label\":\"" << labels[L] << "\"}" << (L + 1 < labels.size() ? "," : "") << "\n"; + mf << " ] }\n"; + std::printf("tree-stages: %zu stages -> %s/manifest.json\n", labels.size(), dir.c_str()); + } else { + // one merged world from the overlap tree (block=tov_block, overlap=tov_overlap; + // 2/1 = the plain overlap-by-one tree). With --fuse, also the de-ghosted scene. + std::printf("tree-accumulate: %zu pairs, block=%d overlap=%d\n", ps.size(), tov_block, tov_overlap); + free_splatter_point * tc = nullptr; size_t ntc = 0; + if (free_splatter_tree_overlap(ps.data(), (int) ps.size(), gc, sz, sz, opac_thr, + tov_block, tov_overlap, -1, 0.0f, 0, &tc, &ntc, nullptr) != 0) { + std::fprintf(stderr, "tree accumulate failed\n"); + free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 1; } + char path[1024]; + std::snprintf(path, sizeof path, "%s_tree.splat", pfx.c_str()); + write_cloud_splat(tc, ntc, (size_t) max_splats, splat_scale, path); + if (fuse) { // best-frame consensus, de-ghosted + free_splatter_point * fz = nullptr; size_t nfz = 0; + if (free_splatter_fuse_cloud(tc, ntc, voxel, fuse_k, 2, &fz, &nfz) == 0 && nfz > 0) { + std::snprintf(path, sizeof path, "%s_tree_fused.splat", pfx.c_str()); + write_cloud_splat(fz, nfz, (size_t) max_splats, splat_scale, path); + free_splatter_buf_free(fz); + } + } + free_splatter_buf_free(tc); + } + free_splatter_accumulator_free(acc); free_splatter_free(ctx); return 0; + } if (fuse && splat_prefix) { if (refine) free_splatter_accumulator_refine(acc, refine_voxel, refine_iters, refine_alpha); free_splatter_point * fc = nullptr; size_t nf = 0;