localai-org · richiejp · Jul 1, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/.gitignore b/.gitignore
@@ -20,6 +20,9 @@ __pycache__/
 # reference code (third-party, fetched on demand — never vendored here).
 .cache/
 
+# Throwaway working dir for ad-hoc harnesses / scratch (never committed).
+/scratchpad/
+
 # Model + data artifacts (generated; never committed)
 *.gguf
 *.safetensors
@@ -50,5 +53,6 @@ timeout-*
 /server/fsserver
 /server/server
 
-# Demo-video drop folder (user photos; baked into .cache/demo)
+# Demo drop folders (user photos / videos; baked into .cache/demo)
 /demo-photos/
+/demo-vids/
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -2,19 +2,46 @@
 
 ## What this project is
 
-A GGML/C++ inference engine for the **neural-network front half** of
-TencentARC/FreeSplatter: image patch tokenizer → multi-view self-attention
-transformer → per-pixel 3D-Gaussian parameter head. Given N uncalibrated views
-it returns, per input pixel, the activated Gaussian parameters.
-
-**Scope:** pieces 1–3 only. The **rasterizer and the PnP pose solver are OUT OF
-SCOPE** — they are a downstream consumer of the `[N, H, W, gaussian_channels]`
-tensor this engine produces. Do not add them here.
+A GGML/C++ engine for the **neural-network front half** of
+TencentARC/FreeSplatter (image patch tokenizer → multi-view self-attention
+transformer → per-pixel 3D-Gaussian parameter head) **plus the downstream
+camera-pose recovery and cross-run registration that consume its output**. Given
+N uncalibrated views it returns, per input pixel, the activated Gaussian
+parameters; **PnP** then recovers each view's camera, and successive runs are
+aligned into one accumulating world — the path toward live reconstruction from a
+moving camera.
+
+**Scope (updated):** the engine (pieces 1–3), **PnP pose recovery (now IN
+scope)**, and the cross-run Sim(3) alignment / accumulation. Keep the seam at the
+`[N, H, W, gaussian_channels]` tensor clean — it is the contract between the
+engine and the pose consumer. Rendering itself stays in the demo viewer
+(Vulkan/WebGL), not the engine.
 
 Target checkpoint first: **`freesplatter-scene`** (`gaussian_channels=23`,
 `sh_residual=true`, black background). The transformer backbone is identical
 across all three variants, so object/object-2dgs are cheap follow-ons.
 
+## Language & dependency policy
+
+- **Everything ships in C++** (the engine, PnP, and the alignment/accumulation,
+  now proven and ported). **Go is used only for the demo web server** — the purego layer that drives
+  the C API → Vulkan + WebGL viewer.
+- **The CLI and the C API must have NO Python dependency at runtime.** Every piece
+  of current and future functionality is reachable from `free_splatter-cli` and
+  `include/free_splatter.h` without invoking Python.
+- **Python is confined to two places, neither shipped:**
+  1. **Dev-time reference / conversion / validation** that runs in
+     `docker/Dockerfile.cuda` (`scripts/hf_dump.py`, `convert.py`,
+     `compare_taps.py`, …) — the only place torch runs; never a runtime dependency.
+  2. **The `pose/` research prototype — now DONE and DELETED.** It was the
+     temporary Python (numpy + cv2) prototype that proved the
+     accumulating-reconstruction approach. That approach is proven, the whole
+     pipeline (focal, Sim(3) align, robust PnP, accumulation, loop closure,
+     consensus fusion) is **rewritten in C++** (`src/pose.{h,cpp}`), exposed via
+     `free_splatter-cli` + `include/free_splatter.h`, and the Python prototype has
+     been **removed** (see git history for it and its layer-by-layer parity
+     harnesses). No Python remains in the pose path.
+
 ## Validation is the backbone (non-negotiable)
 
 This is a numerical port. Correctness means matching the PyTorch reference
@@ -47,11 +74,16 @@ This is a numerical port. Correctness means matching the PyTorch reference
 
 ## Per-component discipline
 
-Each component (`image`, `gguf_loader`, `backend`, `model`, the head) has its own
+Each component (`image`, `gguf_loader`, `backend`, `model`, the head, and
+**`pose` = PnP + focal + Sim(3) alignment + accumulation/loop/fusion**) has its own
 unit test and is made independently green **before** cross-component parity.
 Component boundaries match the file layout. Keep the seam at the
-`[N,H,W,gaussian_channels]` tensor clean — that is the contract with any
-rasterizer.
+`[N,H,W,gaussian_channels]` tensor clean — that is the contract between the engine
+and the pose/rendering consumers. The C++ `pose` component (`tests/test_pose.cpp`,
+asset-free golden tier) carries the parity discipline the Python prototype
+established: **bit-exact to upstream `estimate_poses`** and **validated against
+independent ground-truth poses** — see git history for the prototype's
+`check_upstream_parity.py` / `re10k_experiment.py` harnesses.
 
 ## Debugging philosophy (mandatory sequence)
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -122,6 +122,7 @@ add_library(free_splatter ${_fs_libtype}
     src/gguf_loader.cpp
     src/model.cpp
     src/image.cpp
+    src/pose.cpp
     src/free_splatter.cpp
 )
 target_include_directories(free_splatter PUBLIC include)
@@ -144,12 +145,19 @@ if (FREE_SPLATTER_BUILD_TOOLS AND NOT EMSCRIPTEN)
 endif()
 
 if (FREE_SPLATTER_FUZZ)
-    foreach(target fuzz_image fuzz_options)
+    foreach(target fuzz_image fuzz_options fuzz_pose)
         add_executable(${target} fuzz/${target}.cpp)
         target_link_libraries(${target} PRIVATE free_splatter)
         target_include_directories(${target} PRIVATE src)
         target_link_options(${target} PRIVATE -fsanitize=fuzzer)  # libFuzzer main
     endforeach()
+    # fuzz_decode also drives the vendored (third-party) stb decoder, so it needs
+    # the stb implementation TU + headers (built warning-quiet like the CLI).
+    add_executable(fuzz_decode fuzz/fuzz_decode.cpp tools/stb_impl.cpp)
+    target_link_libraries(fuzz_decode PRIVATE free_splatter)
+    target_include_directories(fuzz_decode PRIVATE src ${CMAKE_SOURCE_DIR}/third_party/stb)
+    target_link_options(fuzz_decode PRIVATE -fsanitize=fuzzer)
+    set_source_files_properties(tools/stb_impl.cpp PROPERTIES COMPILE_OPTIONS "-w")
 endif()
 
 # --- WASM (Emscripten) -------------------------------------------------------

diff --git a/bench/free_splatter-bench.cpp b/bench/free_splatter-bench.cpp
@@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
     }
 
     std::printf("model: %s  device=%s  %dx%d  in=%d  gc=%d  views=%d  S=%lld  load=%.1f ms\n",
-                model, device ? device : "cpu", geo.image_width, geo.image_height,
+                model, device ? device : "vulkan", geo.image_width, geo.image_height,
                 geo.in_channels, geo.gaussian_channels, views,
                 (long long) ((int64_t) views * (geo.image_height/8) * (geo.image_width/8)), load_ms);
 
@@ -124,7 +124,7 @@ int main(int argc, char ** argv) {
                 "(%.2f views/s, %.2f scenes/s)\n", mn, med, mean, mx, vps, 1000.0 / med);
     std::printf("RESULT engine device=%s views=%d gc=%d load_ms=%.1f min_ms=%.1f "
                 "median_ms=%.1f mean_ms=%.1f max_ms=%.1f views_per_s=%.3f\n",
-                device ? device : "cpu", views, geo.gaussian_channels, load_ms,
+                device ? device : "vulkan", views, geo.gaussian_channels, load_ms,
                 mn, med, mean, mx, vps);
     return 0;
 }
diff --git a/flake.nix b/flake.nix
@@ -11,7 +11,10 @@
       # hf_dump.py) run in docker/Dockerfile.cuda alongside the GPU model. The
       # M0 gate synthesizes its test GGUF in C++ (tests/test_loader.cpp), so the
       # build + fast test tier need no Python at all.
-      pyEnv = pkgs.python3.withPackages (ps: with ps; [ numpy ]);
+      # opencv4 (cv2): the downstream pose/ prototype mirrors FreeSplatter's
+      # cv2.solvePnPRansac for the exact-upstream PnP path; numpy-only fallback
+      # otherwise. Not needed by the engine build/test.
+      pyEnv = pkgs.python3.withPackages (ps: with ps; [ numpy opencv4 ]);
     in
     {
       devShells.${system} = {

diff --git a/fuzz/README.md b/fuzz/README.md
@@ -6,8 +6,10 @@ libFuzzer harnesses for the **untrusted** input surfaces, built with
 ```sh
 nix develop .#fuzz
 cmake --preset fuzz && cmake --build --preset fuzz
-mkdir -p corpus_img && ./build/fuzz/fuzz_image -max_total_time=60 corpus_img
-mkdir -p corpus_opt && ./build/fuzz/fuzz_options -max_total_time=60 corpus_opt
+mkdir -p corpus_img    && ./build/fuzz/fuzz_image   -max_total_time=60 corpus_img
+mkdir -p corpus_opt    && ./build/fuzz/fuzz_options -max_total_time=60 corpus_opt
+mkdir -p corpus_pose   && ./build/fuzz/fuzz_pose    -max_total_time=60 corpus_pose
+mkdir -p corpus_decode && ./build/fuzz/fuzz_decode  -max_total_time=60 corpus_decode
 ```
 
 ## Targets
@@ -18,9 +20,20 @@ mkdir -p corpus_opt && ./build/fuzz/fuzz_options -max_total_time=60 corpus_opt
   crashes, reads OOB, or returns a populated buffer on a rejected input.
 - **`fuzz_options`** — the options-builder / device-string setters with
   arbitrary byte strings; asserts no crash and NULL-safe frees.
+- **`fuzz_pose`** — the public pose C-API: `free_splatter_estimate_poses` and the
+  accumulator (`add_pair` → `cloud` / `fuse` / `camera_path`) driven with
+  arbitrary float gaussian buffers (NaN/Inf/denormals) and fuzz-chosen geometry.
+  Caught a real SIGFPE (the RANSAC sampler's `% N` with N=0 valid
+  correspondences) and motivated the non-finite guards in `consensus_fuse`
+  (the float→int voxel-coordinate cast).
+- **`fuzz_decode`** — the untrusted image-FILE path: arbitrary bytes →
+  `stb_image` → center-crop + resize → CHW (the surface a user photo crosses in
+  the CLI / demo). stb is vendored THIRD-PARTY; per CLAUDE.md we fuzz the boundary
+  and would guard (not patch) any stb-internal sanitizer trip — none seen so far.
 
 ## Trust boundary (intentional)
 
 The **GGUF model file is TRUSTED** and is deliberately **not** fuzzed: it is our
-own converter's output, loaded by us. Image inputs and the public C-API string
-arguments are **untrusted** and fuzzed. Keep this asymmetry — see `CLAUDE.md`.
+own converter's output, loaded by us. Image inputs (encoded files and decoded
+pixels) and the public C-API arguments are **untrusted** and fuzzed. Keep this
+asymmetry — see `CLAUDE.md`.
diff --git a/fuzz/fuzz_decode.cpp b/fuzz/fuzz_decode.cpp
@@ -0,0 +1,39 @@
+// libFuzzer harness for the UNTRUSTED image-FILE decode path: arbitrary bytes ->
+// stb_image -> the CLI's center-crop + resize -> [0,1] CHW. This is the actual
+// surface a user photo crosses in `free_splatter-cli` / the demo. stb_image is a
+// vendored THIRD-PARTY decoder; per CLAUDE.md we fuzz the boundary and, where stb
+// itself trips a sanitizer on malformed input, GUARD it rather than patch stb.
+#include "stb_image.h"
+#include "stb_image_resize2.h"
+
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) {  // libFuzzer entry point; every path returns 0
+    int w = 0, h = 0, n = 0;                // out-params written by stbi_load_from_memory below
+    // Decode the arbitrary input bytes as an image, requesting 3 channels (RGB, the CLI's contract).
+    unsigned char * px = stbi_load_from_memory(data, (int) size, &w, &h, &n, 3);
+    if (!px) return 0;                      // decoder rejected the input -> nothing to free, nothing to do
+
+    // Reject non-positive dimensions and anything above 64Mi pixels before the crop/resize
+    // index math runs (stb can report large dims on crafted headers); px is freed on this path.
+    if ((int64_t) w * h > 64ll * 1024 * 1024 || w <= 0 || h <= 0) { stbi_image_free(px); return 0; }
+
+    // Center-crop to an s x s square (s = shorter side), then resize to a small size (mirrors the CLI).
+    const int s = w < h ? w : h, left = (w - s) / 2, top = (h - s) / 2;
+    std::vector<unsigned char> sq((size_t) s * s * 3);  // cropped square, 3 bytes per pixel
+    for (int y = 0; y < s; y++)  // copy one s-pixel row at a time out of the w-pixel-wide source
+        std::memcpy(&sq[(size_t) y * s * 3], &px[((size_t)(top + y) * w + left) * 3], (size_t) s * 3);
+    stbi_image_free(px);  // the decoded image is not read again after the crop is copied out
+
+    const int outsz = 32;  // fixed edge length of the resized square
+    std::vector<unsigned char> rz((size_t) outsz * outsz * 3);
+    stbir_resize_uint8_linear(sq.data(), s, s, 0, rz.data(), outsz, outsz, 0, STBIR_RGB);  // return value is not checked
+
+    std::vector<float> chw((size_t) 3 * outsz * outsz);  // planar output: 3 planes of outsz*outsz floats
+    for (int c = 0; c < 3; c++)
+        for (int i = 0; i < outsz * outsz; i++)
+            chw[(size_t) c * outsz * outsz + i] = rz[(size_t) i * 3 + c] / 255.0f;  // interleaved byte -> planar float in [0,1]
+    return 0;  // chw is not used further; the harness only exercises the decode/crop/resize path
+}
diff --git a/fuzz/fuzz_pose.cpp b/fuzz/fuzz_pose.cpp
@@ -0,0 +1,67 @@
+// libFuzzer harness for the public pose C-API surface (free_splatter_estimate_poses
+// + the accumulator). These now take caller-supplied float buffers across the ABI,
+// so a binding that feeds NaN/Inf/degenerate geometry must not crash, read OOB, or
+// trip UBSan (e.g. a non-finite float -> int voxel-coord cast). The gaussian buffer
+// is engine output, but the C-API is a public boundary; keep it robust.
+#include "free_splatter.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) {  // libFuzzer entry point; every path returns 0
+    if (size < 8) return 0;  // data[0]..data[7] are read as control bytes below
+
+    // Small, fuzz-driven geometry so many inputs are explored cheaply.
+    const int gc = 23;                            // channel count handed to the pose API
+    const int H = 4 + (data[0] % 5) * 4;          // 4,8,12,16,20
+    const int W = 4 + (data[1] % 5) * 4;          // same range as H, chosen independently
+    const int P = H * W;                          // pixels per view
+
+    auto fill = [&](std::vector<float> & buf, size_t off) {
+        // Overwrite buf's raw bytes with fuzz bytes so arbitrary float bit patterns
+        // (NaN/Inf/denormals) appear; the input is tiled, starting at byte offset `off`.
+        uint8_t * b = (uint8_t *) buf.data();
+        const size_t nbytes = buf.size() * sizeof(float);
+        for (size_t i = 0; i < nbytes; i++) b[i] = data[(off + i) % size];  // modulo wraps, so reads stay inside data
+    };
+
+    // 1) estimate_poses on a 2-view buffer (return value is ignored)
+    {
+        std::vector<float> g((size_t) 2 * P * gc);  // 2 views * P pixels * gc channels
+        fill(g, 2);
+        std::vector<float> c2w((size_t) 2 * 16);    // 16 floats per view — presumably a 4x4 matrix; confirm in free_splatter.h
+        float focal = 0;
+        free_splatter_estimate_poses(g.data(), 2, H, W, gc, 0.05f, c2w.data(), &focal);
+    }
+
+    // 2) accumulator: add a few pairs, then exercise cloud / refine / fuse / camera_path
+    free_splatter_accumulator * acc = free_splatter_accumulator_new(H, W, 0.05f);
+    if (acc) {  // skip the accumulator stage entirely if construction returned null
+        const int npairs = 1 + (data[2] % 3);     // 1..3 pairs
+        std::vector<float> g((size_t) 2 * P * gc);  // reused for every pair; refilled each iteration
+        for (int k = 0; k < npairs; k++) {
+            fill(g, (size_t) 3 + k);              // shift the tiling offset so each pair sees different bytes
+            free_splatter_accumulator_add_pair(acc, g.data(), gc);  // return value is ignored
+        }
+        free_splatter_point * cloud = nullptr; size_t nc = 0;  // out-params for the cloud call
+        if (free_splatter_accumulator_cloud(acc, &cloud, &nc) == 0) {  // buffer is used and freed only when the call returns 0
+            free_splatter_refine_cloud(cloud, nc, 0.01f + (data[6] % 8) * 0.02f, 1 + (data[7] % 4), 0.5f);  // args: 0.01..0.15, 1..4, 0.5
+            free_splatter_buf_free(cloud);
+        }
+        free_splatter_accumulator_refine(acc, 0.03f, 2, 0.5f);   // de-ghost on garbage (fixed parameters)
+
+        const float voxel = 0.005f + (data[3] % 16) * 0.01f;   // 0.005..0.155
+        const int k = 1 + (data[4] % 4);                       // 1..4
+        const int mode = data[5] % 3;                          // 0..2: averaged / kept / best
+        free_splatter_point * fused = nullptr; size_t nf = 0;  // out-params for the fuse call
+        if (free_splatter_accumulator_fuse(acc, voxel, k, mode, &fused, &nf) == 0) free_splatter_buf_free(fused);
+
+        float * path = nullptr; int32_t nfr = 0;               // out-params for the camera_path call
+        if (free_splatter_accumulator_camera_path(acc, &path, &nfr) == 0) free_splatter_buf_free(path);
+
+        free_splatter_accumulator_free(acc);                   // release the accumulator created above
+    }
+    return 0;
+}