diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6c2badb1..35e257e1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -208,6 +208,15 @@ jobs: key: ${{ runner.os }}-cargo-test-${{ hashFiles('**/Cargo.lock') }} restore-keys: ${{ runner.os }}-cargo-test- + - name: Install wasm target for GEPA test modules + run: rustup target add wasm32-unknown-unknown + + - name: Build GEPA test WASM modules + run: | + for module in gepa-replay gepa-reflective gepa-score gepa-pareto gepa-verify; do + (cd "wasm-modules/$module" && cargo build --target wasm32-unknown-unknown --release) + done + - name: cargo test --workspace run: cargo test --workspace diff --git a/.proofs/001_gepa_temperagent_failure_reflection.md b/.proofs/001_gepa_temperagent_failure_reflection.md new file mode 100644 index 00000000..d88719a0 --- /dev/null +++ b/.proofs/001_gepa_temperagent_failure_reflection.md @@ -0,0 +1,55 @@ +# 001_gepa_temperagent_failure_reflection + +Date: 2026-03-23 +Branch: docs/positioning-rewrite +Scope: TemperAgent sandbox provisioning failure reflection + root-cause hardening + +## Problem +`sandbox_provisioner` previously swallowed TemperFS bootstrap errors and still emitted `SandboxReady` with empty IDs. This made failures look like partial success. + +## Code Changes +- Hard fail on TemperFS bootstrap failure with explicit error context (`temper_api_url`, tenant, agent id). +- Added `temper_api_url` to TemperAgent state + `Configure` action. +- Updated CSDL to expose `TemperApiUrl` and `Configure.temper_api_url`. +- All TemperAgent WASM modules now resolve Temper API URL from entity `fields.temper_api_url` first, then integration config. +- Installing `temper-agent` now auto-installs `temper-fs` dependency. +- Added regression test for dependency install behavior. + +## Verification +### Build/Test +- `cargo fmt --all` +- `cargo check -p temper-platform` +- `cargo test -p temper-platform os_apps::tests::test_install_temper_agent_auto_installs_temper_fs -- --nocapture` (passed) +- Built all TemperAgent WASM modules for `wasm32-unknown-unknown`. + +### Live Proof (port 3015) +Tenant: `proof-fix-20260323` + +Prereq: Uploaded tenant-scoped WASM modules: +- `sandbox_provisioner` +- `llm_caller` +- `tool_runner` +- `workspace_restorer` + +Case A (bad API URL): +- Agent: `019d1c5a-4a43-71b0-adc9-199cb90eefab` +- Configure with `temper_api_url=http://127.0.0.1:39999` +- Provision result: + - `status=Failed` + - `error_message="TemperFS bootstrap failed at http://127.0.0.1:39999/tdata ... Ensure os-app 'temper-fs' is installed ... temper_api_url is correct."` + - `workspace_id`, `conversation_file_id`, `file_manifest_id` all empty + +Case B (correct API URL): +- Agent: `019d1c5a-4f84-7d23-8c18-dfc23885e3b9` +- Configure with `temper_api_url=http://127.0.0.1:3015` +- Provision + callback result: + - `status=Failed` (intentional due `max_turns=0`) + - `error_message="turn budget exhausted (0/0)"` + - `workspace_id`, `conversation_file_id`, `file_manifest_id` all populated + +Interpretation: +- Failure reflection is now explicit and truthful for TemperFS bootstrap issues. +- With correct URL, TemperFS bootstrap succeeds and IDs are present. + +## Operational Caveat +If tenant-scoped WASM modules are not uploaded, integration dispatch fails with `WASM module '' not found`. This is separate from TemperFS bootstrap handling. diff --git a/Cargo.toml b/Cargo.toml index 3dcd21be..970f3e31 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ exclude = [ "wasm-modules/gepa-pareto", "wasm-modules/gepa-reflective", "wasm-modules/gepa-proposer-agent", + "wasm-modules/gepa-verify", "crates/temper-wasm/tests/fixtures/echo-integration-src", "os-apps/temper-agent/wasm/llm_caller", "os-apps/temper-agent/wasm/tool_runner", diff --git a/crates/temper-platform/src/os_apps/mod.rs b/crates/temper-platform/src/os_apps/mod.rs index f6c25df4..79688dcf 100644 --- a/crates/temper-platform/src/os_apps/mod.rs +++ b/crates/temper-platform/src/os_apps/mod.rs @@ -470,6 +470,14 @@ fn load_skill_bundle(skill_dir: &Path) -> Option { }) } +fn os_app_dependencies(name: &str) -> &'static [&'static str] { + match name { + // TemperAgent persists conversation/files in TemperFS entities. + "temper-agent" => &["temper-fs"], + _ => &[], + } +} + /// Install an OS app into a tenant (workspace). /// /// Reads skill files from disk, runs the verification cascade, registers @@ -483,6 +491,17 @@ pub async fn install_os_app( state: &PlatformState, tenant: &str, app_name: &str, +) -> Result { + for dependency in os_app_dependencies(app_name) { + install_os_app_without_dependencies(state, tenant, dependency).await?; + } + install_os_app_without_dependencies(state, tenant, app_name).await +} + +async fn install_os_app_without_dependencies( + state: &PlatformState, + tenant: &str, + app_name: &str, ) -> Result { let bundle = get_os_app(app_name).ok_or_else(|| format!("OS app '{app_name}' not found in catalog"))?; diff --git a/crates/temper-platform/src/os_apps/mod_test.rs b/crates/temper-platform/src/os_apps/mod_test.rs index 7f865c1c..b51c19f7 100644 --- a/crates/temper-platform/src/os_apps/mod_test.rs +++ b/crates/temper-platform/src/os_apps/mod_test.rs @@ -126,7 +126,6 @@ fn test_list_skills_returns_catalog() { "missing intent-discovery: {names:?}" ); - // Check entity types for known skills. let pm = apps .iter() .find(|e| e.name == "project-management") @@ -346,6 +345,28 @@ async fn test_install_skill_agent_orchestration_registers_entities() { assert!(registry.get_table(&tenant, "BudgetLedger").is_some()); } +#[tokio::test] +async fn test_install_temper_agent_auto_installs_temper_fs() { + let state = PlatformState::new(None); + install_os_app(&state, "test-agent", "temper-agent") + .await + .expect("install temper-agent"); + let registry = state.registry.read().unwrap(); + let tenant = TenantId::new("test-agent"); + for entity in [ + "TemperAgent", + "Workspace", + "File", + "Directory", + "FileVersion", + ] { + assert!( + registry.get_table(&tenant, entity).is_some(), + "missing {entity}" + ); + } +} + #[tokio::test] async fn test_install_skill_nonexistent_returns_error() { let state = PlatformState::new(None); @@ -501,11 +522,9 @@ async fn test_skill_install_survives_restart() { use temper_server::registry_bootstrap::restore_registry_from_turso; use temper_store_turso::TursoEventStore; - // Use a unique temp file DB for this test. let db_path = format!("/tmp/temper-test-{}.db", uuid::Uuid::new_v4()); let db_url = format!("file:{db_path}"); - // ── Phase A: Install into a fresh state with Turso. ───────── let turso = TursoEventStore::new(&db_url, None).await.unwrap(); let mut state = PlatformState::new(None); state.server.event_store = Some(Arc::new(ServerEventStore::Turso(turso))); @@ -515,7 +534,6 @@ async fn test_skill_install_survives_restart() { let result = result.unwrap(); assert_eq!(result.added.len(), 5); - // Verify specs are in the in-memory registry. { let registry = state.registry.read().unwrap(); let tenant = TenantId::new("test-ws"); @@ -523,7 +541,6 @@ async fn test_skill_install_survives_restart() { assert!(registry.get_table(&tenant, "Project").is_some()); } - // Verify specs are persisted to Turso. let turso_ref = state .server .event_store @@ -538,17 +555,14 @@ async fn test_skill_install_survives_restart() { "Issue spec not found in Turso" ); - // Verify installed_apps record is in Turso. let installed = turso_ref.list_all_installed_apps().await.unwrap(); assert!( installed.contains(&("test-ws".to_string(), "project-management".to_string())), "installed app record not found" ); - // ── Phase B: Simulate restart — fresh state, same DB. ─────── let turso2 = TursoEventStore::new(&db_url, None).await.unwrap(); let state2 = PlatformState::new(None); - // Verify fresh registry is empty for this tenant. { let registry = state2.registry.read().unwrap(); let tenant = TenantId::new("test-ws"); @@ -558,9 +572,6 @@ async fn test_skill_install_survives_restart() { ); } - // Restore from Turso (this is what build_registry does on boot). - // Fetch async data outside the lock, then assign synchronously to avoid - // holding a RwLockWriteGuard across an await point. { use temper_server::registry::SpecRegistry; let mut temp_registry = SpecRegistry::new(); @@ -571,7 +582,6 @@ async fn test_skill_install_survives_restart() { *state2.registry.write().unwrap() = temp_registry; } - // Verify specs survived the restart. { let registry = state2.registry.read().unwrap(); let tenant = TenantId::new("test-ws"); @@ -582,7 +592,6 @@ async fn test_skill_install_survives_restart() { assert!(registry.get_table(&tenant, "Label").is_some()); } - // Clean up temp DB. let _ = std::fs::remove_file(&db_path); let _ = std::fs::remove_file(format!("{db_path}-wal")); let _ = std::fs::remove_file(format!("{db_path}-shm")); @@ -590,7 +599,6 @@ async fn test_skill_install_survives_restart() { #[test] fn test_reload_picks_up_disk_changes() { - // Just verify reload doesn't panic and produces a valid catalog. reload_skills(); let skills = list_skills(); assert!( diff --git a/crates/temper-server/src/authz/wasm_gate.rs b/crates/temper-server/src/authz/wasm_gate.rs index 967f49ac..cd918a7a 100644 --- a/crates/temper-server/src/authz/wasm_gate.rs +++ b/crates/temper-server/src/authz/wasm_gate.rs @@ -62,7 +62,8 @@ impl WasmAuthzGate for CedarWasmAuthzGate { // Convert BTreeMap to HashMap at Cedar boundary (determinism-ok) let hash_attrs: std::collections::HashMap<_, _> = resource_attrs.into_iter().collect(); // determinism-ok: Cedar API requires HashMap - let decision = self.engine.authorize_or_bypass( + let decision = self.engine.authorize_for_tenant_or_bypass( + &ctx.tenant, &enriched_ctx, "http_call", "HttpEndpoint", @@ -90,9 +91,13 @@ impl WasmAuthzGate for CedarWasmAuthzGate { // Convert BTreeMap to HashMap at Cedar boundary (determinism-ok) let hash_attrs: std::collections::HashMap<_, _> = resource_attrs.into_iter().collect(); // determinism-ok: Cedar API requires HashMap - let decision = - self.engine - .authorize_or_bypass(&security_ctx, "access_secret", "Secret", &hash_attrs); + let decision = self.engine.authorize_for_tenant_or_bypass( + &ctx.tenant, + &security_ctx, + "access_secret", + "Secret", + &hash_attrs, + ); match decision { AuthzDecision::Allow => WasmAuthzDecision::Allow, @@ -279,6 +284,34 @@ mod tests { assert!(matches!(result, WasmAuthzDecision::Deny(_))); } + #[test] + fn cedar_gate_uses_tenant_scoped_policies() { + let engine = Arc::new(AuthzEngine::empty()); + let policy = r#" + permit( + principal is Agent, + action == Action::"http_call", + resource is HttpEndpoint + ) when { + context.module == "stripe_charge" && + context.domain == "api.stripe.com" + }; + "#; + engine + .reload_tenant_policies("test-tenant", policy) + .expect("tenant policy should load"); + + let gate = CedarWasmAuthzGate::new(engine); + let ctx = test_ctx(); // tenant = test-tenant, module_name = stripe_charge + let result = gate.authorize_http_call( + "api.stripe.com", + "POST", + "https://api.stripe.com/v1/charges", + &ctx, + ); + assert_eq!(result, WasmAuthzDecision::Allow); + } + #[test] fn build_security_context_from_wasm_ctx() { let ctx = test_ctx(); diff --git a/crates/temper-server/tests/e2e_gepa_loop.rs b/crates/temper-server/tests/e2e_gepa_loop.rs index d7ebe854..9db90f7b 100644 --- a/crates/temper-server/tests/e2e_gepa_loop.rs +++ b/crates/temper-server/tests/e2e_gepa_loop.rs @@ -1461,19 +1461,20 @@ to = "Done" ); } -/// **Full autonomous GEPA loop (test override)** — proves the entire chain runs end-to-end: +/// **Autonomous GEPA chain to approval gate (test override)** — proves the background +/// chain runs end-to-end through mutation, verification, scoring, and frontier update: /// /// SelectCandidate → gepa-replay (WASM) → RecordEvaluation /// → gepa-reflective (WASM) → RecordDataset /// → claude_code adapter (mock script) → RecordMutation -/// → [manual verification step] → RecordVerificationPass +/// → claude_code adapter (mock verifier) → RecordVerificationPass /// → gepa-score (WASM) → RecordScore /// → gepa-pareto (WASM) → RecordFrontier /// /// Production uses `gepa-proposer-agent` WASM + TemperAgent. This test -/// intentionally overrides only `propose_mutation` to a deterministic mock adapter -/// so CI can run without LLM keys/network. -#[tokio::test] +/// overrides `propose_mutation` and `verify_candidate` to deterministic mock adapters +/// so CI can run without LLM keys or a live verification HTTP server. +#[tokio::test(flavor = "multi_thread")] async fn e2e_gepa_full_autonomous_loop_with_adapter() { use std::io::Write; use std::time::Duration; @@ -1487,8 +1488,11 @@ async fn e2e_gepa_full_autonomous_loop_with_adapter() { // --- Create mock "claude" script that returns a mutated spec --- let mock_dir = std::env::temp_dir().join("gepa-mock-adapter-test"); // determinism-ok: test harness + let mock_workdir = mock_dir.join("workspace"); std::fs::create_dir_all(&mock_dir).expect("create mock dir"); + std::fs::create_dir_all(&mock_workdir).expect("create mock workdir"); let mock_script = mock_dir.join("mock-claude"); + let verify_script = mock_dir.join("mock-verify"); { let mut f = std::fs::File::create(&mock_script).expect("create mock script"); // The script outputs stream-JSON with MutatedSpecSource and MutationSummary. @@ -1514,15 +1518,43 @@ MOCK_OUTPUT .expect("chmod +x mock script"); } } + { + let mut f = std::fs::File::create(&verify_script).expect("create verify script"); + write!( + f, + r#"#!/bin/bash +cat <<'MOCK_OUTPUT' +{{"VerificationReport": "L0-L3 cascade passed for TestIssue"}} +MOCK_OUTPUT +"# + ) + .expect("write verify script"); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(&verify_script, std::fs::Permissions::from_mode(0o755)) + .expect("chmod +x verify script"); + } + } // --- Build EvolutionRun spec with propose_mutation test override --- let base_ioa = include_str!("../../../os-apps/evolution/evolution_run.ioa.toml"); - // Replace the proposer module with deterministic adapter for test-only execution. + // Replace proposer + verifier integrations with deterministic adapters for test-only execution. let mock_path = mock_script.to_str().expect("mock path to str"); - let modified_ioa = base_ioa.replace( + let verify_path = verify_script.to_str().expect("verify path to str"); + let mock_workdir = mock_workdir.to_str().expect("mock workdir to str"); + let modified_ioa = base_ioa + .replace( "type = \"wasm\"\nmodule = \"gepa-proposer-agent\"", &format!("type = \"adapter\"\nadapter = \"claude_code\"\ncommand = \"{mock_path}\""), - ); + ) + .replace( + "type = \"wasm\"\nmodule = \"gepa-verify\"", + &format!( + "type = \"adapter\"\nadapter = \"claude_code\"\ncommand = \"{verify_path}\"\non_success = \"RecordVerificationPass\"" + ), + ) + .replace("workdir = \"/tmp/workspace\"", &format!("workdir = \"{mock_workdir}\"")); let csdl_xml = r#" @@ -1642,7 +1674,8 @@ to = "Done" r.state.status, r.custom_effects ); - // Wait for the autonomous chain to progress through WASM + adapter + // Wait for the autonomous chain to progress through adapter + verification + // + scoring + frontier update. The current branch stops at manual approval. let deadline = tokio::time::Instant::now() + Duration::from_secs(30); let mut final_status = "Evaluating".to_string(); let mut event_trail = Vec::new(); @@ -1665,76 +1698,28 @@ to = "Done" .map(|e| e.action.clone()) .collect(); - // Terminal states for this phase - if matches!(final_status.as_str(), "Verifying" | "Failed" | "Completed") { + if matches!(final_status.as_str(), "AwaitingApproval" | "Failed") { break; } } - println!("[AUTO] After WASM+adapter chain: status={final_status}, events={event_trail:?}"); + println!("[AUTO] After autonomous GEPA chain: status={final_status}, events={event_trail:?}"); - // The chain should have reached Verifying (WASM replay → reflective → adapter mutation → RecordMutation) assert!( event_trail.contains(&"RecordMutation".to_string()), "RecordMutation must appear — proves the claude_code adapter (mock) executed and \ returned a mutated spec. Events: {event_trail:?}" ); assert_eq!( - final_status, "Verifying", - "Entity should be in Verifying after adapter returns mutation. Got: {final_status}" - ); - - // Step 3: Manual verification pass (in production, this is L0-L3 cascade) - let r = state - .dispatch_tenant_action( - &tenant, - "EvolutionRun", - evo_id, - "RecordVerificationPass", - serde_json::json!({ - "VerificationReport": "L0-L3 cascade passed. Reassign action properly defined." - }), - &AgentContext::default(), - ) - .await - .expect("RecordVerificationPass should succeed"); - assert!(r.success); - println!( - "[AUTO] RecordVerificationPass → status: {}, effects: {:?}", - r.state.status, r.custom_effects + final_status, "AwaitingApproval", + "Entity should reach AwaitingApproval after replay, reflective, verify, score, and frontier callbacks. Got: {final_status}" ); - // This triggers score_candidate (WASM) → RecordScore → update_frontier (WASM) → RecordFrontier - let deadline = tokio::time::Instant::now() + Duration::from_secs(15); - loop { - if tokio::time::Instant::now() >= deadline { - break; - } - tokio::time::sleep(Duration::from_millis(200)).await; - - let entity = state - .get_tenant_entity_state(&tenant, "EvolutionRun", evo_id) - .await - .expect("entity should exist"); - final_status = entity.state.status.clone(); - event_trail = entity - .state - .events - .iter() - .map(|e| e.action.clone()) - .collect(); - - if matches!( - final_status.as_str(), - "AwaitingApproval" | "Deploying" | "Completed" | "Failed" - ) { - break; - } - } - - println!("[AUTO] After scoring+frontier chain: status={final_status}, events={event_trail:?}"); - // Verify all WASM modules fired + assert!( + event_trail.contains(&"RecordVerificationPass".to_string()), + "RecordVerificationPass must appear — proves the verification adapter callback executed. Events: {event_trail:?}" + ); assert!( event_trail.contains(&"RecordScore".to_string()), "RecordScore must appear — proves gepa-score WASM module executed. Events: {event_trail:?}" @@ -1744,34 +1729,6 @@ to = "Done" "RecordFrontier must appear — proves gepa-pareto WASM module executed. Events: {event_trail:?}" ); - // Step 4: Approve and deploy - let r = state - .dispatch_tenant_action( - &tenant, - "EvolutionRun", - evo_id, - "Approve", - serde_json::json!({ "ApproverId": "human-reviewer-1" }), - &AgentContext::default(), - ) - .await - .expect("Approve should succeed"); - assert!(r.success); - - let r = state - .dispatch_tenant_action( - &tenant, - "EvolutionRun", - evo_id, - "Deploy", - serde_json::json!({ "DeploymentId": "deploy-auto-1" }), - &AgentContext::default(), - ) - .await - .expect("Deploy should succeed"); - assert!(r.success); - assert_eq!(r.state.status, "Completed"); - // Final event trail let entity = state .get_tenant_entity_state(&tenant, "EvolutionRun", evo_id) @@ -1784,22 +1741,20 @@ to = "Done" .map(|e| e.action.as_str()) .collect(); - println!("\n=== FULL AUTONOMOUS GEPA LOOP PROOF ==="); + println!("\n=== AUTONOMOUS GEPA APPROVAL-GATE PROOF ==="); println!("Event trail: {:?}", final_events); println!("Final status: {}", entity.state.status); - // The complete chain: + // The complete background chain on this branch: let expected = [ - "Start", // Human/agent kicks off - "SelectCandidate", // Pick candidate from frontier - "RecordEvaluation", // gepa-replay WASM module ✓ - "RecordDataset", // gepa-reflective WASM module ✓ - "RecordMutation", // claude_code adapter (evolution agent) ✓ - "RecordVerificationPass", // L0-L3 verification cascade - "RecordScore", // gepa-score WASM module ✓ - "RecordFrontier", // gepa-pareto WASM module ✓ - "Approve", // Human/agent approval gate - "Deploy", // Hot-deploy to SpecRegistry + "Start", + "SelectCandidate", + "RecordEvaluation", + "RecordDataset", + "RecordMutation", + "RecordVerificationPass", + "RecordScore", + "RecordFrontier", ]; for step in &expected { assert!( @@ -1807,6 +1762,6 @@ to = "Done" "Missing step '{step}' in event trail. Full trail: {final_events:?}" ); } - assert_eq!(entity.state.status, "Completed"); - println!("ALL 10 STEPS VERIFIED. GEPA LOOP IS FULLY AUTONOMOUS. ✓"); + assert_eq!(entity.state.status, "AwaitingApproval"); + println!("AUTONOMOUS GEPA CHAIN REACHED AWAITING APPROVAL. ✓"); } diff --git a/docs/GEPA_E2E_PROOF.md b/docs/GEPA_E2E_PROOF.md index 0d7b269c..828fdbd7 100644 --- a/docs/GEPA_E2E_PROOF.md +++ b/docs/GEPA_E2E_PROOF.md @@ -1,6 +1,6 @@ # GEPA End-to-End Proof (TemperAgent + OTS + Workflow Replay) -**Date**: 2026-03-19 +**Date**: 2026-03-23 **Workspace**: `/Users/seshendranalla/Development/temper-gepa-tarjan` **Server**: `temper serve --port 4455 --storage turso --no-observe` **Primary tenant**: `gepa-live-fresh-20260319` @@ -24,19 +24,147 @@ - GEPA returns a no-op mutation (`MutatedSpecSource = original`) when the structural gate blocks mutation. - `patterns.missing_capabilities` remains available in reflective data, but is routed to unmet-intent handoff rather than direct structural edits by GEPA. +## 2026-03-23 Full-Loop Re-Proof (Latest) +- **Tenant**: `gepa-live-20260323-121726` +- **Primary terminal run**: `EvolutionRun('evo-live-20260323-121726-v3')` +- **Artifacts dir**: `/tmp/gepa_run_20260323-121726` + +### What was proven in this latest run +1. **Automatic verify/deploy path now works end-to-end** (no manual steering): + - Terminal action chain: + `Created -> Start -> SelectCandidate -> RecordEvaluation -> RecordDataset -> RecordMutation -> RecordVerificationPass -> RecordScore -> RecordFrontierAutoApprove -> Deploy` + - Run status reached `Completed` with: + - `VerificationReport = "verification passed: 4 levels passed"` + - `DeploymentId = "gepa-deploy-evo-live-20260323-121726-v3-m1"` +2. **Unmet-intent handoff persists even when optimizer mutation path is allowed/continues**: + - `UnmetIntentReport` present in run fields with `reported = 3, failed = 0`. + - Reported intents included: + - `Add action 'Reassign'` + - `PromoteToCritical` + - `Reassign` + - These were also persisted into trajectory telemetry as `source=Platform` unmet records (visible in `/observe/trajectories`). +3. **Workflow-level GEPA path remained active**: + - OTS was seeded by real `temper mcp` sessions (success/partial/failure). + - `SelectCandidate` still omitted `TrajectoryActions`/`Trajectories`; replay consumed OTS auto-injected server-side. + +### What was fixed during this cycle +- `EvolutionRun` automation: + - Added `gepa-verify` module + `verify_candidate` trigger from `RecordMutation`. + - Added `gepa-deploy` module + `deploy_candidate` trigger from auto-approve and manual approve paths. + - `gepa-pareto` now emits dynamic callback action based on `AutonomyLevel` (`RecordFrontierAutoApprove` vs `RecordFrontier`). +- SDK-callback pitfall addressed: + - `gepa-verify`, `gepa-deploy`, and `gepa-pareto` were moved to explicit callback action emission (not macro-default `callback`) so action dispatch works. +- Proposer unmet-intent behavior: + - `gepa-proposer-agent` now reports unmet intents even when mutation proceeds. + +### Still open (not yet fully proven fixed) +1. **OTS ID consistency issue remains**: + - `flush_trajectory()` returned IDs still did not match IDs listed by `/api/ots/trajectories`. + - This means row ID alignment between MCP flush/finalize and listed OTS rows is still not proven fixed in live evidence. +2. This is tracked as an active blocker for the “single stable OTS ID per session” guarantee. + +## 2026-03-23 Bounded Re-Proof (Current) +- **Tenant**: `gepa-live-20260323-125726` +- **Primary run**: `EvolutionRun('evo-live-20260323-125726')` +- **Artifacts dir**: `/tmp/gepa_run_20260323-125726` + +### What was proven in this run +1. **WASM + secret setup path worked**: + - 12/12 module uploads succeeded (`wasm_upload_results.json`). + - Tenant secret `anthropic_api_key` stored successfully (`put_secret_anthropic_code.txt = 204`). +2. **Real OTS generation path worked**: + - Real `temper mcp` sessions produced success/partial/failed trajectories. + - `flush_trajectory()` returned concrete trajectory IDs for each session. +3. **OTS row-vs-payload ID mismatch is fixed in this isolated DB run**: + - `ots_id_consistency_summary.json`: + - `total_rows = 3` + - `matching_rows = 3` + - `mismatching_rows = 0` + - This shows persisted row `trajectory_id` now matches payload `$.trajectory_id`. + +### What failed in this run +1. The GEPA run reached `Proposing` and then failed: + - Event trail: + - `Created -> Start -> SelectCandidate -> RecordEvaluation -> RecordDataset -> Fail` + - Failure payload: + - `error = "authorization denied for http_call: no matching permit policy"` + - `integration = "propose_mutation"` + - `authz_denied = true` +2. Because proposer failed at `Proposing`, this run did not reach: + - `RecordMutation` + - `RecordVerificationPass` + - `RecordFrontierAutoApprove` + - `Deploy` +3. This run therefore cannot be used to re-prove auto verify/deploy or unmet-intent persistence; those remain proven by the prior successful run (`evo-live-20260323-121726-v3`). + +## 2026-03-23 Consolidated Full-Loop Re-Proof (All Three Aspects) +- **Tenant**: `gepa-live-20260323-134346` +- **Run**: `EvolutionRun('evo-live-20260323-134346')` +- **Artifacts dir**: `/tmp/gepa_run_20260323-134346` +- **Terminal status**: `Completed` + +### End-to-end path proven in this run +- `Created -> Start -> SelectCandidate -> RecordEvaluation -> RecordDataset -> RecordMutation -> RecordVerificationPass -> RecordScore -> RecordFrontierAutoApprove -> Deploy` +- Final run fields include: + - `VerificationReport = "verification passed: 4 levels passed"` + - `DeploymentId = "gepa-deploy-evo-live-20260323-134346-m1"` + - `UnmetIntentReport.attempted = 4`, `reported = 4`, `failed = 0` + +### The three requested aspects are now proven together +1. **Unmet-intent storage during optimizer flow** + - Missing-capability suggestions were surfaced by reflective/proposer and persisted via `/api/evolution/trajectories/unmet`. + - Evidence: + - `UnmetIntentReport` in final `EvolutionRun` fields with all reports successful. + - DB rows (`trajectories`) with `source = Platform`, `intent != null` for this tenant (`4` rows). + - This confirms unmet-intent handoff is not dropped when GEPA continues optimizer flow. + +2. **OTS trajectory ID consistency (flush vs stored rows)** + - `ots_id_consistency_summary.json` reports: + - `total_rows = 3` + - `matching_rows = 3` + - `mismatching_rows = 0` + - Flush IDs now match persisted OTS row payload IDs in this live run. + +3. **No manual verification/deploy steering** + - Verifier and deploy steps fired from integrations automatically and reached `Completed`. + - No manual `RecordVerificationPass`, `Approve`, or `Deploy` action calls were needed. + +### Root causes fixed to get this full loop green +1. **WASM Cedar authz tenant scope** + - `CedarWasmAuthzGate` now uses tenant-scoped authorization (`authorize_for_tenant_or_bypass`) for `http_call` and `access_secret`. +2. **WASM HTTP policy context mismatch** + - Evolution policy switched from `resource.domain` to `context.domain` for HTTP-call host checks. +3. **Internal API auth for proposer/verifier** + - `gepa-proposer-agent` and `gepa-verify` now attach `Authorization: Bearer ...` using: + - integration config (`temper_api_key = {secret:temper_api_key}`), plus + - fallback `get_secret("temper_api_key")`. +4. **Policy permissions for proposer/verifier ops** + - Added Cedar permits for: + - `http_call` from `gepa-proposer-agent` and `gepa-verify` to localhost + - `access_secret` for those modules + - `write_trajectories` for proposer (`Agent::"gepa-proposer-agent"`) so unmet intents persist. +5. **State-machine terminal handling on verifier faults** + - `EvolutionRun.Fail` now allows `from = "Verifying"` so verifier integration failures terminate cleanly instead of stalling. + +### Remaining caveat observed (non-blocking for this proof) +- `sandbox_provisioner` logs `TemperFS setup failed: Workspace creation failed (HTTP 404)` during TemperAgent provisioning in this environment. +- Despite that warning, the GEPA run still completed end-to-end (proposer response returned, verifier passed, deploy completed). + ## Executive Result 1. Real OTS trajectories were generated by real `temper mcp` sessions (no fabricated JSON). 2. `SelectCandidate` was executed without `TrajectoryActions` and without `Trajectories`; replay still consumed OTS from server-side auto-injection. 3. `gepa-replay` produced workflow-level results (`workflows[]`, `workflow_completion_rate`, `partial_adjusted_rate`) and action-level aggregates. 4. `gepa-reflective` produced workflow-level triplets and cross-trajectory patterns (missing capabilities, common failure points, successful patterns). -5. The run failed in proposer (`Proposing -> Failed`) because Anthropic returned `401 invalid x-api-key`. -6. Because proposer failed, mutation/verify/score/frontier/deploy were not reached in this run. +5. Latest consolidated run (`evo-live-20260323-134346`) completed end-to-end through verify, score, frontier update, and deploy. +6. Unmet-intent handoff now persists successfully during optimizer flow (`attempted=4`, `reported=4`, `failed=0`) while GEPA remains optimizer-only. +7. OTS row ID and payload trajectory ID matched for all seeded trajectories in the latest run (`3/3`). +8. Historical failures (proposer authz/401, verifier authz, `Fail` not valid from `Verifying`) are documented in prior sections and were resolved for the latest run. ## What "the run" means in this report A "run" here means one full `EvolutionRun` entity state-machine attempt from `Start` through terminal state (`Completed` or `Failed`). -For `evo-live-fresh-20260319-v4`, the terminal path was: -- `Evaluating -> Reflecting -> Proposing -> Failed` +For the latest consolidated proof run `evo-live-20260323-134346`, the terminal path was: +- `Created -> Start -> SelectCandidate -> RecordEvaluation -> RecordDataset -> RecordMutation -> RecordVerificationPass -> RecordScore -> RecordFrontierAutoApprove -> Deploy -> Completed` No manual trajectory payload was provided to `SelectCandidate`; OTS data came from tenant OTS storage. diff --git a/os-apps/evolution/evolution_run.ioa.toml b/os-apps/evolution/evolution_run.ioa.toml index f92bc5a7..4eb2e227 100644 --- a/os-apps/evolution/evolution_run.ioa.toml +++ b/os-apps/evolution/evolution_run.ioa.toml @@ -75,7 +75,7 @@ name = "RecordMutation" kind = "input" from = ["Proposing"] to = "Verifying" -effect = "increment mutation_attempts" +effect = [{ type = "increment", var = "mutation_attempts" }, { type = "trigger", name = "verify_candidate" }] params = ["MutatedSpecSource", "MutationSummary"] hint = "Record the LLM-proposed spec mutation." @@ -126,6 +126,7 @@ name = "RecordFrontierAutoApprove" kind = "input" from = ["Updating"] to = "Deploying" +effect = "trigger deploy_candidate" params = ["FrontierUpdateJson"] hint = "Frontier updated, auto-approved for deployment." @@ -142,6 +143,7 @@ name = "Approve" kind = "input" from = ["AwaitingApproval"] to = "Deploying" +effect = "trigger deploy_candidate" params = ["ApproverId"] hint = "Human or verified agent approves the evolution candidate." @@ -165,7 +167,7 @@ hint = "Spec hot-deployed via SpecRegistry::swap_table()." [[action]] name = "Fail" kind = "input" -from = ["Created", "Selecting", "Evaluating", "Reflecting", "Proposing", "Scoring", "Updating", "Deploying"] +from = ["Created", "Selecting", "Evaluating", "Reflecting", "Proposing", "Verifying", "Scoring", "Updating", "Deploying"] to = "Failed" params = ["FailureReason"] hint = "Unrecoverable error — evolution run failed." @@ -199,6 +201,8 @@ prompt = "You are the GEPA evolution agent. Read trigger_params.DatasetJson with [integration.config] temper_api_url = "http://127.0.0.1:4455" +temper_api_key = "{secret:temper_api_key}" +temper_api_key = "{secret:temper_api_key}" sandbox_url = "http://127.0.0.1:9999" model = "claude-sonnet-4-20250514" provider = "anthropic" @@ -210,6 +214,16 @@ tools_enabled = "" workdir = "/tmp/workspace" timeout_secs = "420" +[[integration]] +name = "verify_candidate" +trigger = "verify_candidate" +type = "wasm" +module = "gepa-verify" +on_failure = "Fail" + +[integration.config] +temper_api_url = "http://127.0.0.1:4455" + [[integration]] name = "score_candidate" trigger = "score_candidate" @@ -225,3 +239,10 @@ type = "wasm" module = "gepa-pareto" on_success = "RecordFrontier" on_failure = "Fail" + +[[integration]] +name = "deploy_candidate" +trigger = "deploy_candidate" +type = "wasm" +module = "gepa-deploy" +on_failure = "Fail" diff --git a/os-apps/evolution/policies/evolution.cedar b/os-apps/evolution/policies/evolution.cedar index 89a8b174..b27915ad 100644 --- a/os-apps/evolution/policies/evolution.cedar +++ b/os-apps/evolution/policies/evolution.cedar @@ -24,6 +24,7 @@ permit(principal, action == Action::"Fail", resource is EvolutionRun); // Read/list/create are open. permit(principal, action in [Action::"create", Action::"read", Action::"list"], resource is EvolutionRun); permit(principal, action in [Action::"create", Action::"read", Action::"list"], resource is SentinelMonitor); +permit(principal == Agent::"gepa-proposer-agent", action == Action::"write_trajectories", resource is Trajectory); // Only humans can approve in full-human mode (default). permit(principal, action == Action::"Approve", resource is EvolutionRun) @@ -56,6 +57,14 @@ permit( action == Action::"http_call", resource is HttpEndpoint ) when { - context.module == "gepa-proposer-agent" && - ["127.0.0.1", "localhost"].contains(resource.domain) + ["gepa-proposer-agent", "gepa-verify"].contains(context.module) && + ["127.0.0.1", "localhost"].contains(context.domain) +}; + +permit( + principal is Agent, + action == Action::"access_secret", + resource is Secret +) when { + ["gepa-proposer-agent", "gepa-verify"].contains(context.module) }; diff --git a/os-apps/temper-agent/specs/model.csdl.xml b/os-apps/temper-agent/specs/model.csdl.xml index 20e9c09d..487b0514 100644 --- a/os-apps/temper-agent/specs/model.csdl.xml +++ b/os-apps/temper-agent/specs/model.csdl.xml @@ -21,6 +21,7 @@ + @@ -40,6 +41,7 @@ + diff --git a/os-apps/temper-agent/specs/temper_agent.ioa.toml b/os-apps/temper-agent/specs/temper_agent.ioa.toml index 6249a711..98dac610 100644 --- a/os-apps/temper-agent/specs/temper_agent.ioa.toml +++ b/os-apps/temper-agent/specs/temper_agent.ioa.toml @@ -86,6 +86,11 @@ name = "sandbox_url" type = "string" initial = "" +[[state]] +name = "temper_api_url" +type = "string" +initial = "http://127.0.0.1:3000" + [[state]] name = "sandbox_id" type = "string" @@ -127,8 +132,8 @@ initial = "" name = "Configure" kind = "input" from = ["Created"] -params = ["system_prompt", "user_message", "model", "provider", "max_turns", "tools_enabled", "workdir", "sandbox_url"] -hint = "Configure agent with system prompt, user message (task), model, tool settings, and optional sandbox URL." +params = ["system_prompt", "user_message", "model", "provider", "max_turns", "tools_enabled", "workdir", "sandbox_url", "temper_api_url"] +hint = "Configure agent with system prompt, user message (task), model, tool settings, optional sandbox URL, and optional Temper API override." [[action]] name = "Provision" diff --git a/os-apps/temper-agent/wasm/llm_caller/src/lib.rs b/os-apps/temper-agent/wasm/llm_caller/src/lib.rs index 6b598af9..e4364dd8 100644 --- a/os-apps/temper-agent/wasm/llm_caller/src/lib.rs +++ b/os-apps/temper-agent/wasm/llm_caller/src/lib.rs @@ -129,7 +129,7 @@ anthropic_api_key (or api_key) for anthropic, openrouter_api_key (or api_key) fo .get("conversation_file_id") .and_then(|v| v.as_str()) .unwrap_or(""); - let temper_api_url = temper_api_url(&ctx); + let temper_api_url = resolve_temper_api_url(&ctx, &fields); let tenant = &ctx.tenant; // Read conversation — from TemperFS if file_id set, else inline state. @@ -310,13 +310,6 @@ fn normalize_provider(provider: &str) -> String { } } -fn temper_api_url(ctx: &Context) -> String { - match ctx.config.get("temper_api_url").map(String::as_str) { - Some(value) if !value.trim().is_empty() && !value.contains("{secret:") => value.to_string(), - _ => "http://127.0.0.1:3000".to_string(), - } -} - fn is_unresolved_secret_template(value: &str) -> bool { value.contains("{secret:") } @@ -1472,3 +1465,18 @@ fn write_conversation_to_temperfs( )) } } + +fn resolve_temper_api_url(ctx: &Context, fields: &Value) -> String { + fields + .get("temper_api_url") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) + .or_else(|| match ctx.config.get("temper_api_url").map(String::as_str) { + Some(value) if !value.trim().is_empty() && !value.contains("{secret:") => { + Some(value.to_string()) + } + _ => None, + }) + .unwrap_or_else(|| "http://127.0.0.1:3000".to_string()) +} diff --git a/os-apps/temper-agent/wasm/sandbox_provisioner/src/lib.rs b/os-apps/temper-agent/wasm/sandbox_provisioner/src/lib.rs index 25e98a18..9f820b55 100644 --- a/os-apps/temper-agent/wasm/sandbox_provisioner/src/lib.rs +++ b/os-apps/temper-agent/wasm/sandbox_provisioner/src/lib.rs @@ -41,8 +41,9 @@ pub extern "C" fn run(_ctx_ptr: i32, _ctx_len: i32) -> i32 { ), ); - // Create TemperFS Workspace + File for conversation storage - let temper_api_url = temper_api_url(&ctx); + // Create TemperFS Workspace + File for conversation storage. + // Prefer per-run override from Configure state, then integration config. + let temper_api_url = resolve_temper_api_url(&ctx, &fields); let entity_id = ctx .entity_state @@ -52,20 +53,12 @@ pub extern "C" fn run(_ctx_ptr: i32, _ctx_len: i32) -> i32 { let tenant = &ctx.tenant; - let fs_result = create_conversation_storage(&ctx, &temper_api_url, tenant, entity_id); - - let (workspace_id, conversation_file_id, file_manifest_id) = match fs_result { - Ok((ws, conv, manifest)) => (ws, conv, manifest), - Err(e) => { - ctx.log( - "warn", - &format!( - "sandbox_provisioner: TemperFS setup failed: {e}, falling back to inline" - ), - ); - (String::new(), String::new(), String::new()) - } - }; + let (workspace_id, conversation_file_id, file_manifest_id) = + create_conversation_storage(&ctx, &temper_api_url, tenant, entity_id).map_err(|e| { + format!( + "TemperFS bootstrap failed at {temper_api_url}/tdata (tenant={tenant}, agent={entity_id}): {e}. Ensure os-app 'temper-fs' is installed for this tenant and temper_api_url is correct." + ) + })?; // Return sandbox + TemperFS details to the state machine set_success_result( @@ -93,11 +86,19 @@ struct SandboxResult { sandbox_id: String, } -fn temper_api_url(ctx: &Context) -> String { - match ctx.config.get("temper_api_url").map(String::as_str) { - Some(value) if !value.trim().is_empty() && !value.contains("{secret:") => value.to_string(), - _ => "http://127.0.0.1:3000".to_string(), - } +fn resolve_temper_api_url(ctx: &Context, fields: &Value) -> String { + fields + .get("temper_api_url") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) + .or_else(|| match ctx.config.get("temper_api_url").map(String::as_str) { + Some(value) if !value.trim().is_empty() && !value.contains("{secret:") => { + Some(value.to_string()) + } + _ => None, + }) + .unwrap_or_else(|| "http://127.0.0.1:3000".to_string()) } /// Provision a sandbox. Priority order: diff --git a/os-apps/temper-agent/wasm/tool_runner/src/lib.rs b/os-apps/temper-agent/wasm/tool_runner/src/lib.rs index 17ff715e..9ebb1cee 100644 --- a/os-apps/temper-agent/wasm/tool_runner/src/lib.rs +++ b/os-apps/temper-agent/wasm/tool_runner/src/lib.rs @@ -82,12 +82,8 @@ pub extern "C" fn run(_ctx_ptr: i32, _ctx_len: i32) -> i32 { .get("conversation_file_id") .and_then(|v| v.as_str()) .unwrap_or(""); - // Temper API URL: read from integration config, default to localhost - let temper_api_url = ctx - .config - .get("temper_api_url") - .cloned() - .unwrap_or_else(|| "http://127.0.0.1:3000".to_string()); + // Temper API URL: prefer Configure override in state, then integration config. + let temper_api_url = resolve_temper_api_url(&ctx, &fields); let tenant = &ctx.tenant; // Read current conversation and append tool results @@ -1184,3 +1180,18 @@ fn sync_files_to_temperfs( Ok(synced_count) } + +fn resolve_temper_api_url(ctx: &Context, fields: &Value) -> String { + fields + .get("temper_api_url") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) + .or_else(|| { + ctx.config + .get("temper_api_url") + .filter(|s| !s.is_empty()) + .cloned() + }) + .unwrap_or_else(|| "http://127.0.0.1:3000".to_string()) +} diff --git a/os-apps/temper-agent/wasm/workspace_restorer/src/lib.rs b/os-apps/temper-agent/wasm/workspace_restorer/src/lib.rs index 189fac8f..b422ec10 100644 --- a/os-apps/temper-agent/wasm/workspace_restorer/src/lib.rs +++ b/os-apps/temper-agent/wasm/workspace_restorer/src/lib.rs @@ -63,11 +63,7 @@ pub extern "C" fn run(_ctx_ptr: i32, _ctx_len: i32) -> i32 { return Ok(()); } - let temper_api_url = ctx - .config - .get("temper_api_url") - .cloned() - .unwrap_or_else(|| "http://127.0.0.1:3000".to_string()); + let temper_api_url = resolve_temper_api_url(&ctx, &fields); let tenant = &ctx.tenant; let e2b = sandbox_url.contains("e2b.app") || sandbox_url.contains("e2b.dev"); @@ -245,3 +241,18 @@ fn url_encode(s: &str) -> String { } out } + +fn resolve_temper_api_url(ctx: &Context, fields: &Value) -> String { + fields + .get("temper_api_url") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) + .or_else(|| { + ctx.config + .get("temper_api_url") + .filter(|s| !s.is_empty()) + .cloned() + }) + .unwrap_or_else(|| "http://127.0.0.1:3000".to_string()) +} diff --git a/wasm-modules/gepa-proposer-agent/src/lib.rs b/wasm-modules/gepa-proposer-agent/src/lib.rs index c546f888..e837424f 100644 --- a/wasm-modules/gepa-proposer-agent/src/lib.rs +++ b/wasm-modules/gepa-proposer-agent/src/lib.rs @@ -101,7 +101,7 @@ temper_module! { .unwrap_or(3) .max(1); - let headers = vec![ + let mut headers = vec![ ("Content-Type".to_string(), "application/json".to_string()), ("X-Tenant-Id".to_string(), ctx.tenant.clone()), // Drive TemperAgent via Cedar-governed agent identity. @@ -112,6 +112,21 @@ temper_module! { ), ("x-temper-agent-type".to_string(), "supervisor".to_string()), ]; + let api_key = ctx + .config + .get("temper_api_key") + .cloned() + .or_else(|| ctx.config.get("api_key").cloned()) + .or_else(|| ctx.config.get("bearer_token").cloned()) + .filter(|s| !s.trim().is_empty()) + .or_else(|| ctx.get_secret("temper_api_key").ok()) + .filter(|s| !s.trim().is_empty()); + if let Some(api_key) = api_key { + headers.push(( + "Authorization".to_string(), + format!("Bearer {}", api_key.trim()), + )); + } let system_prompt = ctx .config @@ -212,6 +227,11 @@ Return valid compact JSON in one line with non-empty MutatedSpecSource and Mutat spec_source, &payload.mutated_spec_source, ); + let unmet_handoff = collect_unmet_intent_handoff( + &dataset_missing_capabilities, + &payload.unmet_intent_suggestions, + &gate, + ); if gate.allowed { let mut out = json!({ "MutatedSpecSource": payload.mutated_spec_source, @@ -219,23 +239,40 @@ Return valid compact JSON in one line with non-empty MutatedSpecSource and Mutat "ProposerType": "temper_agent", "ProposerAgentId": created_agent_id, }); - if !payload.unmet_intent_suggestions.is_empty() { + if !unmet_handoff.is_empty() { + let report_reason = + "GEPA detected unmet capabilities during optimizer mutation"; + let report_outcomes = report_unmet_intents( + &ctx, + &base_url, + &headers, + skill_name, + entity_type, + &unmet_handoff, + report_reason, + ); out["UnmetIntentSuggestions"] = Value::Array( - payload - .unmet_intent_suggestions + unmet_handoff .iter() .map(|s| Value::String(s.clone())) .collect(), ); + out["UnmetIntentHandoff"] = Value::Array( + unmet_handoff + .iter() + .map(|s| Value::String(s.clone())) + .collect(), + ); + out["UnmetIntentReport"] = report_outcomes; + out["HasUnmetIntentHandoff"] = Value::Bool(true); } return Ok(out); } let gate_reasons = gate.reasons(); - let handoff = collect_unmet_intent_handoff( - &dataset_missing_capabilities, - &payload.unmet_intent_suggestions, - &gate, + let report_reason = format!( + "GEPA optimizer-only gate blocked structural mutation: {}", + gate_reasons.join("; ") ); let report_outcomes = report_unmet_intents( &ctx, @@ -243,15 +280,15 @@ Return valid compact JSON in one line with non-empty MutatedSpecSource and Mutat &headers, skill_name, entity_type, - &handoff, - &gate_reasons, + &unmet_handoff, + &report_reason, ); let summary = format!( "Optimizer-only GEPA gate rejected structural mutation ({}). \ Forwarded {} unmet-intent handoff items; returning no-op mutation for GEPA.", gate_reasons.join("; "), - handoff.len() + unmet_handoff.len() ); ctx.log("warn", &summary); return Ok(json!({ @@ -260,7 +297,7 @@ Forwarded {} unmet-intent handoff items; returning no-op mutation for GEPA.", "ProposerType": "temper_agent", "ProposerAgentId": created_agent_id, "RequiresUnmetIntentLoop": true, - "UnmetIntentHandoff": handoff, + "UnmetIntentHandoff": unmet_handoff, "UnmetIntentReport": report_outcomes, "OptimizerOnlyGate": { "blocked": true, @@ -799,7 +836,7 @@ fn report_unmet_intents( skill_name: &str, entity_type: &str, intents: &[String], - gate_reasons: &[String], + reason: &str, ) -> Value { if intents.is_empty() { return json!({ @@ -814,10 +851,6 @@ fn report_unmet_intents( let mut reported = 0usize; let mut failed = 0usize; let mut details = Vec::new(); - let reason = format!( - "GEPA optimizer-only gate blocked structural mutation: {}", - gate_reasons.join("; ") - ); for intent in intents { let payload = json!({ diff --git a/wasm-modules/gepa-verify/Cargo.toml b/wasm-modules/gepa-verify/Cargo.toml new file mode 100644 index 00000000..8bae3c12 --- /dev/null +++ b/wasm-modules/gepa-verify/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "gepa-verify-module" +version = "0.1.0" +edition = "2024" + +[lib] +crate-type = ["cdylib"] + +[dependencies] +serde_json = "1" +temper-wasm-sdk = { path = "../../crates/temper-wasm-sdk" } diff --git a/wasm-modules/gepa-verify/src/lib.rs b/wasm-modules/gepa-verify/src/lib.rs new file mode 100644 index 00000000..3b2d9cc9 --- /dev/null +++ b/wasm-modules/gepa-verify/src/lib.rs @@ -0,0 +1,199 @@ +//! GEPA verification bridge module. +//! +//! Runs server-side verification for the mutated spec's entity and routes +//! callback to `RecordVerificationPass` or `RecordVerificationFailure`. + +use serde_json::Value; +use temper_wasm_sdk::prelude::*; + +#[unsafe(no_mangle)] +pub extern "C" fn run(_ctx_ptr: i32, _ctx_len: i32) -> i32 { + let result = (|| -> Result<(String, Value), String> { + let ctx = Context::from_host().map_err(|e| e.to_string())?; + execute(ctx) + })(); + + match result { + Ok((action, params)) => set_success_result(&action, ¶ms), + Err(e) => set_error_result(&e), + } + + 0 +} + +fn execute(ctx: Context) -> Result<(String, Value), String> { + let fields = ctx.entity_state.get("fields").unwrap_or(&ctx.entity_state); + let mutated_spec = fields + .get("MutatedSpecSource") + .and_then(Value::as_str) + .or_else(|| ctx.trigger_params.get("MutatedSpecSource").and_then(Value::as_str)) + .ok_or("missing MutatedSpecSource in EvolutionRun state")?; + + let entity_name = + parse_automaton_name(mutated_spec).ok_or("unable to parse [automaton].name from MutatedSpecSource")?; + + let base_url = ctx + .config + .get("temper_api_url") + .cloned() + .unwrap_or_else(|| "http://127.0.0.1:3000".to_string()); + let url = format!("{base_url}/observe/verify/{entity_name}"); + + let mut headers = vec![ + ("Content-Type".to_string(), "application/json".to_string()), + ("X-Tenant-Id".to_string(), ctx.tenant.clone()), + ("x-temper-principal-kind".to_string(), "admin".to_string()), + ("x-temper-principal-id".to_string(), "gepa-verify".to_string()), + ]; + let api_key = ctx + .config + .get("temper_api_key") + .cloned() + .or_else(|| ctx.config.get("api_key").cloned()) + .or_else(|| ctx.config.get("bearer_token").cloned()) + .filter(|s| !s.trim().is_empty()) + .or_else(|| ctx.get_secret("temper_api_key").ok()) + .filter(|s| !s.trim().is_empty()); + if let Some(api_key) = api_key { + headers.push(( + "Authorization".to_string(), + format!("Bearer {}", api_key.trim()), + )); + } + + let resp = ctx.http_call("POST", &url, &headers, "")?; + if resp.status != 200 { + return Err(format!( + "verification request failed for entity '{entity_name}': HTTP {} {}", + resp.status, resp.body + )); + } + + let parsed: Value = + serde_json::from_str(&resp.body).map_err(|e| format!("failed to parse verification response JSON: {e}"))?; + let all_passed = parsed + .get("all_passed") + .and_then(Value::as_bool) + .unwrap_or(false); + let summary = summarize_verification(&parsed); + + if all_passed { + Ok(( + "RecordVerificationPass".to_string(), + json!({ + "VerificationReport": summary, + }), + )) + } else { + Ok(( + "RecordVerificationFailure".to_string(), + json!({ + "VerificationErrors": summary, + }), + )) + } +} + +fn parse_automaton_name(spec: &str) -> Option { + let mut in_automaton = false; + for raw in spec.lines() { + let line = raw.trim(); + if line == "[automaton]" { + in_automaton = true; + continue; + } + if in_automaton { + if line.starts_with('[') { + return None; + } + if line.starts_with("name") { + return extract_first_quoted(line); + } + } + } + None +} + +fn extract_first_quoted(line: &str) -> Option { + let mut start = None; + for (idx, ch) in line.char_indices() { + if ch == '"' { + if let Some(s) = start { + if idx > s + 1 { + return Some(line[s + 1..idx].to_string()); + } + start = None; + } else { + start = Some(idx); + } + } + } + None +} + +fn summarize_verification(parsed: &Value) -> String { + let all_passed = parsed + .get("all_passed") + .and_then(Value::as_bool) + .unwrap_or(false); + let levels = parsed + .get("levels") + .and_then(Value::as_array) + .cloned() + .unwrap_or_default(); + + let mut failed = Vec::new(); + let mut passed = 0usize; + for level in levels { + let name = level + .get("level") + .and_then(Value::as_str) + .unwrap_or("unknown"); + let is_passed = level + .get("passed") + .and_then(Value::as_bool) + .unwrap_or(false); + if is_passed { + passed += 1; + } else { + let summary = level + .get("summary") + .and_then(Value::as_str) + .unwrap_or("failed"); + failed.push(format!("{name}: {summary}")); + } + } + + if all_passed { + format!("verification passed: {passed} levels passed") + } else if failed.is_empty() { + "verification failed: no detailed levels returned".to_string() + } else { + format!("verification failed: {}", failed.join("; ")) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_automaton_name_reads_name() { + let src = "[automaton]\nname = \"Issue\"\nstates=[\"A\"]"; + assert_eq!(parse_automaton_name(src).as_deref(), Some("Issue")); + } + + #[test] + fn summarize_verification_failure_lists_levels() { + let v = json!({ + "all_passed": false, + "levels": [ + {"level": "L0", "passed": true, "summary": "ok"}, + {"level": "L1", "passed": false, "summary": "counterexample"}, + ] + }); + let s = summarize_verification(&v); + assert!(s.contains("verification failed")); + assert!(s.contains("L1")); + } +}