hack-ink · yvette-carlisle · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/apps/decodex/src/orchestrator/agent_evidence.rs b/apps/decodex/src/orchestrator/agent_evidence.rs
@@ -149,6 +149,7 @@ struct AgentRunCapsule {
 	interactive_requested: bool,
 	process_id: Option<u32>,
 	process_alive: Option<bool>,
+	process_liveness_reason: Option<String>,
 	event_count: i64,
 	last_event_type: Option<String>,
 	last_event_at: Option<String>,
@@ -588,10 +589,11 @@ fn agent_run_capsule(
 		turn_id: run.turn_id.clone(),
 		thread_status: run.thread_status.clone(),
 		thread_active_flags: run.thread_active_flags.clone(),
-		interactive_requested: run.interactive_requested,
-		process_id: run.process_id,
-		process_alive: run.process_alive,
-		event_count: run.event_count,
+			interactive_requested: run.interactive_requested,
+			process_id: run.process_id,
+			process_alive: run.process_alive,
+			process_liveness_reason: run.process_liveness_reason.clone(),
+			event_count: run.event_count,
 		last_event_type: run.last_event_type.clone(),
 		last_event_at: run.last_event_at.clone(),
 		last_run_activity_at: run.last_run_activity_at.clone(),

diff --git a/apps/decodex/src/orchestrator/operator_dashboard.html b/apps/decodex/src/orchestrator/operator_dashboard.html
@@ -4787,7 +4787,12 @@ <h2 id="recent-title">Run History</h2>
 					facts.push(["Patch", "retained"]);
 				}
 				if (attention.process_alive != null) {
-					facts.push(["Process", attention.process_alive ? "alive" : "stopped"]);
+					facts.push([
+						"Process",
+						attention.process_alive
+							? "alive"
+							: processLivenessReasonLabel(attention.process_liveness_reason || "process_stopped"),
+					]);
 				}
 				if (attention.worktree_path) {
 					facts.push(["Worktree", attention.worktree_path]);
@@ -5355,7 +5360,33 @@ <h2 id="recent-title">Run History</h2>
 				if (run.process_alive == null) {
 					return `${run.process_id} (unknown)`;
 				}
-				return `${run.process_id} (${run.process_alive ? "alive" : "stopped"})`;
+				const reason = run.process_alive
+					? "process_alive"
+					: run.process_liveness_reason || "process_stopped";
+				return `${run.process_id} (${processLivenessReasonLabel(reason)})`;
+			}
+
+			function processLivenessReasonLabel(reason) {
+				switch (reason) {
+					case "process_alive":
+						return "alive";
+					case "process_stopped":
+						return "stopped";
+					case "host_boot_id_missing":
+						return "boot identity missing";
+					case "host_boot_id_unavailable":
+						return "boot identity unavailable";
+					case "host_boot_id_mismatch":
+						return "previous boot";
+					case "process_start_identity_missing":
+						return "start identity missing";
+					case "process_start_identity_unavailable":
+						return "start identity unavailable";
+					case "process_start_identity_mismatch":
+						return "process identity changed";
+					default:
+						return reason ? reason.replaceAll("_", " ") : "unknown";
+				}
 			}
 
 			function runExecutionLivenessSummary(run) {

diff --git a/apps/decodex/src/orchestrator/prompting.rs b/apps/decodex/src/orchestrator/prompting.rs
@@ -1,6 +1,14 @@
 const PROMPT_ONLY_INTERNAL_REVIEW_INSTRUCTION: &str =
 	"Review your work repeatedly and fix any logic bugs until no new issues are found.";
 
+fn build_retry_recovery_context(dispatch_mode: IssueDispatchMode) -> Option<String> {
+	(dispatch_mode == IssueDispatchMode::Retry).then(|| {
+		String::from(
+			"Recovery context\n- This is retry-style re-entry after a prior attempt stopped or could not prove live execution.\n- Treat the current worktree, tracker state, protocol events, and marker files as the durable source of truth. Do not assume in-memory model output or tool results survived.\n- Before editing, inspect the current branch, diff, and recent validation evidence, reconcile partial work already present, and continue from that state instead of restarting from scratch.",
+		)
+	})
+}
+
 fn review_pull_request_title(issue: &TrackerIssue) -> String {
 	let title = issue.title.trim();
 	let prefix = format!("{}:", issue.identifier);
@@ -56,6 +64,10 @@ where
 		"Commit contract\n- When you create a local commit for this lane, use a single-line `decodex/commit/1` JSON commit message.\n- Required fields: `schema`, `summary`, and `authority`.\n- `authority` must be the authoritative Linear issue identifier for this lane.\n- Optional fields: `related` and `breaking`.\n- Do not encode landing mode, CI status, closeout state, or other process-state fields in the commit message.",
 	));
 
+	if let Some(recovery_context) = build_retry_recovery_context(issue_run.dispatch_mode) {
+		sections.push(recovery_context);
+	}
+
 	let repair_architecture_guidance =
 		build_external_repair_architecture_guidance(project, state_store, issue_run);
 	let completed_state = workflow
@@ -150,6 +162,9 @@ where
 		.tracker()
 		.resolved_completed_state();
 	let internal_review_mode = project.codex().internal_review_mode();
+	let recovery_context = build_retry_recovery_context(issue_run.dispatch_mode)
+		.map(|section| format!("{section}\n\n"))
+		.unwrap_or_default();
 
 	match issue_run.dispatch_mode {
 			IssueDispatchMode::ReviewRepair => format!(
@@ -188,12 +203,13 @@ where
 			label_tool = ISSUE_LABEL_ADD_TOOL_NAME,
 			continuation_guidance = continuation_guidance,
 		),
-			_ => format!(
-				"Resolve Linear issue {identifier}: {title}\n\nDescription:\n{description}\n\nExecution checklist:\n- Move the issue to `{in_progress}` with `{transition_tool}`. Decodex already records the run-start Linear ledger, so do not leave a separate start comment.\n- Update `{progress_checkpoint_tool}` whenever the execution phase, focus, next action, blockers, evidence, or verification state changes materially.\n- Keep discovery bounded to the minimal implementation files needed for this issue; defer broader docs or upstream reading unless a concrete ambiguity blocks the change.\n- Implement the fix in the current worktree.\n{internal_review_guidance}- Treat failures from repo-native `canonicalize_commands`, `verify_commands`, or tracked rewrites left by that repo gate as continued repair by default: keep fixing the lane and rerun the gate instead of taking `manual_attention` unless the blocker is clearly toolchain, environment, or operator-owned.\n- Run the repository validation needed to justify a reviewable PR.\n- Commit the lane, push branch `{branch}`, and create or update a non-draft PR titled `{pr_title}` for that branch.\n{completion_guidance}- If the issue needs manual attention, add label `{needs_attention}` with `{label_tool}`, explain why in a comment, and then call `{terminal_finalize_tool}` with path `manual_attention`. Do not call `{review_handoff_tool}` in that case; `decodex` will stop the lane as a human-required failure without automatic retry.\n- Do not move the issue directly to `{success}` with `{transition_tool}`; `decodex` will finish that writeback after its own validation passes.\n- Do not report the run as complete or treat `{progress_checkpoint_tool}` as terminal completion until `{terminal_finalize_tool}` succeeds.{continuation_guidance}",
-				identifier = issue.identifier,
-				title = issue.title,
-				description = description,
-				transition_tool = ISSUE_TRANSITION_TOOL_NAME,
+				_ => format!(
+					"Resolve Linear issue {identifier}: {title}\n\nDescription:\n{description}\n\n{recovery_context}Execution checklist:\n- Move the issue to `{in_progress}` with `{transition_tool}`. Decodex already records the run-start Linear ledger, so do not leave a separate start comment.\n- Update `{progress_checkpoint_tool}` whenever the execution phase, focus, next action, blockers, evidence, or verification state changes materially.\n- Keep discovery bounded to the minimal implementation files needed for this issue; defer broader docs or upstream reading unless a concrete ambiguity blocks the change.\n- Implement the fix in the current worktree.\n{internal_review_guidance}- Treat failures from repo-native `canonicalize_commands`, `verify_commands`, or tracked rewrites left by that repo gate as continued repair by default: keep fixing the lane and rerun the gate instead of taking `manual_attention` unless the blocker is clearly toolchain, environment, or operator-owned.\n- Run the repository validation needed to justify a reviewable PR.\n- Commit the lane, push branch `{branch}`, and create or update a non-draft PR titled `{pr_title}` for that branch.\n{completion_guidance}- If the issue needs manual attention, add label `{needs_attention}` with `{label_tool}`, explain why in a comment, and then call `{terminal_finalize_tool}` with path `manual_attention`. Do not call `{review_handoff_tool}` in that case; `decodex` will stop the lane as a human-required failure without automatic retry.\n- Do not move the issue directly to `{success}` with `{transition_tool}`; `decodex` will finish that writeback after its own validation passes.\n- Do not report the run as complete or treat `{progress_checkpoint_tool}` as terminal completion until `{terminal_finalize_tool}` succeeds.{continuation_guidance}",
+					identifier = issue.identifier,
+					title = issue.title,
+					description = description,
+					recovery_context = recovery_context,
+					transition_tool = ISSUE_TRANSITION_TOOL_NAME,
 				label_tool = ISSUE_LABEL_ADD_TOOL_NAME,
 				progress_checkpoint_tool = ISSUE_PROGRESS_CHECKPOINT_TOOL_NAME,
 				review_handoff_tool = ISSUE_REVIEW_HANDOFF_TOOL_NAME,

diff --git a/apps/decodex/src/orchestrator/status.rs b/apps/decodex/src/orchestrator/status.rs
@@ -61,13 +61,20 @@ impl PostReviewOrchestrationStatus {
 struct OperatorRunTiming {
 	process_id: Option<u32>,
 	process_alive: Option<bool>,
+	process_liveness_reason: Option<String>,
 	last_run_activity_unix_epoch: Option<i64>,
 	last_protocol_activity_unix_epoch: Option<i64>,
 	last_progress_unix_epoch: Option<i64>,
 	idle_for_seconds: Option<i64>,
 	protocol_idle_for_seconds: Option<i64>,
 }
 
+#[derive(Clone, Copy)]
+struct MarkerProcessLiveness {
+	alive: bool,
+	reason: &'static str,
+}
+
 struct OperatorRunAppServerState {
 	thread_id: Option<String>,
 	turn_id: Option<String>,
@@ -1536,6 +1543,7 @@ where
 		retry_budget_attempts,
 		worktree_has_tracked_changes,
 	);
+	let process_liveness = marker.as_ref().and_then(marker_process_liveness_for_marker);
 
 	Ok(Some(OperatorQueuedIssueAttentionStatus {
 		summary,
@@ -1572,7 +1580,8 @@ where
 			.and_then(RunActivityMarker::last_event_type)
 			.map(str::to_owned),
 		event_count: marker.as_ref().map_or(0, RunActivityMarker::event_count),
-		process_alive: marker.as_ref().and_then(|marker| marker.process_id().map(process_is_alive)),
+		process_alive: process_liveness.map(|liveness| liveness.alive),
+		process_liveness_reason: process_liveness.map(|liveness| liveness.reason.to_owned()),
 		worktree_path: worktree_path
 			.exists()
 			.then(|| relative_worktree_path_for_path(project, &worktree_path)),
@@ -3274,13 +3283,63 @@ fn recoverable_worktree_identifiers(worktree_root: &Path) -> crate::prelude::Res
 }
 
 fn worktree_activity_marker_is_fresh(marker: &RunActivityMarker, now_unix_epoch: i64) -> bool {
-	marker.process_id().is_some_and(process_is_alive)
+	marker_process_is_alive(marker)
 		&& marker
 			.last_activity_unix_epoch()
 			.and_then(|last_activity| observed_idle_duration(last_activity, now_unix_epoch))
 			.is_some_and(|idle_for| idle_for < ACTIVE_RUN_IDLE_TIMEOUT)
 }
 
+fn marker_process_is_alive(marker: &RunActivityMarker) -> bool {
+	marker_process_liveness(marker).alive
+}
+
+fn marker_process_liveness_for_marker(
+	marker: &RunActivityMarker,
+) -> Option<MarkerProcessLiveness> {
+	marker.process_id().map(|_| marker_process_liveness(marker))
+}
+
+fn marker_process_liveness(marker: &RunActivityMarker) -> MarkerProcessLiveness {
+	let Some(process_id) = marker.process_id() else {
+		return MarkerProcessLiveness { alive: false, reason: "process_id_missing" };
+	};
+
+	if !process_is_alive(process_id) {
+		return MarkerProcessLiveness { alive: false, reason: "process_stopped" };
+	}
+
+	let Some(marker_host_boot_id) = marker.host_boot_id() else {
+		return MarkerProcessLiveness { alive: false, reason: "host_boot_id_missing" };
+	};
+	let Some(current_host_boot_id) = state::current_host_boot_id() else {
+		return MarkerProcessLiveness { alive: false, reason: "host_boot_id_unavailable" };
+	};
+
+	if marker_host_boot_id != current_host_boot_id.as_str() {
+		return MarkerProcessLiveness { alive: false, reason: "host_boot_id_mismatch" };
+	}
+
+	let Some(marker_process_start_identity) = marker.process_start_identity() else {
+		return MarkerProcessLiveness { alive: false, reason: "process_start_identity_missing" };
+	};
+	let Some(current_process_start_identity) = state::process_start_identity(process_id) else {
+		return MarkerProcessLiveness {
+			alive: false,
+			reason: "process_start_identity_unavailable",
+		};
+	};
+
+	if marker_process_start_identity != current_process_start_identity.as_str() {
+		return MarkerProcessLiveness {
+			alive: false,
+			reason: "process_start_identity_mismatch",
+		};
+	}
+
+	MarkerProcessLiveness { alive: true, reason: "process_alive" }
+}
+
 fn process_is_alive(process_id: u32) -> bool {
 	let Ok(process_id) = pid_t::try_from(process_id) else {
 		return false;
@@ -3409,11 +3468,12 @@ fn operator_run_status(
 		suspected_stall,
 		last_event_type: protocol_summary.last_event_type,
 		last_event_at: protocol_summary.last_event_at,
-		event_count: protocol_summary.event_count,
-		process_id: timing.process_id,
-		process_alive: timing.process_alive,
-		retry_kind,
-		next_retry_at: format_optional_unix_timestamp(retry_ready_at_unix_epoch),
+			event_count: protocol_summary.event_count,
+			process_id: timing.process_id,
+			process_alive: timing.process_alive,
+			process_liveness_reason: timing.process_liveness_reason,
+			retry_kind,
+			next_retry_at: format_optional_unix_timestamp(retry_ready_at_unix_epoch),
 		effective_model: app_server_state.effective_model,
 		effective_model_provider: app_server_state.effective_model_provider,
 		effective_cwd: app_server_state.effective_cwd,
@@ -3460,9 +3520,11 @@ fn operator_run_timing(
 		marker.and_then(RunActivityMarker::last_progress_unix_epoch),
 		last_protocol_activity_unix_epoch,
 	);
+	let process_liveness = marker.and_then(marker_process_liveness_for_marker);
 
 	OperatorRunTiming {
-		process_alive: process_id.map(process_is_alive),
+		process_alive: process_liveness.map(|liveness| liveness.alive),
+		process_liveness_reason: process_liveness.map(|liveness| liveness.reason.to_owned()),
 		process_id,
 		last_run_activity_unix_epoch,
 		last_protocol_activity_unix_epoch,
@@ -4696,7 +4758,7 @@ fn append_rendered_run(output: &mut String, run: &OperatorRunStatus) {
 	let accounts = render_accounts_summary(&run.accounts);
 
 	output.push_str(&format!(
-		"- run_id: {}\n  project_id: {}\n  issue_id: {}\n  issue_identifier: {}\n  title: {}\n  attempt: {}\n  status: {}\n  attempt_status: {}\n  phase: {}\n  wait_reason: {}\n  current_operation: {}\n  active_lease: {}\n  queue_lease_state: {}\n  queue_lease: {}\n  execution_liveness: {}\n  freshness_at: {}\n  freshness_source: {}\n  timing: run_idle={} protocol_idle={} last_progress={} protocol_event={} events={}\n  account: {}\n  accounts: {}\n  child_agent_activity: {}\n  protocol_activity: {}\n  context_pressure: {}\n  thread_id: {}\n  turn_id: {}\n  thread_status: {}\n  thread_active_flags: {}\n  interactive_requested: {}\n  continuation_pending: {}\n  branch: {}\n  worktree_path: {}\n  updated_at: {}\n  last_run_activity_at: {}\n  last_protocol_activity_at: {}\n  last_progress_at: {}\n  idle_for_seconds: {}\n  protocol_idle_for_seconds: {}\n  suspected_stall: {}\n  process_id: {}\n  process_alive: {}\n  retry_kind: {}\n  next_retry_at: {}\n  effective_model: {}\n  effective_model_provider: {}\n  effective_cwd: {}\n  effective_approval_policy: {}\n  effective_approvals_reviewer: {}\n  effective_sandbox_mode: {}\n  protocol_event: {}\n  event_count: {}\n",
+		"- run_id: {}\n  project_id: {}\n  issue_id: {}\n  issue_identifier: {}\n  title: {}\n  attempt: {}\n  status: {}\n  attempt_status: {}\n  phase: {}\n  wait_reason: {}\n  current_operation: {}\n  active_lease: {}\n  queue_lease_state: {}\n  queue_lease: {}\n  execution_liveness: {}\n  freshness_at: {}\n  freshness_source: {}\n  timing: run_idle={} protocol_idle={} last_progress={} protocol_event={} events={}\n  account: {}\n  accounts: {}\n  child_agent_activity: {}\n  protocol_activity: {}\n  context_pressure: {}\n  thread_id: {}\n  turn_id: {}\n  thread_status: {}\n  thread_active_flags: {}\n  interactive_requested: {}\n  continuation_pending: {}\n  branch: {}\n  worktree_path: {}\n  updated_at: {}\n  last_run_activity_at: {}\n  last_protocol_activity_at: {}\n  last_progress_at: {}\n  idle_for_seconds: {}\n  protocol_idle_for_seconds: {}\n  suspected_stall: {}\n  process_id: {}\n  process_alive: {}\n  process_liveness_reason: {}\n  retry_kind: {}\n  next_retry_at: {}\n  effective_model: {}\n  effective_model_provider: {}\n  effective_cwd: {}\n  effective_approval_policy: {}\n  effective_approvals_reviewer: {}\n  effective_sandbox_mode: {}\n  protocol_event: {}\n  event_count: {}\n",
 		run.run_id,
 		run.project_id,
 		run.issue_id,
@@ -4744,6 +4806,7 @@ fn append_rendered_run(output: &mut String, run: &OperatorRunStatus) {
 			|| String::from("none"),
 			|value| if value { String::from("yes") } else { String::from("no") },
 		),
+		run.process_liveness_reason.as_deref().unwrap_or("none"),
 		run.retry_kind.as_deref().unwrap_or("none"),
 		run.next_retry_at.as_deref().unwrap_or("none"),
 		run.effective_model.as_deref().unwrap_or("none"),