Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions apps/decodex/src/orchestrator/agent_evidence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ struct AgentRunCapsule {
interactive_requested: bool,
process_id: Option<u32>,
process_alive: Option<bool>,
process_liveness_reason: Option<String>,
event_count: i64,
last_event_type: Option<String>,
last_event_at: Option<String>,
Expand Down Expand Up @@ -588,10 +589,11 @@ fn agent_run_capsule(
turn_id: run.turn_id.clone(),
thread_status: run.thread_status.clone(),
thread_active_flags: run.thread_active_flags.clone(),
interactive_requested: run.interactive_requested,
process_id: run.process_id,
process_alive: run.process_alive,
event_count: run.event_count,
interactive_requested: run.interactive_requested,
process_id: run.process_id,
process_alive: run.process_alive,
process_liveness_reason: run.process_liveness_reason.clone(),
event_count: run.event_count,
last_event_type: run.last_event_type.clone(),
last_event_at: run.last_event_at.clone(),
last_run_activity_at: run.last_run_activity_at.clone(),
Expand Down
35 changes: 33 additions & 2 deletions apps/decodex/src/orchestrator/operator_dashboard.html
Original file line number Diff line number Diff line change
Expand Up @@ -4787,7 +4787,12 @@ <h2 id="recent-title">Run History</h2>
facts.push(["Patch", "retained"]);
}
if (attention.process_alive != null) {
facts.push(["Process", attention.process_alive ? "alive" : "stopped"]);
facts.push([
"Process",
attention.process_alive
? "alive"
: processLivenessReasonLabel(attention.process_liveness_reason || "process_stopped"),
]);
}
if (attention.worktree_path) {
facts.push(["Worktree", attention.worktree_path]);
Expand Down Expand Up @@ -5355,7 +5360,33 @@ <h2 id="recent-title">Run History</h2>
if (run.process_alive == null) {
return `${run.process_id} (unknown)`;
}
return `${run.process_id} (${run.process_alive ? "alive" : "stopped"})`;
const reason = run.process_alive
? "process_alive"
: run.process_liveness_reason || "process_stopped";
return `${run.process_id} (${processLivenessReasonLabel(reason)})`;
}

function processLivenessReasonLabel(reason) {
switch (reason) {
case "process_alive":
return "alive";
case "process_stopped":
return "stopped";
case "host_boot_id_missing":
return "boot identity missing";
case "host_boot_id_unavailable":
return "boot identity unavailable";
case "host_boot_id_mismatch":
return "previous boot";
case "process_start_identity_missing":
return "start identity missing";
case "process_start_identity_unavailable":
return "start identity unavailable";
case "process_start_identity_mismatch":
return "process identity changed";
default:
return reason ? reason.replaceAll("_", " ") : "unknown";
}
}

function runExecutionLivenessSummary(run) {
Expand Down
28 changes: 22 additions & 6 deletions apps/decodex/src/orchestrator/prompting.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
const PROMPT_ONLY_INTERNAL_REVIEW_INSTRUCTION: &str =
"Review your work repeatedly and fix any logic bugs until no new issues are found.";

fn build_retry_recovery_context(dispatch_mode: IssueDispatchMode) -> Option<String> {
(dispatch_mode == IssueDispatchMode::Retry).then(|| {
String::from(
"Recovery context\n- This is retry-style re-entry after a prior attempt stopped or could not prove live execution.\n- Treat the current worktree, tracker state, protocol events, and marker files as the durable source of truth. Do not assume in-memory model output or tool results survived.\n- Before editing, inspect the current branch, diff, and recent validation evidence, reconcile partial work already present, and continue from that state instead of restarting from scratch.",
)
})
}

fn review_pull_request_title(issue: &TrackerIssue) -> String {
let title = issue.title.trim();
let prefix = format!("{}:", issue.identifier);
Expand Down Expand Up @@ -56,6 +64,10 @@ where
"Commit contract\n- When you create a local commit for this lane, use a single-line `decodex/commit/1` JSON commit message.\n- Required fields: `schema`, `summary`, and `authority`.\n- `authority` must be the authoritative Linear issue identifier for this lane.\n- Optional fields: `related` and `breaking`.\n- Do not encode landing mode, CI status, closeout state, or other process-state fields in the commit message.",
));

if let Some(recovery_context) = build_retry_recovery_context(issue_run.dispatch_mode) {
sections.push(recovery_context);
}

let repair_architecture_guidance =
build_external_repair_architecture_guidance(project, state_store, issue_run);
let completed_state = workflow
Expand Down Expand Up @@ -150,6 +162,9 @@ where
.tracker()
.resolved_completed_state();
let internal_review_mode = project.codex().internal_review_mode();
let recovery_context = build_retry_recovery_context(issue_run.dispatch_mode)
.map(|section| format!("{section}\n\n"))
.unwrap_or_default();

match issue_run.dispatch_mode {
IssueDispatchMode::ReviewRepair => format!(
Expand Down Expand Up @@ -188,12 +203,13 @@ where
label_tool = ISSUE_LABEL_ADD_TOOL_NAME,
continuation_guidance = continuation_guidance,
),
_ => format!(
"Resolve Linear issue {identifier}: {title}\n\nDescription:\n{description}\n\nExecution checklist:\n- Move the issue to `{in_progress}` with `{transition_tool}`. Decodex already records the run-start Linear ledger, so do not leave a separate start comment.\n- Update `{progress_checkpoint_tool}` whenever the execution phase, focus, next action, blockers, evidence, or verification state changes materially.\n- Keep discovery bounded to the minimal implementation files needed for this issue; defer broader docs or upstream reading unless a concrete ambiguity blocks the change.\n- Implement the fix in the current worktree.\n{internal_review_guidance}- Treat failures from repo-native `canonicalize_commands`, `verify_commands`, or tracked rewrites left by that repo gate as continued repair by default: keep fixing the lane and rerun the gate instead of taking `manual_attention` unless the blocker is clearly toolchain, environment, or operator-owned.\n- Run the repository validation needed to justify a reviewable PR.\n- Commit the lane, push branch `{branch}`, and create or update a non-draft PR titled `{pr_title}` for that branch.\n{completion_guidance}- If the issue needs manual attention, add label `{needs_attention}` with `{label_tool}`, explain why in a comment, and then call `{terminal_finalize_tool}` with path `manual_attention`. Do not call `{review_handoff_tool}` in that case; `decodex` will stop the lane as a human-required failure without automatic retry.\n- Do not move the issue directly to `{success}` with `{transition_tool}`; `decodex` will finish that writeback after its own validation passes.\n- Do not report the run as complete or treat `{progress_checkpoint_tool}` as terminal completion until `{terminal_finalize_tool}` succeeds.{continuation_guidance}",
identifier = issue.identifier,
title = issue.title,
description = description,
transition_tool = ISSUE_TRANSITION_TOOL_NAME,
_ => format!(
"Resolve Linear issue {identifier}: {title}\n\nDescription:\n{description}\n\n{recovery_context}Execution checklist:\n- Move the issue to `{in_progress}` with `{transition_tool}`. Decodex already records the run-start Linear ledger, so do not leave a separate start comment.\n- Update `{progress_checkpoint_tool}` whenever the execution phase, focus, next action, blockers, evidence, or verification state changes materially.\n- Keep discovery bounded to the minimal implementation files needed for this issue; defer broader docs or upstream reading unless a concrete ambiguity blocks the change.\n- Implement the fix in the current worktree.\n{internal_review_guidance}- Treat failures from repo-native `canonicalize_commands`, `verify_commands`, or tracked rewrites left by that repo gate as continued repair by default: keep fixing the lane and rerun the gate instead of taking `manual_attention` unless the blocker is clearly toolchain, environment, or operator-owned.\n- Run the repository validation needed to justify a reviewable PR.\n- Commit the lane, push branch `{branch}`, and create or update a non-draft PR titled `{pr_title}` for that branch.\n{completion_guidance}- If the issue needs manual attention, add label `{needs_attention}` with `{label_tool}`, explain why in a comment, and then call `{terminal_finalize_tool}` with path `manual_attention`. Do not call `{review_handoff_tool}` in that case; `decodex` will stop the lane as a human-required failure without automatic retry.\n- Do not move the issue directly to `{success}` with `{transition_tool}`; `decodex` will finish that writeback after its own validation passes.\n- Do not report the run as complete or treat `{progress_checkpoint_tool}` as terminal completion until `{terminal_finalize_tool}` succeeds.{continuation_guidance}",
identifier = issue.identifier,
title = issue.title,
description = description,
recovery_context = recovery_context,
transition_tool = ISSUE_TRANSITION_TOOL_NAME,
label_tool = ISSUE_LABEL_ADD_TOOL_NAME,
progress_checkpoint_tool = ISSUE_PROGRESS_CHECKPOINT_TOOL_NAME,
review_handoff_tool = ISSUE_REVIEW_HANDOFF_TOOL_NAME,
Expand Down
81 changes: 72 additions & 9 deletions apps/decodex/src/orchestrator/status.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,20 @@ impl PostReviewOrchestrationStatus {
struct OperatorRunTiming {
process_id: Option<u32>,
process_alive: Option<bool>,
process_liveness_reason: Option<String>,
last_run_activity_unix_epoch: Option<i64>,
last_protocol_activity_unix_epoch: Option<i64>,
last_progress_unix_epoch: Option<i64>,
idle_for_seconds: Option<i64>,
protocol_idle_for_seconds: Option<i64>,
}

#[derive(Clone, Copy)]
struct MarkerProcessLiveness {
alive: bool,
reason: &'static str,
}

struct OperatorRunAppServerState {
thread_id: Option<String>,
turn_id: Option<String>,
Expand Down Expand Up @@ -1536,6 +1543,7 @@ where
retry_budget_attempts,
worktree_has_tracked_changes,
);
let process_liveness = marker.as_ref().and_then(marker_process_liveness_for_marker);

Ok(Some(OperatorQueuedIssueAttentionStatus {
summary,
Expand Down Expand Up @@ -1572,7 +1580,8 @@ where
.and_then(RunActivityMarker::last_event_type)
.map(str::to_owned),
event_count: marker.as_ref().map_or(0, RunActivityMarker::event_count),
process_alive: marker.as_ref().and_then(|marker| marker.process_id().map(process_is_alive)),
process_alive: process_liveness.map(|liveness| liveness.alive),
process_liveness_reason: process_liveness.map(|liveness| liveness.reason.to_owned()),
worktree_path: worktree_path
.exists()
.then(|| relative_worktree_path_for_path(project, &worktree_path)),
Expand Down Expand Up @@ -3274,13 +3283,63 @@ fn recoverable_worktree_identifiers(worktree_root: &Path) -> crate::prelude::Res
}

fn worktree_activity_marker_is_fresh(marker: &RunActivityMarker, now_unix_epoch: i64) -> bool {
marker.process_id().is_some_and(process_is_alive)
marker_process_is_alive(marker)
&& marker
.last_activity_unix_epoch()
.and_then(|last_activity| observed_idle_duration(last_activity, now_unix_epoch))
.is_some_and(|idle_for| idle_for < ACTIVE_RUN_IDLE_TIMEOUT)
}

fn marker_process_is_alive(marker: &RunActivityMarker) -> bool {
marker_process_liveness(marker).alive
}

fn marker_process_liveness_for_marker(
marker: &RunActivityMarker,
) -> Option<MarkerProcessLiveness> {
marker.process_id().map(|_| marker_process_liveness(marker))
}

fn marker_process_liveness(marker: &RunActivityMarker) -> MarkerProcessLiveness {
let Some(process_id) = marker.process_id() else {
return MarkerProcessLiveness { alive: false, reason: "process_id_missing" };
};

if !process_is_alive(process_id) {
return MarkerProcessLiveness { alive: false, reason: "process_stopped" };
}

let Some(marker_host_boot_id) = marker.host_boot_id() else {
return MarkerProcessLiveness { alive: false, reason: "host_boot_id_missing" };
};
let Some(current_host_boot_id) = state::current_host_boot_id() else {
return MarkerProcessLiveness { alive: false, reason: "host_boot_id_unavailable" };
};

if marker_host_boot_id != current_host_boot_id.as_str() {
return MarkerProcessLiveness { alive: false, reason: "host_boot_id_mismatch" };
}

let Some(marker_process_start_identity) = marker.process_start_identity() else {
return MarkerProcessLiveness { alive: false, reason: "process_start_identity_missing" };
};
let Some(current_process_start_identity) = state::process_start_identity(process_id) else {
return MarkerProcessLiveness {
alive: false,
reason: "process_start_identity_unavailable",
};
};

if marker_process_start_identity != current_process_start_identity.as_str() {
return MarkerProcessLiveness {
alive: false,
reason: "process_start_identity_mismatch",
};
}

MarkerProcessLiveness { alive: true, reason: "process_alive" }
}

fn process_is_alive(process_id: u32) -> bool {
let Ok(process_id) = pid_t::try_from(process_id) else {
return false;
Expand Down Expand Up @@ -3409,11 +3468,12 @@ fn operator_run_status(
suspected_stall,
last_event_type: protocol_summary.last_event_type,
last_event_at: protocol_summary.last_event_at,
event_count: protocol_summary.event_count,
process_id: timing.process_id,
process_alive: timing.process_alive,
retry_kind,
next_retry_at: format_optional_unix_timestamp(retry_ready_at_unix_epoch),
event_count: protocol_summary.event_count,
process_id: timing.process_id,
process_alive: timing.process_alive,
process_liveness_reason: timing.process_liveness_reason,
retry_kind,
next_retry_at: format_optional_unix_timestamp(retry_ready_at_unix_epoch),
effective_model: app_server_state.effective_model,
effective_model_provider: app_server_state.effective_model_provider,
effective_cwd: app_server_state.effective_cwd,
Expand Down Expand Up @@ -3460,9 +3520,11 @@ fn operator_run_timing(
marker.and_then(RunActivityMarker::last_progress_unix_epoch),
last_protocol_activity_unix_epoch,
);
let process_liveness = marker.and_then(marker_process_liveness_for_marker);

OperatorRunTiming {
process_alive: process_id.map(process_is_alive),
process_alive: process_liveness.map(|liveness| liveness.alive),
process_liveness_reason: process_liveness.map(|liveness| liveness.reason.to_owned()),
process_id,
last_run_activity_unix_epoch,
last_protocol_activity_unix_epoch,
Expand Down Expand Up @@ -4696,7 +4758,7 @@ fn append_rendered_run(output: &mut String, run: &OperatorRunStatus) {
let accounts = render_accounts_summary(&run.accounts);

output.push_str(&format!(
"- run_id: {}\n project_id: {}\n issue_id: {}\n issue_identifier: {}\n title: {}\n attempt: {}\n status: {}\n attempt_status: {}\n phase: {}\n wait_reason: {}\n current_operation: {}\n active_lease: {}\n queue_lease_state: {}\n queue_lease: {}\n execution_liveness: {}\n freshness_at: {}\n freshness_source: {}\n timing: run_idle={} protocol_idle={} last_progress={} protocol_event={} events={}\n account: {}\n accounts: {}\n child_agent_activity: {}\n protocol_activity: {}\n context_pressure: {}\n thread_id: {}\n turn_id: {}\n thread_status: {}\n thread_active_flags: {}\n interactive_requested: {}\n continuation_pending: {}\n branch: {}\n worktree_path: {}\n updated_at: {}\n last_run_activity_at: {}\n last_protocol_activity_at: {}\n last_progress_at: {}\n idle_for_seconds: {}\n protocol_idle_for_seconds: {}\n suspected_stall: {}\n process_id: {}\n process_alive: {}\n retry_kind: {}\n next_retry_at: {}\n effective_model: {}\n effective_model_provider: {}\n effective_cwd: {}\n effective_approval_policy: {}\n effective_approvals_reviewer: {}\n effective_sandbox_mode: {}\n protocol_event: {}\n event_count: {}\n",
"- run_id: {}\n project_id: {}\n issue_id: {}\n issue_identifier: {}\n title: {}\n attempt: {}\n status: {}\n attempt_status: {}\n phase: {}\n wait_reason: {}\n current_operation: {}\n active_lease: {}\n queue_lease_state: {}\n queue_lease: {}\n execution_liveness: {}\n freshness_at: {}\n freshness_source: {}\n timing: run_idle={} protocol_idle={} last_progress={} protocol_event={} events={}\n account: {}\n accounts: {}\n child_agent_activity: {}\n protocol_activity: {}\n context_pressure: {}\n thread_id: {}\n turn_id: {}\n thread_status: {}\n thread_active_flags: {}\n interactive_requested: {}\n continuation_pending: {}\n branch: {}\n worktree_path: {}\n updated_at: {}\n last_run_activity_at: {}\n last_protocol_activity_at: {}\n last_progress_at: {}\n idle_for_seconds: {}\n protocol_idle_for_seconds: {}\n suspected_stall: {}\n process_id: {}\n process_alive: {}\n process_liveness_reason: {}\n retry_kind: {}\n next_retry_at: {}\n effective_model: {}\n effective_model_provider: {}\n effective_cwd: {}\n effective_approval_policy: {}\n effective_approvals_reviewer: {}\n effective_sandbox_mode: {}\n protocol_event: {}\n event_count: {}\n",
run.run_id,
run.project_id,
run.issue_id,
Expand Down Expand Up @@ -4744,6 +4806,7 @@ fn append_rendered_run(output: &mut String, run: &OperatorRunStatus) {
|| String::from("none"),
|value| if value { String::from("yes") } else { String::from("no") },
),
run.process_liveness_reason.as_deref().unwrap_or("none"),
run.retry_kind.as_deref().unwrap_or("none"),
run.next_retry_at.as_deref().unwrap_or("none"),
run.effective_model.as_deref().unwrap_or("none"),
Expand Down
Loading