diff --git a/apps/decodex/src/orchestrator/daemon.rs b/apps/decodex/src/orchestrator/daemon.rs index 834cfa4..dd2733e 100644 --- a/apps/decodex/src/orchestrator/daemon.rs +++ b/apps/decodex/src/orchestrator/daemon.rs @@ -434,6 +434,8 @@ where worktree_mapping.as_ref(), )? { Some(ActiveRunDisposition::RetainedReviewComplete) + } else if stalled_run_has_retained_partial_progress(worktree_mapping.as_ref()) { + Some(ActiveRunDisposition::StalledRetainedPartialProgress { idle_for }) } else { Some(ActiveRunDisposition::Stalled { idle_for }) } diff --git a/apps/decodex/src/orchestrator/execution.rs b/apps/decodex/src/orchestrator/execution.rs index 5b21f22..cadae12 100644 --- a/apps/decodex/src/orchestrator/execution.rs +++ b/apps/decodex/src/orchestrator/execution.rs @@ -420,7 +420,8 @@ fn terminal_failure_lifecycle_event( issue_run: &IssueRunPlan, failure: TerminalFailureLifecycle<'_>, ) -> records::LinearExecutionEventRecord { - let event_type = if failure.manual_attention_requested { + let retained_partial_progress = failure.error_class == "partial_progress_retained"; + let event_type = if failure.manual_attention_requested || retained_partial_progress { "needs_attention" } else { "terminal_failure" @@ -447,12 +448,26 @@ fn terminal_failure_lifecycle_event( record.worktree_path = Some(failure.worktree_path.to_owned()); record.error_class = Some(failure.error_class.to_owned()); record.next_action = Some(failure.next_action.to_owned()); - record.blockers = Some(vec![format!("Run failed with `{}`.", failure.error_class)]); - record.evidence = Some(vec![format!( - "Attempt {} reached terminal failure handling.", - issue_run.attempt_number - )]); - record.summary = Some(String::from("Decodex run failed and needs attention.")); + + if retained_partial_progress { + record.blockers = Some(vec![String::from( + "Retained tracked worktree changes require operator recovery.", + )]); + record.evidence = Some(vec![format!( + "Attempt {} stopped with tracked worktree changes retained.", + issue_run.attempt_number + )]); + record.summary = Some(String::from("Decodex retained partial progress and needs attention.")); + record.terminal_path = Some(String::from("retained_partial_progress")); + } else { + record.blockers = Some(vec![format!("Run failed with `{}`.", failure.error_class)]); + record.evidence = Some(vec![format!( + "Attempt {} reached terminal failure handling.", + issue_run.attempt_number + )]); + record.summary = Some(String::from("Decodex run failed and needs attention.")); + } + record.pr_url = failure.pr_url.map(ToOwned::to_owned); record.target_state = Some(failure.target_state.to_owned()); @@ -1448,6 +1463,7 @@ fn run_failure_requires_terminal_attention(error: &Report) -> bool { error.downcast_ref::().is_some() || error.downcast_ref::().is_some() || error.downcast_ref::().is_some() + || error.downcast_ref::().is_some() || error.downcast_ref::().is_some() || error.downcast_ref::().is_some() || error.downcast_ref::().is_some() @@ -1626,6 +1642,7 @@ fn retained_partial_progress_error( fn terminal_failure_has_specific_error_class(error: &Report) -> bool { error.downcast_ref::().is_some() || error.downcast_ref::().is_some() + || error.downcast_ref::().is_some() || error.downcast_ref::().is_some() || error.downcast_ref::().is_some() || error.downcast_ref::().is_some() diff --git a/apps/decodex/src/orchestrator/reconciliation.rs b/apps/decodex/src/orchestrator/reconciliation.rs index 7fece89..f4a99ce 100644 --- a/apps/decodex/src/orchestrator/reconciliation.rs +++ b/apps/decodex/src/orchestrator/reconciliation.rs @@ -64,6 +64,8 @@ where worktree_mapping.as_ref(), )? { Some(ActiveRunDisposition::RetainedReviewComplete) + } else if stalled_run_has_retained_partial_progress(worktree_mapping.as_ref()) { + Some(ActiveRunDisposition::StalledRetainedPartialProgress { idle_for }) } else { Some(ActiveRunDisposition::Stalled { idle_for }) } @@ -150,12 +152,17 @@ where else { return Ok(Vec::new()); }; + let disposition = if stalled_run_has_retained_partial_progress(worktree_mapping.as_ref()) { + ActiveRunDisposition::StalledRetainedPartialProgress { idle_for } + } else { + ActiveRunDisposition::Stalled { idle_for } + }; Ok(vec![ActiveRunReconciliation { issue, run_attempt, worktree_mapping, - disposition: ActiveRunDisposition::Stalled { idle_for }, + disposition, workflow: workflow.clone(), }]) } @@ -241,6 +248,16 @@ where *idle_for, )?; }, + ActiveRunDisposition::StalledRetainedPartialProgress { idle_for } => { + reconcile_stalled_retained_partial_progress_run( + tracker, + project, + state_store, + worktree_manager, + &action, + *idle_for, + )?; + }, ActiveRunDisposition::StalledAlreadyNeedsAttention { idle_for } => { reconcile_stalled_attention_run(project, state_store, &action, *idle_for)?; }, @@ -361,6 +378,84 @@ where state_store.update_run_status(action.run_attempt.run_id(), "stalled")?; state_store.clear_lease(&action.issue.id)?; + let issue_run = stalled_reconciliation_issue_run(state_store, worktree_manager, action)?; + + write_reconciliation_operation_marker_best_effort( + &issue_run.worktree.path, + &issue_run.run_id, + issue_run.attempt_number, + RUN_OPERATION_RECONCILIATION, + ); + handle_failure( + tracker, + project, + &action.workflow, + state_store, + &issue_run, + &Report::new(StalledRunNeedsAttention { + issue_identifier: action.issue.identifier.clone(), + run_id: action.run_attempt.run_id().to_owned(), + idle_for, + }), + )?; + + Ok(()) +} + +fn reconcile_stalled_retained_partial_progress_run( + tracker: &T, + project: &ServiceConfig, + state_store: &StateStore, + worktree_manager: &WorktreeManager, + action: &ActiveRunReconciliation, + idle_for: Duration, +) -> Result<()> +where + T: IssueTracker, +{ + tracing::warn!( + project_id = project.service_id(), + issue_id = action.issue.id, + issue = action.issue.identifier, + run_id = action.run_attempt.run_id(), + disposition = "stalled_retained_partial_progress", + idle_for_s = idle_for.as_secs(), + "Reconciling stalled run with retained partial progress." + ); + + state_store.update_run_status(action.run_attempt.run_id(), "stalled")?; + state_store.clear_lease(&action.issue.id)?; + + let issue_run = stalled_reconciliation_issue_run(state_store, worktree_manager, action)?; + let worktree_path = relative_worktree_path(project, &issue_run.worktree); + + write_reconciliation_operation_marker_best_effort( + &issue_run.worktree.path, + &issue_run.run_id, + issue_run.attempt_number, + RUN_OPERATION_RECONCILIATION, + ); + handle_failure( + tracker, + project, + &action.workflow, + state_store, + &issue_run, + &Report::new(RetainedPartialProgress { + issue_identifier: action.issue.identifier.clone(), + run_id: action.run_attempt.run_id().to_owned(), + worktree_path, + }), + )?; + + Ok(()) +} + +fn stalled_reconciliation_issue_run( + state_store: &StateStore, + worktree_manager: &WorktreeManager, + action: &ActiveRunReconciliation, +) -> Result { let worktree = action.worktree_mapping.as_ref().map_or_else( || worktree_manager.plan_for_issue(&action.issue.identifier), |mapping| WorktreeSpec { @@ -372,7 +467,8 @@ where ); let retry_budget_base = retry_budget_base_for_issue_worktree(state_store, &action.issue.id, &worktree.path)?; - let issue_run = IssueRunPlan { + + Ok(IssueRunPlan { issue: action.issue.clone(), issue_state: planned_issue_state_for_dispatch( &action.workflow, @@ -388,28 +484,7 @@ where attempt_number: action.run_attempt.attempt_number(), run_id: action.run_attempt.run_id().to_owned(), retry_budget_base, - }; - - write_reconciliation_operation_marker_best_effort( - &issue_run.worktree.path, - &issue_run.run_id, - issue_run.attempt_number, - RUN_OPERATION_RECONCILIATION, - ); - handle_failure( - tracker, - project, - &action.workflow, - state_store, - &issue_run, - &Report::new(StalledRunNeedsAttention { - issue_identifier: action.issue.identifier.clone(), - run_id: action.run_attempt.run_id().to_owned(), - idle_for, - }), - )?; - - Ok(()) + }) } fn reconcile_stalled_attention_run( @@ -456,6 +531,15 @@ fn write_reconciliation_operation_marker_best_effort( } } +fn stalled_run_has_retained_partial_progress( + worktree_mapping: Option<&WorktreeMapping>, +) -> bool { + match worktree_mapping { + Some(mapping) => worktree_has_tracked_changes(mapping.worktree_path()), + None => false, + } +} + fn retained_review_handoff_matches_run( state_store: &StateStore, run_attempt: &RunAttempt, diff --git a/apps/decodex/src/orchestrator/run_cycle.rs b/apps/decodex/src/orchestrator/run_cycle.rs index ee9069b..bae03f6 100644 --- a/apps/decodex/src/orchestrator/run_cycle.rs +++ b/apps/decodex/src/orchestrator/run_cycle.rs @@ -2786,6 +2786,10 @@ where }; let disposition = if needs_attention { ActiveRunDisposition::StalledAlreadyNeedsAttention { idle_for } + } else if is_issue_active_for_run(issue, context.workflow) + && worktree_has_tracked_changes(worktree_mapping.worktree_path()) + { + ActiveRunDisposition::StalledRetainedPartialProgress { idle_for } } else if is_issue_active_for_run(issue, context.workflow) { ActiveRunDisposition::Stalled { idle_for } } else { diff --git a/apps/decodex/src/orchestrator/selection.rs b/apps/decodex/src/orchestrator/selection.rs index 5d1272b..b9a806f 100644 --- a/apps/decodex/src/orchestrator/selection.rs +++ b/apps/decodex/src/orchestrator/selection.rs @@ -154,10 +154,22 @@ fn format_terminal_failure_comment( next_action: &str, ) -> String { let pr_url_line = pr_url.map_or_else(String::new, |pr_url| format!("\n- pr_url: `{pr_url}`")); + let retained_partial_progress = error_class == "partial_progress_retained"; + let heading = if retained_partial_progress { + "decodex retained partial progress and needs attention" + } else { + "decodex run failed and needs attention" + }; + let timestamp_label = if retained_partial_progress { "recorded_at" } else { "failed_at" }; + let error_summary = if retained_partial_progress { + "Sensitive runtime details were withheld from the tracker comment; inspect the retained lane for the full recovery context." + } else { + "Sensitive runtime details were withheld from the tracker comment; inspect the local lane for the full failure context." + }; format!( - "decodex run failed and needs attention\n\n- run_id: `{run_id}`\n- attempt: `{attempt_number}`\n- failed_at: `{failed_at}`\n- branch: `{branch}`{pr_url_line}\n- worktree_path: `{worktree}`\n- error_class: `{error_class}`\n- next_action: `{next_action}`\n- error_summary: `Sensitive runtime details were withheld from the tracker comment; inspect the local lane for the full failure context.`", - failed_at = current_timestamp(), + "{heading}\n\n- run_id: `{run_id}`\n- attempt: `{attempt_number}`\n- {timestamp_label}: `{timestamp}`\n- branch: `{branch}`{pr_url_line}\n- worktree_path: `{worktree}`\n- error_class: `{error_class}`\n- next_action: `{next_action}`\n- error_summary: `{error_summary}`", + timestamp = current_timestamp(), branch = branch_name, worktree = worktree_path ) diff --git a/apps/decodex/src/orchestrator/tests/operator/status/queue.rs b/apps/decodex/src/orchestrator/tests/operator/status/queue.rs index 8b21036..44c9fdd 100644 --- a/apps/decodex/src/orchestrator/tests/operator/status/queue.rs +++ b/apps/decodex/src/orchestrator/tests/operator/status/queue.rs @@ -813,20 +813,23 @@ fn live_operator_status_snapshot_surfaces_stalled_retained_partial_progress() { tracker.issue_comments.borrow_mut().insert( issue.id.clone(), - vec![linear_execution_history_comment( - &issue, - "terminal_failure", - "2026-03-13T09:20:00Z", - "stalled-retained-partial-progress", - |record| { - record.error_class = Some(String::from("partial_progress_retained")); - record.next_action = Some(String::from( - "inspect retained worktree `.worktrees/PUB-110`, finish validation and PR handoff or reset the patch manually", - )); - record.summary = Some(String::from("Decodex run retained partial progress.")); - record.blockers = Some(vec![String::from( - "tracked worktree changes were retained after stalled reconciliation", - )]); + vec![linear_execution_history_comment( + &issue, + "needs_attention", + "2026-03-13T09:20:00Z", + "stalled-retained-partial-progress", + |record| { + record.error_class = Some(String::from("partial_progress_retained")); + record.next_action = Some(String::from( + "inspect retained worktree `.worktrees/PUB-110`, finish validation and PR handoff or reset the patch manually", + )); + record.terminal_path = Some(String::from("retained_partial_progress")); + record.summary = Some(String::from( + "Decodex retained partial progress and needs attention.", + )); + record.blockers = Some(vec![String::from( + "tracked worktree changes were retained after stalled reconciliation", + )]); record.evidence = Some(vec![String::from( "worktree `.worktrees/PUB-110` has tracked changes", )]); diff --git a/apps/decodex/src/orchestrator/tests/recovery/reconciliation.rs b/apps/decodex/src/orchestrator/tests/recovery/reconciliation.rs index e6051c0..5b5682a 100644 --- a/apps/decodex/src/orchestrator/tests/recovery/reconciliation.rs +++ b/apps/decodex/src/orchestrator/tests/recovery/reconciliation.rs @@ -1068,16 +1068,38 @@ fn stalled_run_reconciliation_routes_to_needs_attention_without_cleanup() { && comment.contains("needs attention") && comment.contains("clear label `decodex:needs-attention`") })); + assert!( + tracker + .comments + .borrow() + .iter() + .all(|comment| !comment.contains("retained partial progress")) + ); + + let ledger_event = tracker + .comments + .borrow() + .iter() + .find_map(|comment| records::parse_linear_execution_event_record(comment)) + .expect("stalled no-progress run should write a Linear execution event"); + + assert_eq!(ledger_event.event_type, "terminal_failure"); + assert_eq!(ledger_event.error_class.as_deref(), Some("stalled_run_detected")); + assert_eq!(ledger_event.terminal_path.as_deref(), None); + assert_eq!( + ledger_event.summary.as_deref(), + Some("Decodex run failed and needs attention.") + ); } #[test] fn stalled_run_reconciliation_reports_retained_partial_progress_for_dirty_worktree() { let (_temp_dir, config, workflow) = temp_project_layout(); - let tracker = FakeTracker::new(vec![]); let state_store = StateStore::open_in_memory().expect("state store should open"); let worktree_manager = WorktreeManager::new("pubfi", config.repo_root(), config.worktree_root()); let issue = sample_issue("In Progress", &[]); + let tracker = FakeTracker::new(vec![issue.clone()]); let run_id = "run-stalled-dirty"; let worktree_path = config.worktree_root().join("PUB-102"); @@ -1103,28 +1125,35 @@ fn stalled_run_reconciliation_reports_retained_partial_progress_for_dirty_worktr &worktree_path.display().to_string(), ) .expect("worktree mapping should record"); + state_store + .append_event(run_id, 1, "turn/diff/updated", "{\"changes\":1}") + .expect("stalled dirty issue protocol event should record"); - let action = ActiveRunReconciliation { - issue: issue.clone(), - run_attempt: state_store - .run_attempt(run_id) - .expect("run attempt query should succeed") - .expect("run attempt should exist"), - worktree_mapping: state_store - .worktree_for_issue(&issue.id) - .expect("worktree query should succeed"), - disposition: ActiveRunDisposition::Stalled { - idle_for: ACTIVE_RUN_IDLE_TIMEOUT + Duration::from_secs(1), - }, - workflow: workflow.clone(), - }; + let now = + OffsetDateTime::now_utc().unix_timestamp() + ACTIVE_RUN_IDLE_TIMEOUT.as_secs() as i64 + 1; + let actions = orchestrator::inspect_active_run_reconciliation_at( + &tracker, + &config, + &workflow, + &state_store, + None, + now, + ) + .expect("dirty stalled-run inspection should succeed"); + + assert_eq!(actions.len(), 1); + assert!(matches!( + actions[0].disposition, + ActiveRunDisposition::StalledRetainedPartialProgress { idle_for } + if idle_for >= ACTIVE_RUN_IDLE_TIMEOUT + )); orchestrator::apply_active_run_reconciliation( &tracker, &config, &state_store, &worktree_manager, - vec![action], + actions, ) .expect("reconciliation should succeed"); @@ -1147,11 +1176,42 @@ fn stalled_run_reconciliation_reports_retained_partial_progress_for_dirty_worktr let comments = tracker.comments.borrow(); assert!(comments.iter().any(|comment| { - comment.contains("partial_progress_retained") + comment.contains("decodex retained partial progress and needs attention") + && comment.contains("partial_progress_retained") && comment.contains("finish validation and PR handoff or reset the patch manually") && comment.contains(".worktrees/PUB-102") })); assert!(comments.iter().all(|comment| !comment.contains("stalled_run_detected"))); + assert!(comments.iter().all(|comment| !comment.contains("decodex run failed and needs attention"))); + + let ledger_event = comments + .iter() + .find_map(|comment| records::parse_linear_execution_event_record(comment)) + .expect("retained partial progress should write a Linear execution event"); + + assert_eq!(ledger_event.event_type, "needs_attention"); + assert_eq!(ledger_event.error_class.as_deref(), Some("partial_progress_retained")); + assert_eq!(ledger_event.terminal_path.as_deref(), Some("retained_partial_progress")); + assert_eq!( + ledger_event.summary.as_deref(), + Some("Decodex retained partial progress and needs attention.") + ); + assert_eq!( + ledger_event.blockers.as_deref(), + Some([String::from( + "Retained tracked worktree changes require operator recovery." + )] + .as_slice()) + ); + assert!( + ledger_event + .evidence + .as_deref() + .is_some_and(|evidence| evidence + .iter() + .any(|item| item.contains("tracked worktree changes retained"))), + "retained partial progress evidence should mention retained tracked changes" + ); } #[test] diff --git a/apps/decodex/src/orchestrator/tests/recovery/runtime_reentry.rs b/apps/decodex/src/orchestrator/tests/recovery/runtime_reentry.rs index 51df19d..43877be 100644 --- a/apps/decodex/src/orchestrator/tests/recovery/runtime_reentry.rs +++ b/apps/decodex/src/orchestrator/tests/recovery/runtime_reentry.rs @@ -73,6 +73,79 @@ fn exited_child_reconciliation_detects_stalled_failed_runs_from_protocol_idle() })); } +#[test] +fn exited_child_reconciliation_detects_retained_partial_progress_from_dirty_worktree() { + let (_temp_dir, config, workflow) = temp_project_layout(); + let state_store = StateStore::open_in_memory().expect("state store should open"); + let issue = sample_issue_with_sort_fields( + "issue-stalled-dirty-after-exit", + "PUB-206", + "In Progress", + &[], + Some(3), + "2026-03-13T04:16:17.133Z", + ); + let tracker = FakeTracker::new(vec![issue.clone()]); + let run_id = "run-stalled-dirty-after-exit"; + let worktree_path = config.worktree_root().join(&issue.identifier); + + git_status_success( + config.repo_root(), + &["worktree", "add", "-b", "x/pubfi-pub-206", ".worktrees/PUB-206", "main"], + ); + + fs::write(worktree_path.join("README.md"), "retained partial work\n") + .expect("tracked worktree file should change"); + + state_store + .record_run_attempt(run_id, &issue.id, 1, "failed") + .expect("run should exit as failed before daemon inspects it"); + state_store + .upsert_worktree( + "pubfi", + &issue.id, + "x/pubfi-pub-206", + &worktree_path.display().to_string(), + ) + .expect("worktree mapping should record"); + + state::write_run_protocol_activity_marker( + &worktree_path, + &ProtocolActivityMarker { + run_id, + attempt_number: 1, + thread_id: None, + turn_id: None, + event_count: 1, + last_event_type: "turn/diff/updated", + child_agent_activity: None, + protocol_activity: None, + }, + ) + .expect("protocol marker should write"); + + let last_protocol_activity = state::read_run_protocol_activity_marker(&worktree_path, run_id, 1) + .expect("protocol marker should read") + .expect("protocol activity should exist"); + let actions = orchestrator::inspect_exited_daemon_child_reconciliation_at( + &tracker, + &config, + &workflow, + &state_store, + &issue.id, + run_id, + last_protocol_activity + ACTIVE_RUN_IDLE_TIMEOUT.as_secs() as i64 + 1, + ) + .expect("exited child inspection should succeed"); + + assert_eq!(actions.len(), 1); + assert!(matches!( + actions[0].disposition, + ActiveRunDisposition::StalledRetainedPartialProgress { idle_for } + if idle_for >= ACTIVE_RUN_IDLE_TIMEOUT + )); +} + #[test] fn exited_child_reconciliation_ignores_superseded_failed_run() { let (_temp_dir, config, workflow) = temp_project_layout(); diff --git a/apps/decodex/src/orchestrator/tests/runtime/failure.rs b/apps/decodex/src/orchestrator/tests/runtime/failure.rs index 01ea269..74ac9cb 100644 --- a/apps/decodex/src/orchestrator/tests/runtime/failure.rs +++ b/apps/decodex/src/orchestrator/tests/runtime/failure.rs @@ -182,6 +182,22 @@ fn retained_partial_progress_uses_actionable_terminal_failure_comment() { assert!(next_action.contains("inspect retained worktree `.worktrees/PUB-101`")); assert!(next_action.contains("finish validation and PR handoff or reset the patch manually")); assert!(next_action.contains("clear label `decodex:needs-attention`")); + + let comment = orchestrator::format_terminal_failure_comment( + "pub-101-attempt-3-123", + 3, + String::from(".worktrees/PUB-101"), + "x/pubfi-pub-101", + None, + error_class, + &next_action, + ); + + assert!(comment.contains("decodex retained partial progress and needs attention")); + assert!(comment.contains("- recorded_at: `")); + assert!(!comment.contains("decodex run failed and needs attention")); + assert!(!comment.contains("- failed_at: `")); + assert!(comment.contains("full recovery context")); } #[test] diff --git a/apps/decodex/src/orchestrator/types.rs b/apps/decodex/src/orchestrator/types.rs index ff8ae5c..a1214ac 100644 --- a/apps/decodex/src/orchestrator/types.rs +++ b/apps/decodex/src/orchestrator/types.rs @@ -117,6 +117,7 @@ pub(crate) enum ActiveRunDisposition { Terminal, NonActive, Stalled { idle_for: Duration }, + StalledRetainedPartialProgress { idle_for: Duration }, StalledAlreadyNeedsAttention { idle_for: Duration }, } @@ -639,7 +640,7 @@ impl Display for RetainedPartialProgress { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( f, - "Run `{}` for issue `{}` retained tracked worktree changes at `{}` after failing before terminal handoff; stop automatic retries and finish recovery manually.", + "Run `{}` for issue `{}` retained tracked worktree changes at `{}`; stop automatic retries and finish recovery manually.", self.run_id, self.issue_identifier, self.worktree_path ) } diff --git a/docs/spec/linear-execution-ledger.md b/docs/spec/linear-execution-ledger.md index d40ea95..1ea3e49 100644 --- a/docs/spec/linear-execution-ledger.md +++ b/docs/spec/linear-execution-ledger.md @@ -158,7 +158,7 @@ These fields are optional globally and become required for specific event types | `evidence` | array of strings | Short factual evidence items. | | `verification` | array of strings | Verification commands or checks already run. | | `error_class` | string | Normalized failure class for needs-attention or terminal-failure records. | -| `terminal_path` | string | Explicit terminal path such as `review_handoff`, `review_repair`, or `manual_attention`. | +| `terminal_path` | string | Explicit terminal path such as `review_handoff`, `review_repair`, `manual_attention`, or `retained_partial_progress`. | | `cleanup_status` | string | Cleanup result when cleanup is the event subject. | | `transport` | string | Agent transport name when agent startup is the event subject. | | `target_state` | string | Tracker workflow state written by closeout or failure handling. | @@ -218,7 +218,13 @@ Every event requires the record envelope. Additional required fields are listed `terminal_path` values must match the runtime-owned terminal path for the tool or phase that writes the event. For normal review handoff this is `review_handoff`; for retained repair completion this is `review_repair`; for explicit human-required exits this is -`manual_attention`. +`manual_attention`; for stalled dirty-worktree recovery this is +`retained_partial_progress`. + +Retained partial progress is a `needs_attention` event with +`error_class = "partial_progress_retained"` and +`terminal_path = "retained_partial_progress"`. It must describe retained tracked +worktree changes and must not be emitted as `terminal_failure`. `failed_command` and `raw_error` are public-summary fields, not private evidence escape hatches. Producers must validate those values before writing a Linear comment. diff --git a/docs/spec/runtime.md b/docs/spec/runtime.md index 456463f..53c5d8b 100644 --- a/docs/spec/runtime.md +++ b/docs/spec/runtime.md @@ -483,8 +483,10 @@ After a process restart, recent-run history, active lease ownership, retained po same-process re-entry summary. Later local attempts that did not consume retry budget must not force a synthetic closeout attempt number. - A leased issue that is still in a configured startable state during early control-plane ticks must be treated as a lane that has not finished claiming tracker ownership yet, not as an immediate non-active interruption. -- If a running attempt exceeds the app-server idle timeout with no recorded protocol activity, `decodex` must treat it as stalled, stop the active run, mark the attempt `stalled`, and converge the issue through the human-required failure path instead of silently retrying in this phase. -- If the supervised child already exited before the next control-plane tick, stalled reconciliation must still inspect the just-finished lane using recorded protocol activity rather than skipping directly to generic failure handling. +- If a running attempt exceeds the app-server idle timeout, `decodex` must treat it as stalled, stop the active run, and mark the attempt `stalled`. +- If stalled reconciliation finds tracked changes in the retained worktree, it must classify the lane as retained partial progress directly. This path must write a human-required `needs_attention` ledger record with `error_class = "partial_progress_retained"` and `terminal_path = "retained_partial_progress"` instead of first routing the lane through `stalled_run_detected` or `terminal_failure`. +- If stalled reconciliation finds no tracked changes in the retained worktree, it must converge the issue through the existing human-required failure path with `error_class = "stalled_run_detected"` instead of silently retrying in this phase. +- If the supervised child already exited before the next control-plane tick, stalled reconciliation must still inspect the just-finished lane using recorded protocol activity and retained worktree state rather than skipping directly to generic failure handling. - Operator status snapshots must expose structured liveness and wait-state fields derived from runtime records plus marker breadcrumbs, including current phase, optional wait reason, current operation, last run/protocol/progress times, idle age, a soft `suspected_stall` signal, and any queued retry kind plus due time, so operators can distinguish active execution from continuation waits, retry backoff, early stall suspicion, and genuine hard stalls without inferring progress from filesystem churn. - Operator status snapshots may expose an additive `child_agent_activity` object when app-server protocol events have produced one for the current run. The object must stay machine-readable and dashboard/CLI shared, and should describe dynamic observed buckets rather than a fixed workflow: current child bucket and elapsed time, bucket wall/event/tool counts, current/max/cumulative input tokens, cumulative output tokens, largest tool output, and warnings for repeated large outputs. Missing `child_agent_activity` means no child breakdown was captured; existing JSON consumers must continue to work without it. - If the agent Git credential preflight fails, operator status must report the retained lane as a credential failure requiring operator recovery, not as a still-running lane.