Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 34 additions & 6 deletions tools/ways-cli/src/cmd/scan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,22 @@
//! scope/precondition gating, parent-threshold lowering, and show (display).

mod candidates;
mod reduce;
mod scoring;
mod state;
pub(crate) use scoring::batch_embed_score;

// Per-hook embed-query budgets (approximate tokens). MiniLM's window
// is 128 position embeddings; we budget ~85% of that. The reducer
// passes inputs through unchanged when they already fit; long inputs
// collapse to top-salience sentences within budget. The approximate
// tokenizer here (whitespace + char-budget max) over-counts vs
// MiniLM's WordPiece, so real tokens land safely under 128 even at
// the higher budgets. See ADR-130.
const BUDGET_PROMPT: usize = 110;
const BUDGET_TASK: usize = 110;
const BUDGET_COMMAND: usize = 75;
const BUDGET_FILE: usize = 30;
pub use state::state;

use anyhow::Result;
Expand Down Expand Up @@ -60,7 +73,12 @@ pub fn prompt(query: &str, session_id: &str, project: Option<&str>) -> Result<()
let scope = session::detect_scope(session_id);
let candidates = collect_candidates(&project_dir);

let embed_matches = batch_embed_score(query);
// ADR-130: cap embed input to the model's working window via the
// sentence-salience reducer. Pattern/keyword matching downstream
// still operates on `query` (the full prompt) — only the embed
// signal sees the reduced form.
let reduced = reduce::reduce_for_embed(query, BUDGET_PROMPT);
let embed_matches = batch_embed_score(&reduced);

let mut context = String::new();

Expand Down Expand Up @@ -112,7 +130,10 @@ pub fn task(
let is_teammate = team.is_some();
let candidates = collect_candidates(&project_dir);

let embed_matches = batch_embed_score(query);
// ADR-130: agent delegation prompts are the largest input class in
// practice. Reduce to the model's window before embedding.
let reduced = reduce::reduce_for_embed(query, BUDGET_TASK);
let embed_matches = batch_embed_score(&reduced);

let mut matched: Vec<(String, String)> = Vec::new(); // (way_id, channel)

Expand Down Expand Up @@ -231,15 +252,19 @@ pub fn command(
}
}

// Check matching: commands regex + semantic scoring
// Check matching: commands regex + semantic scoring.
// ADR-130: cap embed input. Heredoc bodies (gh pr create --body
// "$(cat <<EOF…)"), curl -d JSON payloads, and similar argument-
// body bash commands can run kilobytes long. The regex matcher
// above already saw the full cmd; only the embed query is reduced.
let checks = collect_checks(&project_dir);
let query_for_checks = format!(
"{} {}",
cmd,
description.unwrap_or("")
);

let embed_check_matches = batch_embed_score(&query_for_checks);
let reduced_for_checks = reduce::reduce_for_embed(&query_for_checks, BUDGET_COMMAND);
let embed_check_matches = batch_embed_score(&reduced_for_checks);

for check in &checks {
if !session::scope_matches(&check.scope, &scope) {
Expand Down Expand Up @@ -315,7 +340,10 @@ pub fn file(filepath: &str, session_id: &str, project: Option<&str>) -> Result<(
}

let checks = collect_checks(&project_dir);
let embed_matches = batch_embed_score(filepath);
// ADR-130: filepaths are short by nature, but enforce the budget
// uniformly across all hook surfaces for consistency.
let reduced = reduce::reduce_for_embed(filepath, BUDGET_FILE);
let embed_matches = batch_embed_score(&reduced);

for check in &checks {
if !session::scope_matches(&check.scope, &scope) {
Expand Down
Loading
Loading