Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/llm_transport.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

# ── Anthropic API ────────────────────────────────────────────────────
ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_DEFAULT_MODEL = "claude-opus-4-20250514"
ANTHROPIC_DEFAULT_MODEL = "claude-opus-4-6"
ANTHROPIC_API_VERSION = "2023-06-01"


Expand Down
98 changes: 95 additions & 3 deletions scripts/primer_classifier/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
assign_verdict_with_llm,
CategoryVerdict,
classify_with_llm,
critique_reasoning,
generate_suggestions,
LLMError,
)
Expand Down Expand Up @@ -696,13 +697,31 @@ def classify_project(
base.method = "llm"
return base

# Pass 2: assign verdict based on reasoning
# Pass 1.5: self-critique the reasoning for factual errors
try:
critiqued_reason, critiqued_categories = critique_reasoning(
llm_result.reason,
llm_result.categories,
errors_text,
source_context,
model,
)
except LLMError as e:
print(
f" Warning: self-critique failed for {project.name}: {e}, "
"using original reasoning",
file=sys.stderr,
)
critiqued_reason = llm_result.reason
critiqued_categories = llm_result.categories

# Pass 2: assign verdict based on (critiqued) reasoning
verdict, categories_with_verdicts = assign_verdict_with_llm(
llm_result.reason, llm_result.categories, model
critiqued_reason, critiqued_categories, model
)

base.verdict = verdict
base.reason = llm_result.reason
base.reason = critiqued_reason
base.method = "llm"
base.categories = categories_with_verdicts
base.pr_attribution = llm_result.pr_attribution
Expand All @@ -722,6 +741,74 @@ def classify_project(
return base


def _enforce_cross_project_consistency(
    classifications: list[Classification],
) -> None:
    """Enforce verdict consistency across projects that share error kinds.

    When multiple LLM-classified projects share the same error kind and
    have conflicting verdicts, the strict-majority verdict wins. This
    prevents the classifier from saying "overload resolution improved" for
    one project and "overload resolution regressed" for another with the
    same pattern.

    Rules applied:

    * Only classifications with ``method == "llm"`` and a verdict of
      ``"regression"`` or ``"improvement"`` take part.
    * Each project casts exactly one vote per error kind, even if that
      kind appears more than once in its ``error_kinds``.
    * Error kinds are processed in first-seen order and every vote reads
      the verdicts as they stand at that moment, so an adjustment made for
      one kind is visible to the kinds processed after it.
    * Ties are left untouched, and a project is adjusted at most once.

    Only the ``verdict`` attribute is rewritten; one summary line per
    error kind that caused an adjustment is printed to stderr.

    Modifies classifications in place.
    """
    # Only consider LLM-classified projects with clear verdicts
    llm_classified = [
        c for c in classifications
        if c.method == "llm" and c.verdict in ("regression", "improvement")
    ]
    if len(llm_classified) < 2:
        return

    # Group projects under every error kind they report.
    kind_to_projects: dict[str, list[Classification]] = defaultdict(list)
    for c in llm_classified:
        # dict.fromkeys drops repeated kinds while keeping their order, so
        # a project listing the same kind twice cannot vote twice.
        for kind in dict.fromkeys(c.error_kinds):
            kind_to_projects[kind].append(c)

    # For each error kind shared by multiple projects, check consistency
    already_adjusted: set[str] = set()
    for kind, group in kind_to_projects.items():
        if len(group) < 2:
            continue

        # Count verdicts (insertion order follows the group order, which
        # also decides how the tally is rendered in the log line below).
        verdict_counts: dict[str, int] = {}
        for c in group:
            verdict_counts[c.verdict] = verdict_counts.get(c.verdict, 0) + 1

        if len(verdict_counts) <= 1:
            continue  # already consistent

        majority = max(verdict_counts, key=lambda v: verdict_counts[v])
        minority_count = len(group) - verdict_counts[majority]

        # Only enforce if majority is clear (> minority)
        if verdict_counts[majority] <= minority_count:
            continue

        # Update minority projects to match majority
        adjusted_names = []
        for c in group:
            if c.verdict != majority and c.project_name not in already_adjusted:
                c.verdict = majority
                adjusted_names.append(c.project_name)
                already_adjusted.add(c.project_name)

        if adjusted_names:
            print(
                f"  Cross-project consistency [{kind}]: "
                f"{', '.join(adjusted_names)} adjusted to {majority} "
                f"(vote: {verdict_counts})",
                file=sys.stderr,
            )


def classify_all(
projects: list[ProjectDiff],
fetch_code: bool = True,
Expand Down Expand Up @@ -766,6 +853,11 @@ def classify_all(
)
result.classifications.append(classification)

# Enforce cross-project consistency before counting verdicts
_enforce_cross_project_consistency(result.classifications)

# Count verdicts after consistency enforcement
for classification in result.classifications:
if classification.verdict == "regression":
result.regressions += 1
elif classification.verdict == "improvement":
Expand Down
38 changes: 37 additions & 1 deletion scripts/primer_classifier/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,42 @@ def func_replacer(match: re.Match) -> str:
return result


def _format_reason(reason: str) -> str:
    """Format a reason string for display, handling raw JSON dicts.

    When the LLM returns a JSON dict as the reason (with fields like
    spec_check, runtime_behavior, etc.), format it into readable text
    instead of dumping raw JSON.

    Returns:
        * ``reason`` unchanged when it is empty, does not look like a JSON
          object, fails to parse, or parses to something other than a dict.
        * Otherwise one ``**Label:** value`` entry per known field whose
          value is truthy and not ``"N/A"``, joined with ``"\\n> "``.
        * If no known field qualifies, the dict's ``"reason"`` value when
          it is a string, else the original ``reason`` text, so the result
          is always a string.
    """
    if not reason or not reason.strip().startswith("{"):
        return reason
    try:
        parsed = json.loads(reason)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return reason
    if not isinstance(parsed, dict):
        return reason

    # Known analysis fields, in the order they should be displayed.
    field_labels = {
        "spec_check": "Spec check",
        "runtime_behavior": "Runtime behavior",
        "mypy_pyright": "Mypy/pyright comparison",
        "removal_assessment": "Removal assessment",
        "pr_attribution": "PR attribution",
        "reason": "Reasoning",
    }
    parts = []
    for key, label in field_labels.items():
        val = parsed.get(key)
        if val and val != "N/A":
            parts.append(f"**{label}:** {val}")
    if parts:
        return "\n> ".join(parts)

    # Nothing was formatted. A "reason" value that is null, a number or a
    # container must not leak out of a function that promises a string, so
    # fall back to the original text in that case.
    fallback = parsed.get("reason")
    return fallback if isinstance(fallback, str) else reason


def _extract_root_cause(c) -> str:
"""Extract a linkified root cause string from a classification's pr_attribution.

Expand Down Expand Up @@ -292,7 +328,7 @@ def format_markdown(result: ClassificationResult) -> str:
)
lines.append("")
else:
lines.append(f"> {c.reason}")
lines.append(f"> {_format_reason(c.reason)}")
if c.pr_attribution and c.pr_attribution != "N/A":
lines.append(
f"> **Attribution:** "
Expand Down
Loading