facebook · migeed-z · Mar 21, 2026 · Mar 21, 2026
diff --git a/scripts/llm_transport.py b/scripts/llm_transport.py
@@ -44,7 +44,7 @@
 
 # ── Anthropic API ────────────────────────────────────────────────────
 ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
-ANTHROPIC_DEFAULT_MODEL = "claude-opus-4-20250514"
+ANTHROPIC_DEFAULT_MODEL = "claude-opus-4-6"
 ANTHROPIC_API_VERSION = "2023-06-01"
 
 

diff --git a/scripts/primer_classifier/classifier.py b/scripts/primer_classifier/classifier.py
@@ -24,6 +24,7 @@
     assign_verdict_with_llm,
     CategoryVerdict,
     classify_with_llm,
+    critique_reasoning,
     generate_suggestions,
     LLMError,
 )
@@ -696,13 +697,31 @@ def classify_project(
             base.method = "llm"
             return base
 
-        # Pass 2: assign verdict based on reasoning
+        # Pass 1.5: self-critique the reasoning for factual errors
+        try:
+            critiqued_reason, critiqued_categories = critique_reasoning(
+                llm_result.reason,
+                llm_result.categories,
+                errors_text,
+                source_context,
+                model,
+            )
+        except LLMError as e:
+            print(
+                f"  Warning: self-critique failed for {project.name}: {e}, "
+                "using original reasoning",
+                file=sys.stderr,
+            )
+            critiqued_reason = llm_result.reason
+            critiqued_categories = llm_result.categories
+
+        # Pass 2: assign verdict based on (critiqued) reasoning
         verdict, categories_with_verdicts = assign_verdict_with_llm(
-            llm_result.reason, llm_result.categories, model
+            critiqued_reason, critiqued_categories, model
         )
 
         base.verdict = verdict
-        base.reason = llm_result.reason
+        base.reason = critiqued_reason
         base.method = "llm"
         base.categories = categories_with_verdicts
         base.pr_attribution = llm_result.pr_attribution
@@ -722,6 +741,74 @@ def classify_project(
     return base
 
 
+def _enforce_cross_project_consistency(
+    classifications: list[Classification],
+) -> None:
+    """Align conflicting verdicts of LLM-classified projects per error kind.
+
+    Projects with method "llm" and verdict "regression" or "improvement" are
+    grouped by each of their error kinds. Where a kind is shared by several
+    projects whose verdicts disagree and one verdict has a strict majority,
+    the outvoted projects are rewritten to the majority verdict. This stops
+    the classifier calling the same pattern "improved" in one project and
+    "regressed" in another.
+
+    Votes use the verdicts as they were on entry, so a project adjusted for
+    one kind cannot tip the tally of another kind. A project is adjusted at
+    most once, by the first shared kind (in grouping order) that outvotes
+    it. Ties change nothing.
+
+    Modifies classifications in place: only ``verdict`` is rewritten and no
+    other field is touched. Each group of adjustments is logged to stderr
+    together with the tally that caused it.
+    """
+    # Only LLM verdicts that take a side can vote or be adjusted.
+    candidates = [
+        c
+        for c in classifications
+        if c.method == "llm" and c.verdict in ("regression", "improvement")
+    ]
+    if len(candidates) < 2:
+        return
+
+    # Index (project, verdict-on-entry) pairs by error kind. Capturing the
+    # verdict now keeps later tallies independent of earlier adjustments.
+    kind_to_votes: dict[str, list[tuple[Classification, str]]] = defaultdict(list)
+    for c in candidates:
+        for kind in c.error_kinds:
+            kind_to_votes[kind].append((c, c.verdict))
+
+    # Names already rewritten; a project is never adjusted twice.
+    already_adjusted: set[str] = set()
+    for kind, votes in kind_to_votes.items():
+        cast = [verdict for _, verdict in votes]
+        # First-seen order, so the logged tally reads the same run to run.
+        verdict_counts = {v: cast.count(v) for v in dict.fromkeys(cast)}
+        if len(verdict_counts) < 2:
+            continue  # single project, or already consistent
+
+        majority = max(verdict_counts, key=verdict_counts.get)
+        # Enforce only a strict majority; a tie gives no basis to pick a side.
+        if 2 * verdict_counts[majority] <= len(cast):
+            continue
+
+        # Rewrite the outvoted projects, judged by their verdict on entry.
+        adjusted_names = []
+        for c, verdict in votes:
+            if verdict != majority and c.project_name not in already_adjusted:
+                c.verdict = majority
+                adjusted_names.append(c.project_name)
+                already_adjusted.add(c.project_name)
+
+        if adjusted_names:
+            print(
+                f"  Cross-project consistency [{kind}]: "
+                f"{', '.join(adjusted_names)} adjusted to {majority} "
+                f"(vote: {verdict_counts})",
+                file=sys.stderr,
+            )
+
+
 def classify_all(
     projects: list[ProjectDiff],
     fetch_code: bool = True,
@@ -766,6 +853,11 @@ def classify_all(
         )
         result.classifications.append(classification)
 
+    # Enforce cross-project consistency before counting verdicts
+    _enforce_cross_project_consistency(result.classifications)
+
+    # Count verdicts after consistency enforcement
+    for classification in result.classifications:
         if classification.verdict == "regression":
             result.regressions += 1
         elif classification.verdict == "improvement":

diff --git a/scripts/primer_classifier/formatter.py b/scripts/primer_classifier/formatter.py
@@ -111,6 +111,42 @@ def func_replacer(match: re.Match) -> str:
     return result
 
 
+def _format_reason(reason: str) -> str:
+    """Render a reason for the Markdown blockquote, expanding JSON objects.
+
+    A JSON-object reason becomes one "**Label:** value" entry per known
+    analysis field, each on its own quoted line; any other input is returned
+    unchanged. If no known field is usable, a non-blank string "reason"
+    field is returned, else the raw input (never a non-string value).
+    """
+    if not reason or not reason.strip().startswith("{"):
+        return reason
+    try:
+        parsed = json.loads(reason)
+    except ValueError:  # json.JSONDecodeError is a ValueError subclass
+        return reason
+    if not isinstance(parsed, dict):
+        return reason
+
+    field_labels = {  # known analysis fields -> display label, output order
+        "spec_check": "Spec check",
+        "runtime_behavior": "Runtime behavior",
+        "mypy_pyright": "Mypy/pyright comparison",
+        "removal_assessment": "Removal assessment",
+        "pr_attribution": "PR attribution",
+        "reason": "Reasoning",
+    }
+    parts = []
+    for key, label in field_labels.items():
+        val = parsed.get(key)
+        if val and val != "N/A":  # re-quote newlines inside multi-line values
+            parts.append(f"**{label}:** " + str(val).replace("\n", "\n> "))
+    if parts:
+        return "\n> ".join(parts)
+    fallback = parsed.get("reason")  # nothing formatted: still return a string
+    return fallback if isinstance(fallback, str) and fallback.strip() else reason
+
+
 def _extract_root_cause(c) -> str:
     """Extract a linkified root cause string from a classification's pr_attribution.
 
@@ -292,7 +328,7 @@ def format_markdown(result: ClassificationResult) -> str:
                     )
                 lines.append("")
             else:
-                lines.append(f"> {c.reason}")
+                lines.append(f"> {_format_reason(c.reason)}")
                 if c.pr_attribution and c.pr_attribution != "N/A":
                     lines.append(
                         f"> **Attribution:** "