|
| 1 | +"""Token budget enforcement for generated AI context files. |
| 2 | +
|
| 3 | +Research context (why this exists): |
| 4 | + ETH Zurich study (5,694 PRs): auto-generated context files >100 lines |
| 5 | + reduced agent task success by 3% and increased cost by 20%. |
| 6 | + LLMs reliably follow ~150 instructions. Claude Code burns ~50 before |
| 7 | + AGENTS.md even loads. That leaves ~100 slots for our content. |
| 8 | +
|
| 9 | +Default cap: 100 lines. |
| 10 | +--verbose / --budget 0: unlimited. |
| 11 | +
|
| 12 | +Design: |
| 13 | + Sections marked as PROTECTED are always included regardless of budget. |
| 14 | + These are human-written or contain the highest-value tribal knowledge. |
| 15 | + Auto-generated bulk content (Project Structure, Circular Deps) is cut first. |
| 16 | +
|
| 17 | + The function works on already-rendered text -- it does NOT re-render. |
| 18 | + It splits on ## section headers, applies priority ordering, and reassembles |
| 19 | + within the budget. Protected sections are appended after the budget note. |
| 20 | +""" |
| 21 | +from __future__ import annotations |
| 22 | + |
# Documents with this many lines or fewer are returned untouched by
# apply_budget, so small repos are never truncated.
_MIN_LINES_TO_TRUNCATE = 20

# Headers of sections that must survive any budget. They hold human-written
# tribal knowledge and project-specific rules. Matched as prefixes of the
# stripped header line.
_PROTECTED_SECTION_PREFIXES = (
    "## Tribal Knowledge",
    "## Project-Specific Rules",
    "## How to Verify",
)

# Headers of the sections sacrificed first when the document is over budget.
# Project Structure is the worst offender: monorepos generate 100s of lines.
_LOW_PRIORITY_SECTION_PREFIXES = (
    "## Project Structure",
    "## Circular Dependencies",
    "## Preferred imports",
)

# Markdown blockquote inserted where content was dropped; ``{omitted}`` is
# filled with the number of lines removed. Occupies two output lines (a blank
# line followed by the quote).
_TRUNCATION_NOTE = (
    "\n> [{omitted} lines omitted"
    " -- run `saar extract --verbose` for full output]\n"
)
| 45 | + |
| 46 | + |
def apply_budget(text: str, max_lines: int) -> str:
    """Trim rendered AGENTS.md / CLAUDE.md content down to a line budget.

    The text is split on ``## `` section headers and whole sections are kept
    or dropped; no section is ever cut in the middle. Sections whose header
    matches ``_PROTECTED_SECTION_PREFIXES`` are always kept and are moved to
    the end of the output, after the truncation note. The remaining sections
    compete for whatever budget is left: ordinary sections are considered
    first, then those matching ``_LOW_PRIORITY_SECTION_PREFIXES``. Sections
    that survive are emitted in their original document order.

    Args:
        text: Fully rendered content string (without SAAR markers).
        max_lines: Maximum lines allowed. 0 or negative = unlimited.

    Returns:
        ``text`` unchanged when no limit applies, when it already fits, or
        when it has ``_MIN_LINES_TO_TRUNCATE`` lines or fewer. Otherwise the
        reassembled content: kept sections, a truncation note if anything was
        dropped, then the protected sections. Protected sections are included
        even when they alone exceed ``max_lines``.
    """
    if max_lines <= 0:
        return text

    lines = text.splitlines(keepends=True)
    # Nothing to do when the text fits, or is too small to be worth cutting.
    if len(lines) <= max(max_lines, _MIN_LINES_TO_TRUNCATE):
        return text

    # Bucket every section in a single pass, remembering where each
    # non-protected section sat in the document so order can be restored.
    pinned: list[list[str]] = []
    ordinary: list[tuple[int, list[str]]] = []
    expendable: list[tuple[int, list[str]]] = []

    for position, section in enumerate(_split_into_sections(lines)):
        heading = section[0].strip() if section else ""
        if heading.startswith(_PROTECTED_SECTION_PREFIXES):
            pinned.append(section)
        elif heading.startswith(_LOW_PRIORITY_SECTION_PREFIXES):
            expendable.append((position, section))
        else:
            ordinary.append((position, section))

    # Protected sections and the two-line truncation note are paid for up
    # front; this may leave zero or negative room for everything else.
    room = max_lines - sum(len(section) for section in pinned) - 2

    # Greedy fill: ordinary sections get first pick, expendable ones only get
    # the leftovers. A section that does not fit is skipped, but later,
    # smaller sections may still be taken.
    survivors: list[tuple[int, list[str]]] = []
    spent = 0
    dropped = 0
    for position, section in ordinary + expendable:
        size = len(section)
        if spent + size <= room:
            survivors.append((position, section))
            spent += size
        else:
            dropped += size

    # Put the surviving sections back into document order.
    survivors.sort(key=lambda entry: entry[0])
    pieces: list[str] = [line for _, section in survivors for line in section]

    if dropped > 0:
        pieces.append(_TRUNCATION_NOTE.format(omitted=dropped))

    for section in pinned:
        pieces.extend(section)

    return "".join(pieces)
| 124 | + |
| 125 | + |
def _fence_run(line: str) -> str:
    """Return the fence marker if *line* looks like a Markdown code fence.

    A fence is a run of three or more backticks or tildes, indented by at
    most three spaces. A backtick run followed by further backticks on the
    same line is inline code, not a fence. Returns "" for any other line.
    """
    body = line.lstrip(" ")
    if len(line) - len(body) > 3:
        return ""  # 4+ spaces of indent is an indented code block, not a fence
    for char in "`~":
        run_len = len(body) - len(body.lstrip(char))
        if run_len >= 3:
            if char == "`" and "`" in body[run_len:]:
                return ""
            return body[:run_len]
    return ""


def _split_into_sections(lines: list[str]) -> list[list[str]]:
    """Split a list of lines into sections delimited by ``## `` headers.

    A new section starts at every line that begins with ``## `` (the header
    is the section's first line). Lines before the first such header form a
    leading preamble section. Lines are never modified or dropped, so
    concatenating the returned sections reproduces the input.

    Lines inside a fenced code block (``` or ~~~) are never treated as
    headers; otherwise a ``## comment`` in a shell snippet would split the
    code block in two. An unclosed fence runs to the end of the input.

    Args:
        lines: Lines of the rendered document, normally with line endings
            kept.

    Returns:
        The sections in document order; an empty list for empty input.
    """
    sections: list[list[str]] = []
    current: list[str] = []
    open_fence = ""  # marker of the fence we are inside, "" when outside

    for line in lines:
        # `and current` keeps a header on the very first line from emitting
        # an empty leading section.
        if not open_fence and line.startswith("## ") and current:
            sections.append(current)
            current = [line]
            continue

        current.append(line)

        run = _fence_run(line)
        if not run:
            continue
        if not open_fence:
            open_fence = run
        elif (
            run[0] == open_fence[0]
            and len(run) >= len(open_fence)
            and line.strip() == run
        ):
            # A closing fence uses the same character, is at least as long
            # as the opener, and carries no other text.
            open_fence = ""

    if current:
        sections.append(current)

    return sections
0 commit comments