diff --git a/backend/dependencies.py b/backend/dependencies.py
index 62688b2..3d62bdb 100644
--- a/backend/dependencies.py
+++ b/backend/dependencies.py
@@ -11,6 +11,7 @@
 from services.dependency_analyzer import DependencyAnalyzer
 from services.style_analyzer import StyleAnalyzer
 from services.dna_extractor import DNAExtractor
+from services.context_assembler import ContextAssembler
 from services.rate_limiter import RateLimiter, APIKeyManager
 from services.supabase_service import get_supabase_service
 from services.input_validator import CostController
@@ -25,6 +26,7 @@
 dependency_analyzer = DependencyAnalyzer()
 style_analyzer = StyleAnalyzer()
 dna_extractor = DNAExtractor()
+context_assembler = ContextAssembler()
 
 # Rate limiting and API key management
 rate_limiter = RateLimiter(redis_client=cache.redis if cache.redis else None)
diff --git a/backend/main.py b/backend/main.py
index 09245ab..04ccecf 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -35,6 +35,7 @@
 from routes.github import router as github_router
 from routes.feedback import router as feedback_router
 from routes.admin import router as admin_router
+from routes.context import router as context_router
 from routes.ws_playground import websocket_playground_index
 from routes.ws_repos import websocket_repo_indexing
 
@@ -106,6 +107,7 @@ async def dispatch(self, request: Request, call_next):
 app.include_router(github_router, prefix=API_PREFIX)
 app.include_router(feedback_router, prefix=API_PREFIX)
 app.include_router(admin_router, prefix=API_PREFIX)
+app.include_router(context_router, prefix=API_PREFIX)
 
 # WebSocket endpoints (versioned)
 app.add_api_websocket_route(f"{API_PREFIX}/ws/index/{{repo_id}}", websocket_index)
diff --git a/backend/routes/context.py b/backend/routes/context.py
new file mode 100644
index 0000000..603ed5c
--- /dev/null
+++ b/backend/routes/context.py
@@ -0,0 +1,83 @@
+"""Context assembly endpoint.
+
+Provides per-task context packaging via POST /api/v1/context/assemble.
+Uses semantic search + dependency graph + project rules to build a
+minimal, precise context package for AI coding assistants.
+"""
+import time
+from typing import Any
+
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel, Field
+
+from dependencies import verify_repo_access
+from middleware.auth import AuthContext, require_auth
+from services.observability import (
+    add_breadcrumb,
+    capture_exception,
+    logger,
+    metrics,
+    set_operation_context,
+)
+
+router = APIRouter(tags=["context"])
+
+
+class AssembleRequest(BaseModel):
+    task: str = Field(..., min_length=3, max_length=1000)
+    repo_id: str
+    token_budget: int = Field(default=1500, ge=100, le=10000)
+
+
+@router.post("/context/assemble")
+async def assemble_context(
+    request: AssembleRequest,
+    auth: AuthContext = Depends(require_auth),
+) -> dict[str, Any]:
+    """Assemble task-specific context from semantic search + deps + rules.
+
+    Returns a markdown context package sized to fit within token_budget,
+    containing only the files, dependencies, and project rules relevant
+    to the given task description.
+    """
+    set_operation_context(
+        "context_assemble",
+        user_id=auth.user_id,
+        repo_id=request.repo_id,
+    )
+    add_breadcrumb("Context assembly requested", category="context", repo_id=request.repo_id)
+
+    verify_repo_access(request.repo_id, auth.user_id)
+
+    from dependencies import context_assembler
+
+    # perf_counter() is monotonic, so the measured duration cannot be skewed
+    # by wall-clock adjustments the way time.time() deltas can.
+    start = time.perf_counter()
+    try:
+        result = await context_assembler.assemble(
+            task=request.task,
+            repo_id=request.repo_id,
+            user_id=auth.user_id,
+            token_budget=request.token_budget,
+        )
+
+        elapsed = time.perf_counter() - start
+        logger.info(
+            "Context assembled",
+            repo_id=request.repo_id,
+            files=result["files_found"],
+            tokens=result["tokens_used"],
+            budget=request.token_budget,
+            duration_ms=round(elapsed * 1000),
+        )
+        metrics.timing("context_assemble_ms", elapsed * 1000)
+
+        return result
+
+    except HTTPException:
+        raise
+    except Exception as exc:
+        capture_exception(exc, operation="context_assemble", repo_id=request.repo_id)
+        logger.error("Context assembly failed", repo_id=request.repo_id, error=str(exc))
+        raise HTTPException(status_code=500, detail="Context assembly failed")
diff --git a/backend/services/context_assembler.py b/backend/services/context_assembler.py
new file mode 100644
index 0000000..86fad72
--- /dev/null
+++ b/backend/services/context_assembler.py
@@ -0,0 +1,313 @@
+"""Context assembly service for per-task context packaging.
+
+Takes a task description + repo, finds the most relevant files via
+semantic search, expands with 1-hop dependencies, matches applicable
+project rules, and returns an assembled context package within a
+token budget. This is the core of OPE-172.
+"""
+import asyncio
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+from services.observability import logger
+
+# Rule files in priority order (first found wins, same as dna_extractor)
+RULES_FILES = [
+    "CLAUDE.md", "AGENTS.md", ".cursorrules",
+    ".codeintel/rules.md", "CONVENTIONS.md",
+    ".github/copilot-instructions.md", "CODING_GUIDELINES.md",
+]
+
+# Sections that apply to every task regardless of file matches
+ALWAYS_RELEVANT_PATTERNS = re.compile(
+    r"(git|commit|workflow|what not to do|never|critical|do not|testing|review)",
+    re.IGNORECASE,
+)
+
+
+def _estimate_tokens(text: str) -> int:
+    """Rough token estimate: 1 token per 4 chars."""
+    return len(text) // 4
+
+
+def _split_rules_into_sections(content: str) -> List[Dict[str, str]]:
+    """Split markdown content by ## headers into discrete sections."""
+    sections: List[Dict[str, str]] = []
+    current_header = ""
+    current_body: List[str] = []
+
+    for line in content.splitlines():
+        if line.startswith("## "):
+            if current_header or current_body:
+                sections.append({
+                    "header": current_header,
+                    "body": "\n".join(current_body).strip(),
+                })
+            current_header = line
+            current_body = []
+        else:
+            current_body.append(line)
+
+    # Capture the last section
+    if current_header or current_body:
+        sections.append({
+            "header": current_header,
+            "body": "\n".join(current_body).strip(),
+        })
+
+    return sections
+
+
+def _read_rules_file_sync(repo_path: Path) -> Tuple[Optional[str], Optional[str]]:
+    """Find and read the first matching rules file in the repo (sync)."""
+    for filename in RULES_FILES:
+        rules_path = repo_path / filename
+        if rules_path.exists() and rules_path.is_file():
+            try:
+                content = rules_path.read_text(encoding="utf-8", errors="replace")
+                if content.strip():
+                    return content, filename
+            except OSError as exc:
+                logger.warning("Could not read rules file", path=str(rules_path), error=str(exc))
+    return None, None
+
+
+def _load_deps_sync(repo_id: str) -> List[Dict]:
+    """Load file dependencies from Supabase (sync)."""
+    from services.supabase_service import get_supabase_service
+    return get_supabase_service().get_file_dependencies(repo_id)
+
+
+class ContextAssembler:
+    """Assembles per-task context from semantic search + deps + rules."""
+
+    async def assemble(
+        self,
+        task: str,
+        repo_id: str,
+        user_id: str,
+        token_budget: int = 1500,
+    ) -> Dict[str, Any]:
+        """Build a context package for a specific coding task.
+
+        Returns a dict with 'context' (markdown string), 'files_found',
+        'tokens_used', 'token_budget', 'rules_source', and the counters
+        'search_hits', 'dep_files_added' and 'rule_sections_matched'.
+        """
+        from dependencies import indexer, get_repo_or_404
+
+        repo = get_repo_or_404(repo_id, user_id)
+        local_path_str = repo.get("local_path", "")
+
+        # Step 1: Semantic search for the most relevant files
+        search_results = await self._search(task, repo_id, indexer)
+        found_files = self._unique_files(search_results)
+
+        # Step 2: Expand with 1-hop dependencies (sync DB call off event loop)
+        dep_files = await self._expand_deps(found_files, repo_id)
+
+        # Step 3: Match relevant rule sections
+        all_files = list(dict.fromkeys(found_files + dep_files))
+        rules_content: Optional[str] = None
+        rules_source: Optional[str] = None
+        if local_path_str and Path(local_path_str).is_dir():
+            rules_content, rules_source = await asyncio.to_thread(
+                _read_rules_file_sync, Path(local_path_str),
+            )
+        matched_rules = self._match_rules(rules_content, all_files) if rules_content else []
+
+        # Step 4: Assemble within token budget
+        context_md = self._build_package(
+            task, search_results, found_files, dep_files,
+            matched_rules, token_budget,
+        )
+
+        return {
+            "context": context_md,
+            "files_found": len(all_files),
+            "tokens_used": _estimate_tokens(context_md),
+            "token_budget": token_budget,
+            "rules_source": rules_source,
+            "search_hits": len(search_results),
+            "dep_files_added": len(dep_files),
+            "rule_sections_matched": len(matched_rules),
+        }
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    async def _search(
+        self, task: str, repo_id: str, indexer: Any, top_k: int = 5,
+    ) -> List[Dict]:
+        """Run semantic search and return top results."""
+        try:
+            results = await indexer.search_v2(
+                query=task, repo_id=repo_id, top_k=top_k, use_reranking=True,
+            )
+            return results
+        except Exception as exc:
+            logger.error("Context search failed", error=str(exc))
+            return []
+
+    @staticmethod
+    def _unique_files(results: List[Dict]) -> List[str]:
+        """Extract unique file paths from search results, preserving order."""
+        seen: set[str] = set()
+        files: List[str] = []
+        for r in results:
+            fp = r.get("file_path", "")
+            if fp and fp not in seen:
+                seen.add(fp)
+                files.append(fp)
+        return files
+
+    @staticmethod
+    async def _expand_deps(seed_files: List[str], repo_id: str) -> List[str]:
+        """Add 1-hop imports/dependents for seed files.
+
+        Returns the neighbouring files in discovery order, excluding the
+        seeds themselves; returns [] when the dependency rows cannot be
+        loaded so that context assembly degrades instead of failing.
+        """
+        try:
+            all_deps = await asyncio.to_thread(_load_deps_sync, repo_id)
+        except Exception as exc:
+            logger.warning("Could not load deps for expansion", error=str(exc))
+            return []
+
+        # Build adjacency maps (rows may repeat a file or carry a null depends_on)
+        imports_map: Dict[str, List[str]] = {}
+        dependents_map: Dict[str, List[str]] = {}
+        for row in all_deps or []:
+            fp = row.get("file_path", "")
+            deps = row.get("depends_on") or []
+            imports_map.setdefault(fp, []).extend(deps)
+            for dep in deps:
+                dependents_map.setdefault(dep, []).append(fp)
+
+        # Track everything already emitted (seeds included) in a set so the
+        # dedup check is O(1) instead of scanning the result list each time.
+        seen = set(seed_files)
+        expanded: List[str] = []
+        for fp in seed_files:
+            for neighbor in imports_map.get(fp, []) + dependents_map.get(fp, []):
+                if neighbor not in seen:
+                    seen.add(neighbor)
+                    expanded.append(neighbor)
+
+        return expanded
+
+    @staticmethod
+    def _match_rules(
+        rules_content: str, files: List[str],
+    ) -> List[Dict[str, str]]:
+        """Return rule sections relevant to the discovered files."""
+        sections = _split_rules_into_sections(rules_content)
+        stems = {Path(f).stem for f in files}
+        names = {Path(f).name for f in files}
+
+        matched: List[Dict[str, str]] = []
+        for section in sections:
+            header = section["header"]
+            body = section["body"]
+            combined = f"{header}\n{body}"
+
+            # Always-relevant sections (git rules, "what not to do", etc.)
+            if ALWAYS_RELEVANT_PATTERNS.search(header):
+                matched.append(section)
+                continue
+
+            # Sections mentioning any discovered file
+            if any(name in combined for name in names):
+                matched.append(section)
+                continue
+            if any(stem in combined for stem in stems if len(stem) > 2):
+                matched.append(section)
+
+        return matched
+
+    @staticmethod
+    def _build_package(
+        task: str,
+        search_results: List[Dict],
+        found_files: List[str],
+        dep_files: List[str],
+        matched_rules: List[Dict[str, str]],
+        budget: int,
+    ) -> str:
+        """Assemble markdown context package within token budget.
+
+        Tiers are filled in priority order (search hits, dependency
+        files, rule sections); each tier is skipped once 50 or fewer
+        estimated tokens remain, and the final rule section is truncated
+        to whatever budget is left.
+        """
+        lines: List[str] = [f'## Context for: "{task}"', ""]
+        remaining = budget - _estimate_tokens("\n".join(lines))
+
+        # Tier 1: Relevant files (highest priority)
+        if found_files and remaining > 50:
+            header = "### Relevant files"
+            header_cost = _estimate_tokens(header) + 1  # +1 for trailing blank line
+            remaining -= header_cost
+            tier_lines = [header]
+            for r in search_results:
+                fp = r.get("file_path", "")
+                name = r.get("qualified_name", r.get("name", ""))
+                score = r.get("score", 0)
+                sig = r.get("signature", "")
+                pct = f"{score * 100:.0f}%" if isinstance(score, float) else str(score)
+                desc = sig if sig else name
+                entry = f"- `{fp}` -- {desc} (relevance: {pct})"
+                entry_tokens = _estimate_tokens(entry)
+                if entry_tokens <= remaining:
+                    tier_lines.append(entry)
+                    remaining -= entry_tokens
+                else:
+                    break
+            tier_lines.append("")
+            lines.extend(tier_lines)
+
+        # Tier 2: Dependency files
+        if dep_files and remaining > 50:
+            header = "### Depends on"
+            header_cost = _estimate_tokens(header) + 1
+            remaining -= header_cost
+            tier_lines = [header]
+            for fp in dep_files[:10]:
+                entry = f"- `{fp}`"
+                entry_tokens = _estimate_tokens(entry)
+                if entry_tokens <= remaining:
+                    tier_lines.append(entry)
+                    remaining -= entry_tokens
+                else:
+                    break
+            tier_lines.append("")
+            lines.extend(tier_lines)
+
+        # Tier 3: Matched rules
+        if matched_rules and remaining > 50:
+            header = "### Rules that apply"
+            remaining -= _estimate_tokens(header) + 1
+            lines.append(header)
+            for section in matched_rules:
+                # Emit the section's own heading together with its body: a
+                # body such as a "What not to do" list reads as the opposite
+                # instruction once its heading is stripped, and the heading
+                # is already part of the token cost charged below.
+                section_text = f'{section["header"]}\n{section["body"]}'.strip()
+                section_tokens = _estimate_tokens(section_text)
+                if section_tokens <= remaining:
+                    lines.append(section_text)
+                    remaining -= section_tokens
+                else:
+                    # Truncate the last section to fit
+                    chars_left = remaining * 4
+                    if chars_left > 20:
+                        lines.append(section_text[:chars_left] + "...")
+                    break
+            lines.append("")
+
+        return "\n".join(lines)
diff --git a/mcp-server/handlers.py b/mcp-server/handlers.py
index da4142b..595263d 100644
--- a/mcp-server/handlers.py
+++ b/mcp-server/handlers.py
@@ -165,6 +165,20 @@ async def _handle_index_repository(args: dict[str, Any]) -> str:
     return "\n".join(lines)
 
 
+async def _handle_get_context_for_task(args: dict[str, Any]) -> str:
+    payload = {
+        "task": args["task_description"],
+        "repo_id": args["repo_id"],
+        "token_budget": args.get("token_budget", 1500),
+    }
+    result = await api_post("/context/assemble", json=payload)
+    context = result.get("context", "No context assembled.")
+    tokens = result.get("tokens_used", 0)
+    budget = result.get("token_budget", 0)
+    files = result.get("files_found", 0)
+    return f"{context}\n\n---\n_{files} files, {tokens}/{budget} tokens_"
+
+
 async def _handle_delete_repository(args: dict[str, Any]) -> str:
     repo_id = args["repo_id"]
     result = await api_delete(f"/repos/{repo_id}")
@@ -185,6 +199,7 @@ async def _handle_delete_repository(args: dict[str, Any]) -> str:
     "get_repo_directories": _handle_get_repo_directories,
     "index_repository": _handle_index_repository,
     "delete_repository": _handle_delete_repository,
+    "get_context_for_task": _handle_get_context_for_task,
 }
 
 
diff --git a/mcp-server/tests/test_handlers.py b/mcp-server/tests/test_handlers.py
index 63648ea..4644ca8 100644
--- a/mcp-server/tests/test_handlers.py
+++ b/mcp-server/tests/test_handlers.py
@@ -62,6 +62,27 @@ async def test_dna_calls_correct_endpoint(self, mock_get):
         call_path = mock_get.call_args[0][0]
         assert "/repos/r1/dna" in call_path
 
+    @pytest.mark.asyncio
+    @patch("handlers.api_post", new_callable=AsyncMock)
+    async def test_context_for_task_dispatches(self, mock_post):
+        mock_post.return_value = {
+            "context": "## Context for: \"test task\"",
+            "tokens_used": 200,
+            "token_budget": 1500,
+            "files_found": 3,
+        }
+        result = await call_tool("get_context_for_task", {
+            "task_description": "add auth to settings",
+            "repo_id": "abc",
+        })
+        assert len(result) == 1
+        assert "Context for" in result[0].text
+        call_path = mock_post.call_args[0][0]
+        assert call_path == "/context/assemble"
+        payload = mock_post.call_args[1]["json"]
+        assert payload["task"] == "add auth to settings"
+        assert payload["token_budget"] == 1500
+
     @pytest.mark.asyncio
     @patch("handlers.api_get", new_callable=AsyncMock)
     async def test_none_arguments_handled(self, mock_get):
diff --git a/mcp-server/tests/test_tools.py b/mcp-server/tests/test_tools.py
index bc239fb..25aebea 100644
--- a/mcp-server/tests/test_tools.py
+++ b/mcp-server/tests/test_tools.py
@@ -19,6 +19,7 @@
     "get_repo_directories",
     "index_repository",
     "delete_repository",
+    "get_context_for_task",
 }
 
 
diff --git a/mcp-server/tools.py b/mcp-server/tools.py
index fb13e87..9705383 100644
--- a/mcp-server/tools.py
+++ b/mcp-server/tools.py
@@ -145,6 +145,48 @@ def get_tool_schemas() -> list[types.Tool]:
                 "required": ["repo_id"],
             },
         ),
+        # --- Context assembly ---
+        types.Tool(
+            name="get_context_for_task",
+            description=(
+                "Get precisely assembled context for a specific coding task. "
+                "Returns only the files, dependencies, and project rules "
+                "relevant to your task, within a token budget. Use this "
+                "BEFORE writing any code to get exactly the right context "
+                "for the task at hand -- semantic search finds relevant "
+                "code, dependency expansion adds imports, and rule matching "
+                "includes only the project conventions that apply."
+            ),
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "task_description": {
+                        "type": "string",
+                        "description": (
+                            "Natural language description of the coding task. "
+                            "Examples: 'add rate limiting to settings endpoints', "
+                            "'implement OAuth for the admin dashboard', "
+                            "'fix the dependency graph for TypeScript repos'"
+                        ),
+                    },
+                    "repo_id": {
+                        "type": "string",
+                        "description": "Repository identifier",
+                    },
+                    "token_budget": {
+                        "type": "integer",
+                        "description": (
+                            "Maximum tokens for the context package "
+                            "(default: 1500). Lower = more focused."
+                        ),
+                        "default": 1500,
+                        "minimum": 100,
+                        "maximum": 10000,
+                    },
+                },
+                "required": ["task_description", "repo_id"],
+            },
+        ),
         # --- Write tools ---
         types.Tool(
             name="add_repository",