Skip to content

Commit 1e1ae25

Browse files
committed
fix(playground): add partial indexing support and fix bugs (#125)
- Add partial=true parameter to index the first 200 files of large repos
- Fix create_session() call - generate token before creating session
- Fix JSON serialization in validation error handler
- Add missing capture_http_exception function to sentry module
- Add comprehensive tests for anonymous indexing (30 tests)

All 169 tests passing.
1 parent d4ec4b4 commit 1e1ae25

6 files changed

Lines changed: 645 additions & 51 deletions

File tree

backend/main.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,22 @@ async def dispatch(self, request: Request, call_next):
9898
@app.exception_handler(RequestValidationError)
9999
async def validation_exception_handler(request: Request, exc: RequestValidationError):
100100
"""Handle validation errors with clear messages."""
101+
# Convert errors to JSON-serializable format
102+
errors = []
103+
for err in exc.errors():
104+
error_dict = {
105+
"type": err.get("type"),
106+
"loc": err.get("loc"),
107+
"msg": err.get("msg"),
108+
"input": str(err.get("input")) if err.get("input") is not None else None,
109+
}
110+
errors.append(error_dict)
111+
101112
return JSONResponse(
102113
status_code=422,
103114
content={
104115
"detail": "Validation error",
105-
"errors": exc.errors()
116+
"errors": errors
106117
}
107118
)
108119

backend/routes/playground.py

Lines changed: 50 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ class IndexRepoRequest(BaseModel):
7676
"""
7777
github_url: str
7878
branch: Optional[str] = None # None = use repo's default branch
79+
partial: bool = False # If True, index first 200 files of large repos
7980

8081
@field_validator("github_url")
8182
@classmethod
@@ -653,8 +654,9 @@ async def start_anonymous_indexing(
653654
client_ip = _get_client_ip(req)
654655

655656
if not session_token:
656-
# Create new session
657-
session_token = limiter.create_session()
657+
# Create new session - generate token first, then create session
658+
session_token = limiter._generate_session_token()
659+
limiter.create_session(session_token)
658660
_set_session_cookie(response, session_token)
659661
logger.info("Created new session for indexing",
660662
session_token=session_token[:8],
@@ -767,18 +769,32 @@ async def start_anonymous_indexing(
767769
file_count = max(repo_size_kb // 3, 1)
768770

769771
# Check file limit
772+
is_partial = False
773+
files_to_index = file_count
774+
770775
if file_count > ANONYMOUS_FILE_LIMIT:
771-
raise HTTPException(
772-
status_code=400,
773-
detail={
774-
"error": "validation_failed",
775-
"reason": "too_large",
776-
"message": f"Repository has {file_count:,} code files. "
777-
f"Anonymous limit is {ANONYMOUS_FILE_LIMIT}.",
778-
"file_count": file_count,
779-
"limit": ANONYMOUS_FILE_LIMIT
780-
}
781-
)
776+
if request.partial:
777+
# Partial indexing - cap at limit
778+
is_partial = True
779+
files_to_index = ANONYMOUS_FILE_LIMIT
780+
logger.info("Partial indexing enabled",
781+
total_files=file_count,
782+
indexing=files_to_index)
783+
else:
784+
# Reject large repos without partial flag
785+
raise HTTPException(
786+
status_code=400,
787+
detail={
788+
"error": "validation_failed",
789+
"reason": "too_large",
790+
"message": f"Repository has {file_count:,} code files. "
791+
f"Anonymous limit is {ANONYMOUS_FILE_LIMIT}. "
792+
f"Use partial=true to index first {ANONYMOUS_FILE_LIMIT} files.",
793+
"file_count": file_count,
794+
"limit": ANONYMOUS_FILE_LIMIT,
795+
"hint": "Set partial=true to index a subset of files"
796+
}
797+
)
782798

783799
# --- Validation passed! Create job and start background indexing ---
784800

@@ -796,7 +812,9 @@ async def start_anonymous_indexing(
796812
owner=owner,
797813
repo_name=repo_name,
798814
branch=branch,
799-
file_count=file_count
815+
file_count=file_count,
816+
is_partial=is_partial,
817+
max_files=files_to_index
800818
)
801819

802820
# Queue background task
@@ -811,25 +829,38 @@ async def start_anonymous_indexing(
811829
owner=owner,
812830
repo_name=repo_name,
813831
branch=branch,
814-
file_count=file_count
832+
file_count=files_to_index, # Actual files to index (may be capped)
833+
max_files=files_to_index if is_partial else None # Limit for partial indexing
815834
)
816835

817836
logger.info("Indexing job queued",
818837
job_id=job_id,
819838
owner=owner,
820839
repo=repo_name,
821840
branch=branch,
822-
file_count=file_count,
841+
file_count=files_to_index,
842+
is_partial=is_partial,
823843
session_token=session_token[:8],
824844
response_time_ms=response_time_ms)
825845

826846
# Estimate time based on file count (~0.3s per file)
827-
estimated_seconds = max(10, int(file_count * 0.3))
847+
estimated_seconds = max(10, int(files_to_index * 0.3))
828848

829-
return {
849+
response_data = {
830850
"job_id": job_id,
831851
"status": "queued",
832852
"estimated_time_seconds": estimated_seconds,
833-
"file_count": file_count,
853+
"file_count": files_to_index,
834854
"message": f"Indexing started. Poll /playground/index/{job_id} for status."
835855
}
856+
857+
# Add partial info if applicable
858+
if is_partial:
859+
response_data["partial"] = True
860+
response_data["total_files"] = file_count
861+
response_data["message"] = (
862+
f"Partial indexing started ({files_to_index} of {file_count} files). "
863+
f"Poll /playground/index/{job_id} for status."
864+
)
865+
866+
return response_data

backend/services/anonymous_indexer.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,9 @@ def create_job(
9393
owner: str,
9494
repo_name: str,
9595
branch: str,
96-
file_count: int
96+
file_count: int,
97+
is_partial: bool = False,
98+
max_files: Optional[int] = None
9799
) -> dict:
98100
"""
99101
Create a new indexing job in Redis.
@@ -111,6 +113,8 @@ def create_job(
111113
"repo_name": repo_name,
112114
"branch": branch,
113115
"file_count": file_count,
116+
"is_partial": is_partial,
117+
"max_files": max_files,
114118
"status": JobStatus.QUEUED.value,
115119
"progress": None,
116120
"stats": None,
@@ -225,13 +229,17 @@ async def run_indexing_job(
225229
owner: str,
226230
repo_name: str,
227231
branch: str,
228-
file_count: int
232+
file_count: int,
233+
max_files: Optional[int] = None
229234
) -> None:
230235
"""
231236
Background task to clone and index a repository.
232237
233238
This runs asynchronously after the endpoint returns.
234239
Updates Redis with progress and final status.
240+
241+
Args:
242+
max_files: If set, limit indexing to first N files (for partial indexing)
235243
"""
236244
import time
237245
start_time = time.time()
@@ -286,7 +294,8 @@ async def progress_callback(files_processed: int, functions_found: int, total: i
286294
indexer.index_repository_with_progress(
287295
repo_id,
288296
str(temp_path),
289-
progress_callback
297+
progress_callback,
298+
max_files=max_files
290299
),
291300
timeout=job_manager.INDEX_TIMEOUT_SECONDS
292301
)

backend/services/indexer_optimized.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -464,17 +464,30 @@ async def explain_code(
464464
return f"Error: {str(e)}"
465465

466466
async def index_repository_with_progress(
467-
self,
468-
repo_id: str,
467+
self,
468+
repo_id: str,
469469
repo_path: str,
470-
progress_callback
470+
progress_callback,
471+
max_files: int = None
471472
):
472-
"""Index repository with real-time progress updates"""
473+
"""Index repository with real-time progress updates
474+
475+
Args:
476+
max_files: If set, limit indexing to first N files (for partial indexing)
477+
"""
473478
start_time = time.time()
474479
logger.info("Starting optimized indexing with progress", repo_id=repo_id)
475-
480+
476481
# Discover code files
477482
code_files = self._discover_code_files(repo_path)
483+
484+
# Apply file limit if specified (partial indexing)
485+
if max_files and len(code_files) > max_files:
486+
logger.info("Limiting files for partial indexing",
487+
total_discovered=len(code_files),
488+
max_files=max_files)
489+
code_files = code_files[:max_files]
490+
478491
total_files = len(code_files)
479492
logger.info("Found code files", repo_id=repo_id, total_files=total_files)
480493

backend/services/sentry.py

Lines changed: 39 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -12,58 +12,58 @@
1212
def init_sentry() -> bool:
1313
"""
1414
Initialize Sentry SDK if SENTRY_DSN is configured.
15-
15+
1616
Returns:
1717
bool: True if Sentry was initialized, False otherwise
1818
"""
1919
sentry_dsn = os.getenv("SENTRY_DSN")
20-
20+
2121
if not sentry_dsn:
2222
print("ℹ️ Sentry DSN not configured - error tracking disabled")
2323
return False
24-
24+
2525
try:
2626
import sentry_sdk
2727
from sentry_sdk.integrations.fastapi import FastApiIntegration
2828
from sentry_sdk.integrations.starlette import StarletteIntegration
29-
29+
3030
environment = os.getenv("ENVIRONMENT", "development")
31-
31+
3232
sentry_sdk.init(
3333
dsn=sentry_dsn,
3434
environment=environment,
35-
35+
3636
# Performance monitoring - sample rate based on environment
3737
traces_sample_rate=0.1 if environment == "production" else 1.0,
38-
38+
3939
# Profile sampled transactions
4040
profiles_sample_rate=0.1 if environment == "production" else 1.0,
41-
41+
4242
# Send PII for debugging (user IDs, emails)
4343
send_default_pii=True,
44-
44+
4545
# Integrations
4646
integrations=[
4747
FastApiIntegration(transaction_style="endpoint"),
4848
StarletteIntegration(transaction_style="endpoint"),
4949
],
50-
50+
5151
# Filter noisy events
5252
before_send=_filter_events,
53-
53+
5454
# Debug mode for development
5555
debug=environment == "development",
56-
56+
5757
# Attach stack traces to messages
5858
attach_stacktrace=True,
59-
59+
6060
# Include local variables in stack traces
6161
include_local_variables=True,
6262
)
63-
63+
6464
print(f"✅ Sentry initialized (environment: {environment})")
6565
return True
66-
66+
6767
except ImportError:
6868
print("⚠️ sentry-sdk not installed - error tracking disabled")
6969
return False
@@ -74,26 +74,26 @@ def init_sentry() -> bool:
7474

7575
def _filter_events(event, hint):
7676
"""Filter out noisy events before sending to Sentry."""
77-
77+
7878
# Don't send health check errors
7979
request_url = event.get("request", {}).get("url", "")
8080
if "/health" in request_url:
8181
return None
82-
82+
8383
# Don't send 404s for common bot paths
8484
exception_values = event.get("exception", {}).get("values", [])
8585
if exception_values:
8686
exception_value = str(exception_values[0].get("value", ""))
8787
bot_paths = ["/wp-admin", "/wp-login", "/.env", "/config", "/admin", "/phpmyadmin", "/.git"]
8888
if any(path in exception_value for path in bot_paths):
8989
return None
90-
90+
9191
# Don't send validation errors (they're expected)
9292
if exception_values:
9393
exception_type = exception_values[0].get("type", "")
9494
if exception_type in ("RequestValidationError", "ValidationError"):
9595
return None
96-
96+
9797
return event
9898

9999

@@ -104,7 +104,7 @@ def _filter_events(event, hint):
104104
def set_user_context(user_id: Optional[str] = None, email: Optional[str] = None):
105105
"""
106106
Set user context for error tracking.
107-
107+
108108
DEPRECATED: Use from services.observability import set_user_context
109109
"""
110110
try:
@@ -117,7 +117,7 @@ def set_user_context(user_id: Optional[str] = None, email: Optional[str] = None)
117117
def capture_exception(error: Exception, **extra_context):
118118
"""
119119
Manually capture an exception with additional context.
120-
120+
121121
DEPRECATED: Use from services.observability import capture_exception
122122
"""
123123
try:
@@ -133,7 +133,7 @@ def capture_exception(error: Exception, **extra_context):
133133
def capture_message(message: str, level: str = "info", **extra_context):
134134
"""
135135
Capture a message (not an exception) for tracking.
136-
136+
137137
DEPRECATED: Use from services.observability import get_logger
138138
"""
139139
try:
@@ -149,7 +149,7 @@ def capture_message(message: str, level: str = "info", **extra_context):
149149
def set_operation_context(operation: str, **tags):
150150
"""
151151
Set operation context for the current scope.
152-
152+
153153
DEPRECATED: Use from services.observability import trace_operation
154154
"""
155155
try:
@@ -159,3 +159,19 @@ def set_operation_context(operation: str, **tags):
159159
sentry_sdk.set_tag(key, str(value))
160160
except ImportError:
161161
pass
162+
163+
164+
def capture_http_exception(request, exc: Exception, status_code: int):
    """
    Capture HTTP exception with request context for error tracking.

    Args:
        request: Incoming request object; its ``url.path`` and ``method``
            are attached to the Sentry event as extras.
        exc: The exception to report.
        status_code: HTTP status code, attached to the event as an extra.

    Returns:
        None. When sentry-sdk is not installed the call is a silent no-op,
        matching the other helpers in this module.
    """
    try:
        import sentry_sdk

        # Use a temporary scope so the extras apply to this event only.
        with sentry_sdk.push_scope() as scope:
            scope.set_extra("status_code", status_code)
            scope.set_extra("path", str(request.url.path))
            scope.set_extra("method", request.method)
            sentry_sdk.capture_exception(exc)
    except ImportError:
        # Error tracking is optional; skip when the SDK is unavailable.
        pass

0 commit comments

Comments
 (0)