Skip to content

Commit 88e1da8

Browse files
committed
fix: recover stuck indexing jobs on restart and retry (#311)
An indexer that dies mid-job (a redeploy guarantees this) left repositories.status='indexing' forever, and try_set_indexing_status then permanently blocked retry. Add indexing_started_at (migration 003), stamped on every transition to 'indexing', plus a startup sweep (resets all orphaned 'indexing' rows to 'error') and steal-on-retry (re-claims rows older than 30 min or with a null stamp) via an atomic PostgREST .or_ CAS that cannot double-claim. Migrations are applied manually (no runner), so both write paths degrade gracefully if 003 is not yet applied: update_repository_status retries without the column, and try_set_indexing_status falls back to the original CAS. A wrong deploy order no longer 500s every index; it only loses stuck-job recovery until the migration runs.
1 parent 0c1f42f commit 88e1da8

3 files changed

Lines changed: 80 additions & 14 deletions

File tree

backend/main.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@
4444
@asynccontextmanager
4545
async def lifespan(app: FastAPI):
4646
validate_environment()
47+
# Any repo left 'indexing' at boot is orphaned (indexing runs in-process; a restart
48+
# kills it). Reset so the user can retry instead of seeing an eternal spinner. (#311)
49+
from services.supabase_service import get_supabase_service
50+
get_supabase_service().reset_stuck_indexing_jobs()
4751
await load_demo_repos()
4852
yield
4953
# Shutdown (cleanup if needed)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
-- Durable repo-state v0.1 (issue #311)
2+
-- Records when a repo entered the 'indexing' state so the stuck-job reaper can tell a
3+
-- live job from an orphaned one (process died mid-index, leaving status='indexing' forever).
-- The column is nullable with no default, so rows that existed before this migration keep
-- NULL; the steal-on-retry filter matches NULL explicitly and treats such rows as orphaned.
-- Both statements use IF NOT EXISTS, so re-running this file by hand is safe.
4+
5+
ALTER TABLE repositories
6+
ADD COLUMN IF NOT EXISTS indexing_started_at TIMESTAMPTZ;
7+
8+
-- Partial index: the reaper and the steal-on-retry path only ever scan rows currently indexing.
-- Restricting the index to status = 'indexing' keeps it limited to in-flight rows.
9+
CREATE INDEX IF NOT EXISTS idx_repositories_indexing_started_at
10+
ON repositories(indexing_started_at)
11+
WHERE status = 'indexing';

backend/services/supabase_service.py

Lines changed: 65 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,16 @@
44
"""
55
import os
66
from typing import Dict, List, Optional
7-
from datetime import datetime
7+
from datetime import datetime, timedelta
88
from supabase import create_client, Client, ClientOptions
99
import uuid
1010

1111
from services.observability import logger
1212

13+
# A repo stuck in 'indexing' longer than this is treated as orphaned (its indexer
14+
# process died) and may be re-claimed on retry. The startup sweep is unconditional.
# Unit: minutes. try_set_indexing_status subtracts this from the current time to build
# the cutoff it compares indexing_started_at against.
15+
STUCK_INDEXING_THRESHOLD_MINUTES = 30
16+
1317

1418
class SupabaseService:
1519
"""Service for Supabase database operations"""
@@ -87,23 +91,70 @@ def update_repository(self, repo_id: str, updates: Dict) -> Optional[Dict]:
8791
return result.data[0] if result.data else None
8892

8993
def update_repository_status(self, repo_id: str, status: str) -> None:
    """Set a repository's status, stamping the start time when it enters 'indexing'.

    Transitioning to 'indexing' also writes ``indexing_started_at`` so the stuck-job
    recovery has a clock to measure against, regardless of which code path started
    the job. The stamp is an explicit UTC instant (``...Z``): the column is
    TIMESTAMPTZ, and an offset-less string would be interpreted in the database
    session's time zone rather than UTC.

    Args:
        repo_id: Primary key (``id``) of the repositories row to update.
        status: New status value to store.

    Raises:
        Exception: Whatever the client raises. For a non-'indexing' status the first
            failure propagates unchanged; for 'indexing' only a failure of the
            fallback write (status without the stamp) propagates.
    """
    # Local import: the file-level import only brings in datetime and timedelta.
    from datetime import timezone

    updates: Dict = {"status": status}
    if status == "indexing":
        updates["indexing_started_at"] = datetime.now(timezone.utc).strftime(
            "%Y-%m-%dT%H:%M:%S.%fZ"
        )
    try:
        self.client.table("repositories").update(updates).eq("id", repo_id).execute()
    except Exception as e:
        # Degrade gracefully if migration 003 (indexing_started_at) hasn't been applied
        # yet: retry the status update without the new column instead of failing the
        # index. A failure that did not involve the new column is re-raised as-is.
        if "indexing_started_at" not in updates:
            raise
        logger.warning(
            "Status update with indexing_started_at failed; retrying without it (apply migration 003)",
            repo_id=repo_id, error=str(e),
        )
        self.client.table("repositories").update({"status": status}).eq("id", repo_id).execute()
112+
93113
def try_set_indexing_status(
    self, repo_id: str, *, stale_after_minutes: Optional[int] = None
) -> bool:
    """Atomically claim a repository for indexing unless a fresh job already owns it.

    A single conditional UPDATE sets status to 'indexing' (and stamps
    ``indexing_started_at``) only when the row is not currently indexing, has no
    start stamp at all (legacy / pre-migration row), or has a stamp older than the
    staleness threshold. Because check and write are one statement, two concurrent
    requests cannot both claim the repo, and a crashed job never permanently
    blocks retry.

    The stamp and the cutoff derive from one clock reading, so they are exactly the
    threshold apart, and both are explicit UTC instants (``...Z``) so the TIMESTAMPTZ
    comparison does not depend on the database session's time zone.

    Args:
        repo_id: Primary key (``id``) of the repositories row to claim.
        stale_after_minutes: Age after which an 'indexing' row counts as orphaned.
            Defaults to STUCK_INDEXING_THRESHOLD_MINUTES when omitted.

    Returns:
        True if this call claimed the repo; False if no row matched, i.e. a fresh
        indexing job already owns it.
    """
    # Local import: the file-level import only brings in datetime and timedelta.
    from datetime import timezone

    if stale_after_minutes is None:
        stale_after_minutes = STUCK_INDEXING_THRESHOLD_MINUTES

    stamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    now = datetime.now(timezone.utc)
    started_at = now.strftime(stamp_format)
    cutoff = (now - timedelta(minutes=stale_after_minutes)).strftime(stamp_format)

    try:
        result = self.client.table("repositories").update(
            {"status": "indexing", "indexing_started_at": started_at}
        ).eq("id", repo_id).or_(
            f"status.neq.indexing,indexing_started_at.is.null,indexing_started_at.lt.{cutoff}"
        ).execute()
    except Exception as e:
        # Degrade gracefully if migration 003 hasn't been applied yet: fall back to the
        # original atomic compare-and-set (no steal-on-stale) so indexing still works.
        logger.warning(
            "try_set_indexing steal path failed; falling back to basic CAS (apply migration 003)",
            repo_id=repo_id, error=str(e),
        )
        result = self.client.table("repositories").update(
            {"status": "indexing"}
        ).eq("id", repo_id).neq("status", "indexing").execute()

    # If result.data is empty, no rows matched (a fresh indexing job owns it)
    return bool(result.data)
143+
144+
def reset_stuck_indexing_jobs(self) -> int:
    """Reset every repo left in 'indexing' to 'error' so the user can retry.

    Called once on startup: indexing runs in-process (BackgroundTasks / WebSocket), so a
    restart kills any in-flight job and every 'indexing' row at boot is by definition
    orphaned.

    The sweep is best-effort. If the database call fails, the failure is logged and 0
    is returned instead of raising, so a transient outage cannot abort application
    startup; stale rows remain recoverable through the steal-on-retry path.

    Returns:
        The number of rows reset, or 0 if none matched or the sweep failed.
    """
    try:
        result = self.client.table("repositories").update(
            {"status": "error"}
        ).eq("status", "indexing").execute()
    except Exception as e:
        logger.warning("Failed to reset orphaned indexing jobs on startup", error=str(e))
        return 0

    count = len(result.data) if result.data else 0
    if count:
        logger.warning("Reset orphaned indexing jobs on startup", count=count)
    return count
107158

108159
def update_file_count(self, repo_id: str, count: int) -> None:
109160
"""Update repository file count"""

0 commit comments

Comments
 (0)