fix: recover stuck indexing jobs on restart and retry (#311)

DevanshuNEU · DevanshuNEU · commit 88e1da8a55cc · 2026-06-08T15:04:55.000-04:00
An indexer that dies mid-job (a redeploy guarantees this) left repositories.status='indexing' forever, and try_set_indexing_status then permanently blocked retry. Add indexing_started_at (migration 003), stamped on every transition to 'indexing', plus two recovery paths: a startup sweep that resets all orphaned 'indexing' rows to 'error', and steal-on-retry, which re-claims rows older than 30 min or with a null stamp via an atomic PostgREST .or_ CAS that cannot double-claim.

Migrations are applied manually (no runner), so both write paths degrade gracefully if 003 is not yet applied: update_repository_status retries without the column, and try_set_indexing_status falls back to the original CAS. A wrong deploy order no longer returns a 500 on every index request; it only loses stuck-job recovery until the migration runs.
diff --git a/backend/main.py b/backend/main.py
@@ -44,6 +44,10 @@
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     validate_environment()
+    # Any repo left 'indexing' at boot is orphaned (indexing runs in-process; a restart
+    # kills it). Reset so the user can retry instead of seeing an eternal spinner. (#311)
+    from services.supabase_service import get_supabase_service
+    get_supabase_service().reset_stuck_indexing_jobs()
     await load_demo_repos()
     yield
     # Shutdown (cleanup if needed)
diff --git a/backend/migrations/003_add_indexing_started_at.sql b/backend/migrations/003_add_indexing_started_at.sql
@@ -0,0 +1,11 @@
+-- Durable repo-state v0.1 (issue #311)
+-- Records when a repo entered the 'indexing' state so the stuck-job reaper can tell a
+-- live job from an orphaned one (process died mid-index, leaving status='indexing' forever).
+
+ALTER TABLE repositories
+    ADD COLUMN IF NOT EXISTS indexing_started_at TIMESTAMPTZ;
+
+-- Partial index: covers only rows currently in 'indexing', the only rows whose start stamp matters for stale-job checks.
+CREATE INDEX IF NOT EXISTS idx_repositories_indexing_started_at
+    ON repositories(indexing_started_at)
+    WHERE status = 'indexing';
diff --git a/backend/services/supabase_service.py b/backend/services/supabase_service.py
@@ -4,12 +4,16 @@
 """
 import os
 from typing import Dict, List, Optional
-from datetime import datetime
+from datetime import datetime, timedelta
 from supabase import create_client, Client, ClientOptions
 import uuid
 
 from services.observability import logger
 
+# A repo stuck in 'indexing' longer than this is treated as orphaned (its indexer
+# process died) and may be re-claimed on retry. The startup sweep is unconditional.
+STUCK_INDEXING_THRESHOLD_MINUTES = 30
+
 
 class SupabaseService:
     """Service for Supabase database operations"""
@@ -87,23 +91,70 @@ def update_repository(self, repo_id: str, updates: Dict) -> Optional[Dict]:
         return result.data[0] if result.data else None
     
     def update_repository_status(self, repo_id: str, status: str) -> None:
-        """Update repository status"""
-        self.client.table("repositories").update({"status": status}).eq("id", repo_id).execute()
-    
+        """Update repository status. Transitioning to 'indexing' stamps indexing_started_at
+        so the stuck-job reaper has a clock to measure against, regardless of which code path
+        started the job."""
+        updates: Dict = {"status": status}
+        if status == "indexing":
+            updates["indexing_started_at"] = datetime.utcnow().isoformat()
+        try:
+            self.client.table("repositories").update(updates).eq("id", repo_id).execute()
+        except Exception as e:
+            # Degrade gracefully if migration 003 (indexing_started_at) hasn't been applied yet:
+            # retry the status update without the new column instead of failing the index.
+            if "indexing_started_at" not in updates:
+                raise
+            logger.warning(
+                "Status update with indexing_started_at failed; retrying without it (apply migration 003)",
+                repo_id=repo_id, error=str(e),
+            )
+            self.client.table("repositories").update({"status": status}).eq("id", repo_id).execute()
+
     def try_set_indexing_status(self, repo_id: str) -> bool:
         """
-        Atomically set status to 'indexing' only if not already indexing.
-        
-        Returns True if status was set, False if repo was already indexing.
-        This prevents TOCTOU race conditions where two requests could both
-        see status != 'indexing' and both start indexing.
+        Atomically set status to 'indexing' only if not already actively indexing.
+
+        Returns True if status was set, False if a fresh indexing job already owns the repo.
+        This prevents TOCTOU race conditions where two requests both see status != 'indexing'
+        and both start indexing. A row stuck in 'indexing' past the threshold (its process died)
+        -- or with no start stamp at all (legacy/pre-migration) -- is treated as orphaned and
+        re-claimed, so a crashed job never permanently blocks retry.
         """
-        result = self.client.table("repositories").update(
-            {"status": "indexing"}
-        ).eq("id", repo_id).neq("status", "indexing").execute()
-        
-        # If result.data is empty, no rows matched (already indexing)
+        cutoff = (datetime.utcnow() - timedelta(minutes=STUCK_INDEXING_THRESHOLD_MINUTES)).isoformat()
+        try:
+            result = self.client.table("repositories").update(
+                {"status": "indexing", "indexing_started_at": datetime.utcnow().isoformat()}
+            ).eq("id", repo_id).or_(
+                f"status.neq.indexing,indexing_started_at.is.null,indexing_started_at.lt.{cutoff}"
+            ).execute()
+        except Exception as e:
+            # Degrade gracefully if migration 003 hasn't been applied yet: fall back to the
+            # original atomic compare-and-set (no steal-on-stale) so indexing still works.
+            logger.warning(
+                "try_set_indexing steal path failed; falling back to basic CAS (apply migration 003)",
+                repo_id=repo_id, error=str(e),
+            )
+            result = self.client.table("repositories").update(
+                {"status": "indexing"}
+            ).eq("id", repo_id).neq("status", "indexing").execute()
+
+        # If result.data is empty, no rows matched (a fresh indexing job owns it)
         return bool(result.data)
+
+    def reset_stuck_indexing_jobs(self) -> int:
+        """Reset every repo left in 'indexing' to 'error' so the user can retry.
+
+        Called once on startup: indexing runs in-process (BackgroundTasks / WebSocket), so a
+        restart kills any in-flight job and every 'indexing' row at boot is by definition
+        orphaned. Returns the number of rows reset.
+        """
+        result = self.client.table("repositories").update(
+            {"status": "error"}
+        ).eq("status", "indexing").execute()
+        count = len(result.data) if result.data else 0
+        if count:
+            logger.warning("Reset orphaned indexing jobs on startup", count=count)
+        return count
     
     def update_file_count(self, repo_id: str, count: int) -> None:
         """Update repository file count"""