Skip to content

Commit ba64b5d

Browse files
committed
fix: address review findings on durable repo-state (#311)
_clone_into_place: drop ignore_errors on the canonical-dir cleanup so a failed removal surfaces (ensure_clone wraps it into a logged RepoCloneError) instead of being swallowed and causing a confusing rename failure later. update_repository_status / try_set_indexing_status: gate the missing-column migration fallback on the error actually naming indexing_started_at, so an unrelated failure (network, constraint) re-raises instead of being masked by the retry. CodeRabbit's 'fallback never runs' reading was incorrect (the not-in check does execute the retry), but narrowing the catch to the migration case is the right hardening. Add an injection-safety note above the PostgREST .or_ filter: cutoff is server-controlled (utcnow + STUCK_INDEXING_THRESHOLD_MINUTES); any future user-supplied filter value must be SDK-parameterized. Tests: unrelated errors now re-raise rather than hit the fallback.
1 parent 97b549b commit ba64b5d

3 files changed

Lines changed: 39 additions & 10 deletions

File tree

backend/services/repo_manager.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,9 +209,12 @@ def _clone_into_place(self, repo_id: str, git_url: str, branch: str, canonical:
209209
tmp = self.repos_dir / f".{repo_id}.tmp.{uuid.uuid4().hex}"
210210
try:
211211
git.Repo.clone_from(git_url, tmp, branch=branch, depth=1)
212-
# Clear any leftover partial dir before the atomic swap.
212+
# Clear any leftover partial dir before the atomic swap. Do NOT ignore errors here:
213+
# a failed removal must surface (the outer except re-raises it, and ensure_clone wraps
214+
# it into a logged RepoCloneError) rather than letting us rename onto a dir we could
215+
# not clean, which would fail later with a more confusing error.
213216
if canonical.exists():
214-
shutil.rmtree(canonical, ignore_errors=True)
217+
shutil.rmtree(canonical)
215218
os.rename(tmp, canonical) # atomic on the same filesystem
216219
except Exception:
217220
if tmp.exists():

backend/services/supabase_service.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,13 @@ def update_repository_status(self, repo_id: str, status: str) -> None:
100100
try:
101101
self.client.table("repositories").update(updates).eq("id", repo_id).execute()
102102
except Exception as e:
103-
# Degrade gracefully if migration 003 (indexing_started_at) hasn't been applied yet:
104-
# retry the status update without the new column instead of failing the index.
105-
if "indexing_started_at" not in updates:
103+
# Only swallow the specific case where migration 003 (indexing_started_at) has not been
104+
# applied yet -- detected by the column name appearing in the DB error. Any other error
105+
# (network, constraint, etc.) must re-raise so a real failure isn't masked by the retry.
106+
if "indexing_started_at" not in updates or "indexing_started_at" not in str(e):
106107
raise
107108
logger.warning(
108-
"Status update with indexing_started_at failed; retrying without it (apply migration 003)",
109+
"indexing_started_at column missing; retrying status update without it (apply migration 003)",
109110
repo_id=repo_id, error=str(e),
110111
)
111112
self.client.table("repositories").update({"status": status}).eq("id", repo_id).execute()
@@ -120,6 +121,11 @@ def try_set_indexing_status(self, repo_id: str) -> bool:
120121
-- or with no start stamp at all (legacy/pre-migration) -- is treated as orphaned and
121122
re-claimed, so a crashed job never permanently blocks retry.
122123
"""
124+
# cutoff is the staleness threshold for stealing an orphaned 'indexing' lock, derived from
125+
# STUCK_INDEXING_THRESHOLD_MINUTES and a SERVER-CONTROLLED datetime.utcnow(). It is never
126+
# user input, so interpolating it into the PostgREST .or_ filter string is safe. If any
127+
# user-supplied value is ever introduced into this filter, parameterize it via the SDK
128+
# (do not f-string it) to avoid filter-injection.
123129
cutoff = (datetime.utcnow() - timedelta(minutes=STUCK_INDEXING_THRESHOLD_MINUTES)).isoformat()
124130
try:
125131
result = self.client.table("repositories").update(
@@ -128,10 +134,12 @@ def try_set_indexing_status(self, repo_id: str) -> bool:
128134
f"status.neq.indexing,indexing_started_at.is.null,indexing_started_at.lt.{cutoff}"
129135
).execute()
130136
except Exception as e:
131-
# Degrade gracefully if migration 003 hasn't been applied yet: fall back to the
132-
# original atomic compare-and-set (no steal-on-stale) so indexing still works.
137+
# Only fall back when migration 003 (indexing_started_at) is missing -- detected by the
138+
# column name in the DB error. Re-raise anything else so a real failure isn't masked.
139+
if "indexing_started_at" not in str(e):
140+
raise
133141
logger.warning(
134-
"try_set_indexing steal path failed; falling back to basic CAS (apply migration 003)",
142+
"indexing_started_at column missing; falling back to basic CAS (apply migration 003)",
135143
repo_id=repo_id, error=str(e),
136144
)
137145
result = self.client.table("repositories").update(

backend/tests/test_durable_repo_state.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,14 +169,32 @@ def test_try_set_indexing_falls_back_to_basic_cas_when_column_missing(self):
169169
"""Steal path errors without the column -> fall back to the original atomic CAS."""
170170
svc = self._service()
171171
eq_chain = svc.client.table.return_value.update.return_value.eq.return_value
172-
eq_chain.or_.return_value.execute.side_effect = Exception("PGRST204 column missing")
172+
eq_chain.or_.return_value.execute.side_effect = Exception(
173+
"PGRST204 Could not find the 'indexing_started_at' column in schema cache"
174+
)
173175
fallback = MagicMock()
174176
fallback.data = [{"id": "r1"}]
175177
eq_chain.neq.return_value.execute.return_value = fallback
176178

177179
assert svc.try_set_indexing_status("r1") is True
178180
eq_chain.neq.assert_called_once_with("status", "indexing")
179181

182+
def test_update_status_reraises_unrelated_error(self):
183+
"""A non-migration error must NOT be masked by the missing-column fallback."""
184+
svc = self._service()
185+
chain = svc.client.table.return_value.update.return_value.eq.return_value
186+
chain.execute.side_effect = Exception("connection reset by peer")
187+
with pytest.raises(Exception, match="connection reset"):
188+
svc.update_repository_status("r1", "indexing")
189+
190+
def test_try_set_indexing_reraises_unrelated_error(self):
191+
"""A non-migration error must NOT trigger the basic-CAS fallback."""
192+
svc = self._service()
193+
eq_chain = svc.client.table.return_value.update.return_value.eq.return_value
194+
eq_chain.or_.return_value.execute.side_effect = Exception("connection reset by peer")
195+
with pytest.raises(Exception, match="connection reset"):
196+
svc.try_set_indexing_status("r1")
197+
180198

181199
class TestRouteWiring:
182200
"""Integration: a route actually invokes ensure_clone (guards against the guard being removed)."""

0 commit comments

Comments
 (0)