From c79081f58b57eaab3f299e871eaa47c96c0f0f04 Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 27 Feb 2026 23:38:55 +0000 Subject: [PATCH 1/4] fix(datastore): handle commit failures gracefully instead of panicking When a transaction commit fails (e.g. disk full / SQLITE_FULL), the worker thread panicked, permanently breaking the datastore channel. All subsequent requests returned MpscError (HTTP 500) until restart. Replace the panic with error logging and continue. The rolled-back events will be re-sent by watchers via heartbeat or retried by clients. Add CommitFailed error variant mapped to HTTP 503 (Service Unavailable) so clients know to back off and retry. Fixes #256 --- aw-datastore/src/lib.rs | 2 ++ aw-datastore/src/worker.rs | 10 +++++++++- aw-server/src/endpoints/util.rs | 3 +++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/aw-datastore/src/lib.rs b/aw-datastore/src/lib.rs index 69c47618..c18d801e 100644 --- a/aw-datastore/src/lib.rs +++ b/aw-datastore/src/lib.rs @@ -39,4 +39,6 @@ pub enum DatastoreError { // Errors specific to when migrate is disabled Uninitialized(String), OldDbVersion(String), + // Database write failure (e.g. disk full) + CommitFailed(String), } diff --git a/aw-datastore/src/worker.rs b/aw-datastore/src/worker.rs index 18eaf665..0ed260de 100644 --- a/aw-datastore/src/worker.rs +++ b/aw-datastore/src/worker.rs @@ -192,7 +192,15 @@ impl DatastoreWorker { ); match tx.commit() { Ok(_) => (), - Err(err) => panic!("Failed to commit datastore transaction! {err}"), + Err(err) => { + error!( + "Failed to commit datastore transaction ({} events lost): {err}", + self.uncommitted_events + ); + // Continue instead of panicking — the rolled-back events will be + // re-sent by watchers (heartbeats) or retried by clients. + // This handles transient failures like disk full (SQLITE_FULL). 
+ } } if self.quit { break; diff --git a/aw-server/src/endpoints/util.rs b/aw-server/src/endpoints/util.rs index fdf4a992..6a841af4 100644 --- a/aw-server/src/endpoints/util.rs +++ b/aw-server/src/endpoints/util.rs @@ -98,6 +98,9 @@ impl From<DatastoreError> for HttpErrorJson { DatastoreError::OldDbVersion(msg) => { HttpErrorJson::new(Status::InternalServerError, msg) } + DatastoreError::CommitFailed(msg) => { + HttpErrorJson::new(Status::ServiceUnavailable, msg) + } } } } From 4c34dd4d7352b926671f41625597664a43a83b98 Mon Sep 17 00:00:00 2001 From: Bob Date: Sun, 1 Mar 2026 11:30:46 +0000 Subject: [PATCH 2/4] fix(datastore): apply graceful error handling to legacy import commit The main work loop commit (line 193) was already handled gracefully (error log + continue), but the legacy import commit (line 143) still panicked on failure. This makes the error handling consistent. Addresses review feedback from Greptile. --- aw-datastore/src/worker.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/aw-datastore/src/worker.rs b/aw-datastore/src/worker.rs index 0ed260de..27c38f4f 100644 --- a/aw-datastore/src/worker.rs +++ b/aw-datastore/src/worker.rs @@ -142,7 +142,11 @@ impl DatastoreWorker { } match transaction.commit() { Ok(_) => (), - Err(err) => panic!("Failed to commit datastore transaction! {err}"), + Err(err) => { + error!("Failed to commit legacy import transaction: {err}"); + // Continue without panicking — legacy import will be retried on + // next startup if the commit didn't persist. 
+ } } } From 75d5a4ca77b89c1f34957c26c276e6a162ab661e Mon Sep 17 00:00:00 2001 From: Bob Date: Mon, 2 Mar 2026 12:09:38 +0000 Subject: [PATCH 3/4] docs(datastore): correct misleading comment about event recovery on commit failure --- aw-datastore/src/worker.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/aw-datastore/src/worker.rs b/aw-datastore/src/worker.rs index 27c38f4f..b116a1f3 100644 --- a/aw-datastore/src/worker.rs +++ b/aw-datastore/src/worker.rs @@ -201,9 +201,12 @@ impl DatastoreWorker { "Failed to commit datastore transaction ({} events lost): {err}", self.uncommitted_events ); - // Continue instead of panicking — the rolled-back events will be - // re-sent by watchers (heartbeats) or retried by clients. - // This handles transient failures like disk full (SQLITE_FULL). + // Continue instead of panicking — the worker thread survives this + // transient failure (e.g. SQLITE_FULL on disk full). Note: clients + // already received success responses before the commit, so they won't + // know to retry. Rolled-back events create a gap in the timeline; + // watchers will resume sending heartbeats from current state, but the + // specific batch of events is permanently lost. } } if self.quit { From 2902485627dfd00f2140c699bb317185b54c3556 Mon Sep 17 00:00:00 2001 From: Bob Date: Mon, 2 Mar 2026 12:45:37 +0000 Subject: [PATCH 4/4] refactor(datastore): remove unused CommitFailed error variant --- aw-datastore/src/lib.rs | 2 -- aw-server/src/endpoints/util.rs | 3 --- 2 files changed, 5 deletions(-) diff --git a/aw-datastore/src/lib.rs b/aw-datastore/src/lib.rs index c18d801e..69c47618 100644 --- a/aw-datastore/src/lib.rs +++ b/aw-datastore/src/lib.rs @@ -39,6 +39,4 @@ pub enum DatastoreError { // Errors specific to when migrate is disabled Uninitialized(String), OldDbVersion(String), - // Database write failure (e.g. 
disk full) - CommitFailed(String), } diff --git a/aw-server/src/endpoints/util.rs b/aw-server/src/endpoints/util.rs index 6a841af4..fdf4a992 100644 --- a/aw-server/src/endpoints/util.rs +++ b/aw-server/src/endpoints/util.rs @@ -98,9 +98,6 @@ impl From<DatastoreError> for HttpErrorJson { DatastoreError::OldDbVersion(msg) => { HttpErrorJson::new(Status::InternalServerError, msg) } - DatastoreError::CommitFailed(msg) => { - HttpErrorJson::new(Status::ServiceUnavailable, msg) - } } } }