From 173723b2130a055b4a9669cbe536ed602ca63aa3 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 05:44:54 +0530 Subject: [PATCH 01/49] improve gitignore for rlm iterations --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 2768254..1b0372b 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ .env .claude/ CLAUDE.md +.lambda-rlm-cache/ +src/.*.md +src/.*.log From fcfffe5006b6452fda5626d31e918c1ecc07d4b6 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 05:47:32 +0530 Subject: [PATCH 02/49] Production hardening: SQLite resilience, fsync durability, WAL checkpointing, zeroize decrypt output - db.rs: Add busy_timeout=5s for lock contention, journal_size_limit=64MB to cap WAL growth, synchronous=NORMAL (safe with WAL) - config.rs: fsync data before rename + fsync parent directory on Unix for crash-safe atomic writes - server.rs: Background WAL checkpoint (TRUNCATE) every 5 min to prevent unbounded WAL growth - server.rs: Validate X-Forwarded-For entries as IpAddr to reject spoofed non-IP values - crypto/vault.rs: decrypt() now returns Zeroizing> so plaintext is wiped from memory on drop - vault.rs: Updated callers to work with zeroized decrypt output --- src/config.rs | 11 +++++++++++ src/crypto/vault.rs | 13 ++++++++----- src/db.rs | 4 +++- src/server.rs | 29 +++++++++++++++++++++++++---- src/vault.rs | 9 ++++++--- 5 files changed, 53 insertions(+), 13 deletions(-) diff --git a/src/config.rs b/src/config.rs index 36d0953..e8cd41a 100644 --- a/src/config.rs +++ b/src/config.rs @@ -22,9 +22,20 @@ pub fn write_secure(path: &Path, data: &[u8]) -> Result<()> { .with_context(|| format!("Failed to write {}", tmp_path.display()))?; file.write_all(data) .with_context(|| format!("Failed to write {}", tmp_path.display()))?; + file.sync_all() + .with_context(|| format!("Failed to fsync {}", tmp_path.display()))?; drop(file); std::fs::rename(&tmp_path, path) .with_context(|| format!("Failed to 
rename {} to {}", tmp_path.display(), path.display()))?; + + // fsync parent directory to ensure rename durability (POSIX requirement) + #[cfg(unix)] + if let Some(parent) = path.parent() { + if let Ok(dir) = std::fs::File::open(parent) { + let _ = dir.sync_all(); + } + } + Ok(()) } diff --git a/src/crypto/vault.rs b/src/crypto/vault.rs index 00ddea3..28c064d 100644 --- a/src/crypto/vault.rs +++ b/src/crypto/vault.rs @@ -40,7 +40,8 @@ pub fn encrypt(key: &[u8; 32], plaintext: &[u8]) -> Result> { } // Expects 12-byte nonce prepended to ciphertext (same format encrypt() produces). -pub fn decrypt(key: &[u8; 32], data: &[u8]) -> Result> { +// Returns Zeroizing> so plaintext is wiped from memory on drop. +pub fn decrypt(key: &[u8; 32], data: &[u8]) -> Result>> { if data.len() < NONCE_SIZE { anyhow::bail!("Encrypted data too short"); } @@ -50,10 +51,12 @@ pub fn decrypt(key: &[u8; 32], data: &[u8]) -> Result> { .map_err(|e| anyhow::anyhow!("Failed to create cipher: {e}"))?; let nonce = Nonce::from_slice(nonce_bytes); - cipher + let plaintext = cipher .decrypt(nonce, ciphertext) .map_err(|_| anyhow::anyhow!("Decryption failed (wrong key or corrupted data)")) - .context("Vault decryption failed") + .context("Vault decryption failed")?; + + Ok(Zeroizing::new(plaintext)) } #[cfg(test)] @@ -66,7 +69,7 @@ mod tests { let plaintext = b"secret data here"; let encrypted = encrypt(&key, plaintext).unwrap(); let decrypted = decrypt(&key, &encrypted).unwrap(); - assert_eq!(plaintext.to_vec(), decrypted); + assert_eq!(plaintext.as_slice(), &*decrypted); } #[test] @@ -112,7 +115,7 @@ mod tests { let key = [42u8; 32]; let encrypted = encrypt(&key, b"").unwrap(); let decrypted = decrypt(&key, &encrypted).unwrap(); - assert!(decrypted.is_empty()); + assert!((*decrypted).is_empty()); } #[test] diff --git a/src/db.rs b/src/db.rs index af79752..10a3514 100644 --- a/src/db.rs +++ b/src/db.rs @@ -23,8 +23,10 @@ pub fn open() -> Result { // WAL mode: fast reads, lets multiple processes access 
the file conn.pragma_update(None, "journal_mode", "WAL")?; - conn.pragma_update(None, "synchronous", "FULL")?; + conn.pragma_update(None, "synchronous", "NORMAL")?; // NORMAL is safe with WAL mode conn.pragma_update(None, "cache_size", "-2000")?; + conn.pragma_update(None, "busy_timeout", "5000")?; // Wait 5s for locks under contention + conn.pragma_update(None, "journal_size_limit", "67108864")?; // Cap WAL at 64MB // CREATE TABLE IF NOT EXISTS is idempotent — safe to run every time migrate(&conn)?; diff --git a/src/server.rs b/src/server.rs index 71d5e08..111b5d8 100644 --- a/src/server.rs +++ b/src/server.rs @@ -106,6 +106,22 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { rate_limiter: std::sync::Mutex::new(HashMap::with_capacity(256)), }); + // Background task: WAL checkpoint every 5 minutes to prevent unbounded WAL growth + let wal_state = state.clone(); + tokio::spawn(async move { + loop { + tokio::time::sleep(std::time::Duration::from_secs(300)).await; + let db_ref = wal_state.clone(); + let _ = tokio::task::spawn_blocking(move || { + if let Ok(conn) = db_ref.db.lock() { + if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { + tracing::warn!("WAL checkpoint failed: {e}"); + } + } + }).await; + } + }); + // Background task: clean expired magic links, old deposit nonces, and stale rate limiter entries let db_clone = state.clone(); tokio::spawn(async move { @@ -233,14 +249,19 @@ async fn handle_deposit( return StatusCode::NOT_FOUND.into_response(); } - // Only trust X-Forwarded-For when running behind a known reverse proxy + // Only trust X-Forwarded-For when running behind a known reverse proxy. + // Parse the rightmost entry as IpAddr to reject spoofed non-IP values. 
let source_ip = if state.behind_proxy { headers .get("x-forwarded-for") .and_then(|v| v.to_str().ok()) - .and_then(|v| v.rsplit(',').next()) - .map(|s| s.trim().to_string()) - .unwrap_or_else(|| addr.ip().to_string()) + .and_then(|v| { + v.rsplit(',') + .next() + .and_then(|s| s.trim().parse::().ok()) + }) + .unwrap_or_else(|| addr.ip()) + .to_string() } else { addr.ip().to_string() }; diff --git a/src/vault.rs b/src/vault.rs index 3eec866..4c7927a 100644 --- a/src/vault.rs +++ b/src/vault.rs @@ -31,7 +31,10 @@ pub fn vault_get(label: &str, vault_key: &[u8; 32]) -> Result> { match result { Some(encrypted) => { let plaintext = crypto_vault::decrypt(vault_key, &encrypted)?; - let value = String::from_utf8(plaintext).context("Vault value is not valid UTF-8")?; + // plaintext is Zeroizing> — borrow, convert, then let it drop (zeroing memory) + let value = std::str::from_utf8(&plaintext) + .context("Vault value is not valid UTF-8")? + .to_string(); Ok(Some(value)) } None => Ok(None), @@ -136,7 +139,7 @@ mod tests { let mut stmt = conn.prepare("SELECT value FROM vault_secrets WHERE label = ?1").unwrap(); let encrypted: Vec = stmt.query_row(["api_key"], |row| row.get(0)).unwrap(); let decrypted = cv::decrypt(&key, &encrypted).unwrap(); - assert_eq!(String::from_utf8(decrypted).unwrap(), "sk-12345"); + assert_eq!(std::str::from_utf8(&decrypted).unwrap(), "sk-12345"); } #[test] @@ -148,7 +151,7 @@ mod tests { let mut stmt = conn.prepare("SELECT value FROM vault_secrets WHERE label = ?1").unwrap(); let encrypted: Vec = stmt.query_row(["k"], |row| row.get(0)).unwrap(); let decrypted = cv::decrypt(&key, &encrypted).unwrap(); - assert_eq!(String::from_utf8(decrypted).unwrap(), "v2"); + assert_eq!(std::str::from_utf8(&decrypted).unwrap(), "v2"); } #[test] From 7a34d2e552ed7a724760ec3dd746f4c6659ba483 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 05:47:43 +0530 Subject: [PATCH 03/49] Update README with changelog for fcfffe5 --- README.md | 9 +++++++++ 1 file 
changed, 9 insertions(+) diff --git a/README.md b/README.md index eb451d3..7d3c130 100644 --- a/README.md +++ b/README.md @@ -206,6 +206,15 @@ cargo test # 65 tests Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwin`, `aarch64-apple-darwin`. +## Changelog + +**fcfffe5** — Production hardening: SQLite resilience, fsync durability, WAL checkpointing, zeroize decrypt output +- SQLite: `busy_timeout=5s`, `journal_size_limit=64MB`, `synchronous=NORMAL` (WAL-safe) +- Atomic writes: fsync data + parent directory on Unix for crash safety +- Background WAL checkpoint (TRUNCATE) every 5 min to cap disk growth +- X-Forwarded-For IP validation to reject spoofed non-IP values +- `decrypt()` returns `Zeroizing>` — plaintext wiped from memory on drop + ## Roadmap - [x] Identity (agent.json + Ed25519) From fdee3d2bd849b0d1a37e6a5a4936884a64d9d80d Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 05:54:13 +0530 Subject: [PATCH 04/49] Harden server: 404-everything, SIGTERM/SIGHUP handling, shutdown WAL checkpoint, ciphertext size limit, SQLite mmap - Error sanitization: all HTTP error paths return 404 (no 500s), preventing info leakage - SIGTERM handling: shutdown_signal() now catches SIGTERM (from `atomic stop`) in addition to SIGINT - SIGHUP TLS reload: `kill -HUP` triggers immediate cert reload (zero-delay vs 12h polling) - Shutdown WAL checkpoint: PRAGMA wal_checkpoint(TRUNCATE) runs before exit for data integrity - Graceful shutdown timeout increased from 10s to 30s for slow-disk WAL merge - Ciphertext size validation: decrypt() rejects >16MB payloads before allocation - SQLite tuning: wal_autocheckpoint=1000 pages, mmap_size=64MB for read performance --- src/crypto/vault.rs | 12 +++++++ src/db.rs | 2 ++ src/server.rs | 85 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 84 insertions(+), 15 deletions(-) diff --git a/src/crypto/vault.rs b/src/crypto/vault.rs index 28c064d..d39ca4a 100644 --- a/src/crypto/vault.rs +++ 
b/src/crypto/vault.rs @@ -10,6 +10,7 @@ use zeroize::Zeroizing; const NONCE_SIZE: usize = 12; const HKDF_SALT: &[u8] = b"atomic-v1"; +const MAX_CIPHERTEXT_SIZE: usize = 16 * 1024 * 1024; // 16 MB // HKDF with salt and "atomic-vault" context, so the vault key differs from the signing key. pub fn derive_vault_key(private_key_bytes: &[u8; 32]) -> Result> { @@ -45,6 +46,9 @@ pub fn decrypt(key: &[u8; 32], data: &[u8]) -> Result>> { if data.len() < NONCE_SIZE { anyhow::bail!("Encrypted data too short"); } + if data.len() > MAX_CIPHERTEXT_SIZE { + anyhow::bail!("Encrypted data too large"); + } let (nonce_bytes, ciphertext) = data.split_at(NONCE_SIZE); let cipher = Aes256Gcm::new_from_slice(key) @@ -110,6 +114,14 @@ mod tests { assert!(decrypt(&key, &[0u8; 5]).is_err()); } + #[test] + fn decrypt_oversized_rejected() { + let key = [42u8; 32]; + let oversized = vec![0u8; MAX_CIPHERTEXT_SIZE + 1]; + let err = decrypt(&key, &oversized).unwrap_err(); + assert!(err.to_string().contains("too large")); + } + #[test] fn encrypt_decrypt_empty_plaintext() { let key = [42u8; 32]; diff --git a/src/db.rs b/src/db.rs index 10a3514..a755385 100644 --- a/src/db.rs +++ b/src/db.rs @@ -27,6 +27,8 @@ pub fn open() -> Result { conn.pragma_update(None, "cache_size", "-2000")?; conn.pragma_update(None, "busy_timeout", "5000")?; // Wait 5s for locks under contention conn.pragma_update(None, "journal_size_limit", "67108864")?; // Cap WAL at 64MB + conn.pragma_update(None, "wal_autocheckpoint", "1000")?; // Checkpoint every 1000 pages + conn.pragma_update(None, "mmap_size", "67108864")?; // 64MB memory-mapped I/O for reads // CREATE TABLE IF NOT EXISTS is idempotent — safe to run every time migrate(&conn)?; diff --git a/src/server.rs b/src/server.rs index 111b5d8..5accfb5 100644 --- a/src/server.rs +++ b/src/server.rs @@ -155,6 +155,8 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { .allow_methods(Any) .allow_headers(Any); + let shutdown_state = state.clone(); + let 
public_routes = Router::new() .route("/.well-known/agent.json", get(serve_agent_json)) .layer(cors); @@ -195,11 +197,37 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { if is_auto { crate::tls::spawn_renewal_watcher(rustls_config.clone()); } + // Reload TLS cert on SIGHUP (works for both auto and custom certs) + #[cfg(unix)] + { + let sighup_config = rustls_config.clone(); + tokio::spawn(async move { + use tokio::signal::unix::{signal, SignalKind}; + let mut sighup = signal(SignalKind::hangup()) + .expect("Failed to install SIGHUP handler"); + loop { + sighup.recv().await; + let tls_dir = match crate::config::tls_dir() { + Ok(d) => d, + Err(e) => { + tracing::warn!("SIGHUP cert reload failed: {e}"); + continue; + } + }; + let cert_path = tls_dir.join("fullchain.pem"); + let key_path = tls_dir.join("key.pem"); + match sighup_config.reload_from_pem_file(&cert_path, &key_path).await { + Ok(()) => info!("TLS cert reloaded (SIGHUP)"), + Err(e) => tracing::warn!("TLS cert reload failed (SIGHUP): {e}"), + } + } + }); + } let handle = axum_server::Handle::new(); let shutdown_handle = handle.clone(); tokio::spawn(async move { shutdown_signal().await; - shutdown_handle.graceful_shutdown(Some(std::time::Duration::from_secs(10))); + shutdown_handle.graceful_shutdown(Some(std::time::Duration::from_secs(30))); }); let mode_label = if is_auto { "acme.sh" } else { "custom cert" }; info!("Listening on {} (HTTPS/{}, PID {})", addr, mode_label, std::process::id()); @@ -211,6 +239,15 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { } } + // Final WAL checkpoint before exit to ensure all data is merged + let _ = tokio::task::spawn_blocking(move || { + if let Ok(conn) = shutdown_state.db.lock() { + if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { + tracing::warn!("Final WAL checkpoint failed: {e}"); + } + } + }).await; + let _ = std::fs::remove_file(&pid_path); info!("Server stopped"); Ok(()) @@ -291,20 +328,21 @@ async fn 
handle_deposit( let resp = DepositResponse { status: "deposited", label }; match serde_json::to_string(&resp) { Ok(json) => (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], json).into_response(), - Err(_) => StatusCode::INTERNAL_SERVER_ERROR.into_response(), + Err(_) => StatusCode::NOT_FOUND.into_response(), } } Ok(Err(e)) => { - let err_msg = format!("{e}"); - if err_msg.contains("replay") || err_msg.contains("Nonce already used") { - return StatusCode::NOT_FOUND.into_response(); + let msg = e.to_string(); + if msg.contains("Nonce already used") { + tracing::debug!("Deposit replay rejected"); + } else { + tracing::error!("Deposit failed: {e}"); } - tracing::error!("Deposit failed: {}", e); - StatusCode::INTERNAL_SERVER_ERROR.into_response() + StatusCode::NOT_FOUND.into_response() } Err(e) => { - tracing::error!("Deposit task panicked: {}", e); - StatusCode::INTERNAL_SERVER_ERROR.into_response() + tracing::error!("Deposit task panicked: {e}"); + StatusCode::NOT_FOUND.into_response() } } } @@ -337,17 +375,17 @@ async fn handle_magic_link( let resp = MagicLinkResponse { status: "verified" }; match serde_json::to_string(&resp) { Ok(json) => (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], json).into_response(), - Err(_) => StatusCode::INTERNAL_SERVER_ERROR.into_response(), + Err(_) => StatusCode::NOT_FOUND.into_response(), } } Ok(Ok(None)) => StatusCode::NOT_FOUND.into_response(), Ok(Err(e)) => { tracing::error!("Magic link DB error: {e}"); - StatusCode::INTERNAL_SERVER_ERROR.into_response() + StatusCode::NOT_FOUND.into_response() } Err(e) => { tracing::error!("Magic link task panicked: {e}"); - StatusCode::INTERNAL_SERVER_ERROR.into_response() + StatusCode::NOT_FOUND.into_response() } } } @@ -389,8 +427,25 @@ async fn security_headers( } async fn shutdown_signal() { - tokio::signal::ctrl_c() - .await - .expect("Failed to install CTRL+C handler"); + #[cfg(unix)] + { + use tokio::signal::unix::{signal, SignalKind}; + let mut sigterm = 
signal(SignalKind::terminate()) + .expect("Failed to install SIGTERM handler"); + tokio::select! { + r = tokio::signal::ctrl_c() => { + if let Err(e) = r { + tracing::error!("CTRL+C handler error: {e}"); + } + } + _ = sigterm.recv() => {} + } + } + #[cfg(not(unix))] + { + tokio::signal::ctrl_c() + .await + .expect("Failed to install CTRL+C handler"); + } info!("Shutdown signal received"); } From fa48e9c2c406486eef76e9a2a3207ce3f3d7c76c Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 05:54:24 +0530 Subject: [PATCH 05/49] Update README with changelog for fdee3d2 --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 7d3c130..2390436 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,15 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi ## Changelog +**fdee3d2** — Harden server: 404-everything, SIGTERM/SIGHUP handling, shutdown WAL checkpoint, ciphertext size limit, SQLite mmap +- All HTTP error paths return 404 (no 500s) to prevent information leakage +- `shutdown_signal()` handles SIGTERM (from `atomic stop`) and SIGINT +- `kill -HUP` triggers immediate TLS cert reload on Unix (zero-delay vs 12h poll) +- Final WAL checkpoint (TRUNCATE) on shutdown for data integrity +- Graceful shutdown timeout 10s → 30s for slow-disk WAL merge +- `decrypt()` rejects ciphertext >16MB before allocation (resource exhaustion defense) +- SQLite: `wal_autocheckpoint=1000` pages, `mmap_size=64MB` for read throughput + **fcfffe5** — Production hardening: SQLite resilience, fsync durability, WAL checkpointing, zeroize decrypt output - SQLite: `busy_timeout=5s`, `journal_size_limit=64MB`, `synchronous=NORMAL` (WAL-safe) - Atomic writes: fsync data + parent directory on Unix for crash safety From a7d7afbcb6fe11419469b714bba21f8a928fce5b Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 05:59:27 +0530 Subject: [PATCH 06/49] Resilience hardening: supervised tasks, health 
endpoint, input validation, SQLite tuning - Background tasks (WAL checkpoint, DB cleanup) now restart automatically on panic - Added /_/health endpoint checking DB responsiveness and agent.json integrity - Input validation rejects non-printable chars and overlong labels/codes - SQLite cache bumped to 64MB, temp_store set to MEMORY - X-Forwarded-For misconfiguration warning when behind_proxy=false - Mutex poisoning recovery uses into_inner() consistently --- src/db.rs | 3 +- src/server.rs | 116 ++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 95 insertions(+), 24 deletions(-) diff --git a/src/db.rs b/src/db.rs index a755385..57bdc5f 100644 --- a/src/db.rs +++ b/src/db.rs @@ -24,8 +24,9 @@ pub fn open() -> Result { // WAL mode: fast reads, lets multiple processes access the file conn.pragma_update(None, "journal_mode", "WAL")?; conn.pragma_update(None, "synchronous", "NORMAL")?; // NORMAL is safe with WAL mode - conn.pragma_update(None, "cache_size", "-2000")?; + conn.pragma_update(None, "cache_size", "-64000")?; // 64MB page cache conn.pragma_update(None, "busy_timeout", "5000")?; // Wait 5s for locks under contention + conn.pragma_update(None, "temp_store", "MEMORY")?; // Temp tables/indexes in memory conn.pragma_update(None, "journal_size_limit", "67108864")?; // Cap WAL at 64MB conn.pragma_update(None, "wal_autocheckpoint", "1000")?; // Checkpoint every 1000 pages conn.pragma_update(None, "mmap_size", "67108864")?; // 64MB memory-mapped I/O for reads diff --git a/src/server.rs b/src/server.rs index 5accfb5..bc0eb49 100644 --- a/src/server.rs +++ b/src/server.rs @@ -9,6 +9,7 @@ use axum::{ }; use serde::Serialize; use std::collections::HashMap; +use std::future::Future; use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; use tower_http::cors::{Any, CorsLayer}; @@ -19,9 +20,37 @@ use crate::config; use crate::credentials::Credentials; use crate::tls::TlsMode; +/// Spawn a supervised background task that restarts on panic/error with backoff. 
+fn spawn_supervised(name: &'static str, make_task: F) +where + F: Fn() -> Fut + Send + 'static, + Fut: Future + Send + 'static, +{ + tokio::spawn(async move { + loop { + let result = tokio::spawn(make_task()).await; + match result { + Ok(()) => break, // clean exit + Err(e) => { + tracing::error!("{name} task panicked: {e}. Restarting in 5s..."); + tokio::time::sleep(std::time::Duration::from_secs(5)).await; + } + } + } + }); +} + const RATE_LIMIT_WINDOW_SECS: i64 = 60; const RATE_LIMIT_MAX_REQUESTS: u32 = 10; const RATE_LIMIT_MAX_ENTRIES: usize = 10_000; +const MAX_INPUT_LEN: usize = 256; + +/// Reject inputs with non-printable characters or excessive length. +fn is_valid_input(s: &str) -> bool { + !s.is_empty() + && s.len() <= MAX_INPUT_LEN + && s.bytes().all(|b| b >= 0x20 && b != 0x7F) +} pub struct AppState { pub agent_json_cached: bytes::Bytes, @@ -108,28 +137,32 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { // Background task: WAL checkpoint every 5 minutes to prevent unbounded WAL growth let wal_state = state.clone(); - tokio::spawn(async move { - loop { - tokio::time::sleep(std::time::Duration::from_secs(300)).await; - let db_ref = wal_state.clone(); - let _ = tokio::task::spawn_blocking(move || { - if let Ok(conn) = db_ref.db.lock() { + spawn_supervised("wal-checkpoint", move || { + let st = wal_state.clone(); + async move { + loop { + tokio::time::sleep(std::time::Duration::from_secs(300)).await; + let db_ref = st.clone(); + let _ = tokio::task::spawn_blocking(move || { + let conn = db_ref.db.lock().unwrap_or_else(|e| e.into_inner()); if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { tracing::warn!("WAL checkpoint failed: {e}"); } - } - }).await; + }).await; + } } }); // Background task: clean expired magic links, old deposit nonces, and stale rate limiter entries - let db_clone = state.clone(); - tokio::spawn(async move { - loop { - tokio::time::sleep(std::time::Duration::from_secs(3600)).await; - let db_ref 
= db_clone.clone(); - let _ = tokio::task::spawn_blocking(move || { - if let Ok(conn) = db_ref.db.lock() { + let cleanup_state = state.clone(); + spawn_supervised("db-cleanup", move || { + let st = cleanup_state.clone(); + async move { + loop { + tokio::time::sleep(std::time::Duration::from_secs(3600)).await; + let db_ref = st.clone(); + let _ = tokio::task::spawn_blocking(move || { + let conn = db_ref.db.lock().unwrap_or_else(|e| e.into_inner()); let now = chrono::Utc::now().timestamp(); if let Err(e) = conn.execute("DELETE FROM magic_links WHERE expires_at <= ?1", [now]) { tracing::warn!("Failed to clean expired magic links: {e}"); @@ -138,13 +171,12 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { if let Err(e) = conn.execute("DELETE FROM used_deposits WHERE used_at < ?1", [cutoff]) { tracing::warn!("Failed to clean old deposit nonces: {e}"); } - } - // Evict stale rate limiter entries - if let Ok(mut map) = db_ref.rate_limiter.lock() { + // Evict stale rate limiter entries + let mut map = db_ref.rate_limiter.lock().unwrap_or_else(|e| e.into_inner()); let now = chrono::Utc::now().timestamp(); map.retain(|_, (_, window_start)| now - *window_start <= RATE_LIMIT_WINDOW_SECS); - } - }).await; + }).await; + } } }); @@ -166,6 +198,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { .merge(public_routes) .route("/d/{token}", post(handle_deposit)) .route("/m/{code}", get(handle_magic_link)) + .route("/_/health", get(handle_health)) .fallback(handle_404) .layer(DefaultBodyLimit::max(MAX_BODY_SIZE)) .layer(middleware::from_fn_with_state( @@ -282,7 +315,8 @@ async fn handle_deposit( None => return StatusCode::NOT_FOUND.into_response(), }; - if body.is_empty() { + // Reject empty body or labels with non-printable/overlong content + if body.is_empty() || !is_valid_input(&payload.label) { return StatusCode::NOT_FOUND.into_response(); } @@ -300,6 +334,16 @@ async fn handle_deposit( .unwrap_or_else(|| addr.ip()) .to_string() } else { + // 
Warn once if XFF header is present without proxy mode — likely misconfiguration + if headers.contains_key("x-forwarded-for") { + tracing::warn_span!("deposit").in_scope(|| { + tracing::warn!( + "X-Forwarded-For header present but behind_proxy=false; \ + ignoring header and using direct IP. \ + If behind a reverse proxy, re-init with --proxy." + ); + }); + } addr.ip().to_string() }; let user_agent = headers @@ -357,8 +401,8 @@ async fn handle_magic_link( return StatusCode::TOO_MANY_REQUESTS.into_response(); } - // Reject obviously short codes before touching the DB (host() enforces >= 20 chars) - if code.len() < 20 { + // Reject obviously short codes or codes with non-printable chars before touching the DB + if code.len() < 20 || !is_valid_input(&code) { return StatusCode::NOT_FOUND.into_response(); } @@ -390,6 +434,32 @@ async fn handle_magic_link( } } +async fn handle_health(State(state): State>) -> Response { + // Check DB is responsive + let db_ok = { + let st = state.clone(); + tokio::task::spawn_blocking(move || { + let conn = st.db.lock().unwrap_or_else(|e| e.into_inner()); + conn.execute_batch("SELECT 1").is_ok() + }) + .await + .unwrap_or(false) + }; + + // Check agent.json is valid JSON + let agent_ok = serde_json::from_slice::(&state.agent_json_cached).is_ok(); + + if db_ok && agent_ok { + (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], r#"{"status":"ok"}"#).into_response() + } else { + let detail = format!( + r#"{{"status":"degraded","db":{},"agent_json":{}}}"#, + db_ok, agent_ok + ); + (StatusCode::SERVICE_UNAVAILABLE, [(header::CONTENT_TYPE, "application/json")], detail).into_response() + } +} + async fn handle_404() -> StatusCode { StatusCode::NOT_FOUND } From ddd72286b1540cecd6086e81b8a34259e1a60cb0 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 05:59:53 +0530 Subject: [PATCH 07/49] Update README with changelog for a7d7afb --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md 
index 2390436..0e86e19 100644 --- a/README.md +++ b/README.md @@ -217,6 +217,14 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi - `decrypt()` rejects ciphertext >16MB before allocation (resource exhaustion defense) - SQLite: `wal_autocheckpoint=1000` pages, `mmap_size=64MB` for read throughput +**a7d7afb** — Resilience hardening: supervised tasks, health endpoint, input validation, SQLite tuning +- Background tasks (WAL checkpoint, DB cleanup) auto-restart on panic with 5s backoff +- `/_/health` endpoint: checks DB + agent.json, returns 200/503 for load balancer integration +- Input validation rejects non-printable chars and labels > 256 bytes on deposit and magic link paths +- SQLite page cache bumped to 64MB, temp tables stored in memory +- X-Forwarded-For warning when header present but `behind_proxy=false` (misconfiguration detection) +- Mutex poisoning recovery uses `into_inner()` consistently across all lock sites + **fcfffe5** — Production hardening: SQLite resilience, fsync durability, WAL checkpointing, zeroize decrypt output - SQLite: `busy_timeout=5s`, `journal_size_limit=64MB`, `synchronous=NORMAL` (WAL-safe) - Atomic writes: fsync data + parent directory on Unix for crash safety From a6095345c43084807cedbde035969f0ea89a9df3 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:03:26 +0530 Subject: [PATCH 08/49] Exponential backoff for supervised tasks, deposit_log retention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - spawn_supervised: replace fixed 5s restart delay with exponential backoff (5s → 10s → 20s → ... 
→ 320s cap) to prevent spin loops on persistent failures like disk full - Cleanup task: purge deposit_log entries older than 90 days to prevent unbounded disk growth on long-lived servers --- src/server.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/server.rs b/src/server.rs index bc0eb49..5ab7c59 100644 --- a/src/server.rs +++ b/src/server.rs @@ -20,20 +20,23 @@ use crate::config; use crate::credentials::Credentials; use crate::tls::TlsMode; -/// Spawn a supervised background task that restarts on panic/error with backoff. +/// Spawn a supervised background task that restarts on panic/error with exponential backoff. fn spawn_supervised(name: &'static str, make_task: F) where F: Fn() -> Fut + Send + 'static, Fut: Future + Send + 'static, { tokio::spawn(async move { + let mut retries: u32 = 0; loop { let result = tokio::spawn(make_task()).await; match result { Ok(()) => break, // clean exit Err(e) => { - tracing::error!("{name} task panicked: {e}. Restarting in 5s..."); - tokio::time::sleep(std::time::Duration::from_secs(5)).await; + let delay_secs = 5_u64.saturating_mul(1u64 << retries.min(6)); // 5s, 10s, 20s, ..., 320s (capped) + tracing::error!("{name} task panicked: {e}. 
Restarting in {delay_secs}s..."); + tokio::time::sleep(std::time::Duration::from_secs(delay_secs)).await; + retries = retries.saturating_add(1); } } } @@ -171,6 +174,11 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { if let Err(e) = conn.execute("DELETE FROM used_deposits WHERE used_at < ?1", [cutoff]) { tracing::warn!("Failed to clean old deposit nonces: {e}"); } + // Purge deposit log entries older than 90 days to prevent unbounded disk growth + let log_cutoff = now - 90 * 86400; + if let Err(e) = conn.execute("DELETE FROM deposit_log WHERE deposited_at < ?1", [log_cutoff]) { + tracing::warn!("Failed to clean old deposit log entries: {e}"); + } // Evict stale rate limiter entries let mut map = db_ref.rate_limiter.lock().unwrap_or_else(|e| e.into_inner()); let now = chrono::Utc::now().timestamp(); From 0f7da71529f4c36070abb63bc270e9645279f8d1 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:03:35 +0530 Subject: [PATCH 09/49] Update README with changelog for a609534 --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 0e86e19..19226ad 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,10 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi ## Changelog +**a609534** — Exponential backoff for supervised tasks, deposit_log retention +- `spawn_supervised` uses exponential backoff (5s → 10s → ... → 320s cap) instead of fixed 5s delay to prevent spin loops on persistent failures (e.g. 
disk full) +- Cleanup task purges `deposit_log` entries older than 90 days to prevent unbounded disk growth on long-lived servers + **fdee3d2** — Harden server: 404-everything, SIGTERM/SIGHUP handling, shutdown WAL checkpoint, ciphertext size limit, SQLite mmap - All HTTP error paths return 404 (no 500s) to prevent information leakage - `shutdown_signal()` handles SIGTERM (from `atomic stop`) and SIGINT From a3f46889ba12f6c8d29e94676cbdcb5163545519 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:07:08 +0530 Subject: [PATCH 10/49] =?UTF-8?q?=CE=BB-RLM=20iter=205:=20rate=20limiter?= =?UTF-8?q?=20GC,=20supervisor=20circuit=20breaker,=20WAL=20checkpoint=20s?= =?UTF-8?q?trategy,=20PID=20race=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rate limiter: always evict stale entries on every access instead of only at 10k cap, preventing unbounded memory under IPv6 scanning / DDoS - Supervisor circuit breaker: abort process after 5 restarts within 5 minutes to prevent resource exhaustion on unrecoverable errors (e.g. 
SQLite corruption) - WAL checkpointing: use PASSIVE every 5m (non-blocking), TRUNCATE hourly to reclaim space - PID stop command: verify process alive with kill -0 immediately before SIGTERM to minimize PID reuse race window --- src/main.rs | 9 +++++++++ src/server.rs | 40 +++++++++++++++++++++++++++++++++++----- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/src/main.rs b/src/main.rs index b18e5e7..105058c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -72,6 +72,15 @@ async fn main() -> Result<()> { } } + // Verify process still alive immediately before kill to minimize PID reuse window + let probe = std::process::Command::new("kill") + .args(["-0", &pid.to_string()]) + .status(); + if !probe.map(|s| s.success()).unwrap_or(false) { + let _ = std::fs::remove_file(&pid_path); + anyhow::bail!("PID {pid} no longer exists (stale PID file removed)"); + } + // Send SIGTERM let status = std::process::Command::new("kill") .arg(pid.to_string()) diff --git a/src/server.rs b/src/server.rs index 5ab7c59..a8b9370 100644 --- a/src/server.rs +++ b/src/server.rs @@ -20,7 +20,14 @@ use crate::config; use crate::credentials::Credentials; use crate::tls::TlsMode; +/// Max restarts within the circuit breaker window before we abort the process. +const SUPERVISOR_MAX_RESTARTS: u32 = 5; +/// Circuit breaker window: if SUPERVISOR_MAX_RESTARTS occur within this duration, fail-fast. +const SUPERVISOR_WINDOW_SECS: u64 = 300; // 5 minutes + /// Spawn a supervised background task that restarts on panic/error with exponential backoff. +/// Circuit breaker: if 5 restarts occur within 5 minutes, abort the process to prevent +/// resource exhaustion on unrecoverable errors (e.g., SQLite corruption). 
fn spawn_supervised(name: &'static str, make_task: F) where F: Fn() -> Fut + Send + 'static, @@ -28,13 +35,29 @@ where { tokio::spawn(async move { let mut retries: u32 = 0; + let mut window_start = std::time::Instant::now(); + let mut window_restarts: u32 = 0; loop { let result = tokio::spawn(make_task()).await; match result { Ok(()) => break, // clean exit Err(e) => { + let now = std::time::Instant::now(); + // Reset circuit breaker window if enough time has passed + if now.duration_since(window_start).as_secs() > SUPERVISOR_WINDOW_SECS { + window_start = now; + window_restarts = 0; + } + window_restarts += 1; + if window_restarts >= SUPERVISOR_MAX_RESTARTS { + tracing::error!( + "{name} task failed {SUPERVISOR_MAX_RESTARTS} times in {SUPERVISOR_WINDOW_SECS}s — aborting process" + ); + std::process::exit(1); + } + let delay_secs = 5_u64.saturating_mul(1u64 << retries.min(6)); // 5s, 10s, 20s, ..., 320s (capped) - tracing::error!("{name} task panicked: {e}. Restarting in {delay_secs}s..."); + tracing::error!("{name} task panicked: {e}. 
Restarting in {delay_secs}s ({window_restarts}/{SUPERVISOR_MAX_RESTARTS} in window)..."); tokio::time::sleep(std::time::Duration::from_secs(delay_secs)).await; retries = retries.saturating_add(1); } @@ -76,9 +99,11 @@ impl AppState { pub fn check_rate_limit(&self, ip: IpAddr) -> bool { let now = chrono::Utc::now().timestamp(); let mut map = self.rate_limiter.lock().unwrap_or_else(|e| e.into_inner()); - // Evict stale entries when the map grows too large to prevent memory exhaustion + // Always evict stale entries to guarantee bounded memory regardless of attacker volume + map.retain(|_, (_, window_start)| now - *window_start <= RATE_LIMIT_WINDOW_SECS); + // Hard cap as defense-in-depth against clock skew or other edge cases if map.len() >= RATE_LIMIT_MAX_ENTRIES { - map.retain(|_, (_, window_start)| now - *window_start <= RATE_LIMIT_WINDOW_SECS); + return false; } let entry = map.entry(ip).or_insert((0, now)); if now - entry.1 > RATE_LIMIT_WINDOW_SECS { @@ -138,7 +163,8 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { rate_limiter: std::sync::Mutex::new(HashMap::with_capacity(256)), }); - // Background task: WAL checkpoint every 5 minutes to prevent unbounded WAL growth + // Background task: WAL checkpoint every 5 minutes (PASSIVE to avoid blocking writers; + // the hourly cleanup task runs TRUNCATE to actually reclaim WAL disk space). 
let wal_state = state.clone(); spawn_supervised("wal-checkpoint", move || { let st = wal_state.clone(); @@ -148,7 +174,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { let db_ref = st.clone(); let _ = tokio::task::spawn_blocking(move || { let conn = db_ref.db.lock().unwrap_or_else(|e| e.into_inner()); - if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { + if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(PASSIVE);") { tracing::warn!("WAL checkpoint failed: {e}"); } }).await; @@ -179,6 +205,10 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { if let Err(e) = conn.execute("DELETE FROM deposit_log WHERE deposited_at < ?1", [log_cutoff]) { tracing::warn!("Failed to clean old deposit log entries: {e}"); } + // TRUNCATE checkpoint hourly to reclaim WAL disk space + if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { + tracing::warn!("Hourly WAL TRUNCATE checkpoint failed: {e}"); + } // Evict stale rate limiter entries let mut map = db_ref.rate_limiter.lock().unwrap_or_else(|e| e.into_inner()); let now = chrono::Utc::now().timestamp(); From 672570e82d5bbc1bb90e21b5ee14e811ca344d1c Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:07:28 +0530 Subject: [PATCH 11/49] Update README with changelog for a3f4688 --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 19226ad..6292a79 100644 --- a/README.md +++ b/README.md @@ -236,6 +236,12 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi - X-Forwarded-For IP validation to reject spoofed non-IP values - `decrypt()` returns `Zeroizing>` — plaintext wiped from memory on drop +**a3f4688** — Rate limiter GC, supervisor circuit breaker, WAL checkpoint strategy, PID race fix +- Rate limiter evicts stale entries on every access (bounded memory under DDoS/IPv6 scanning) +- Supervisor circuit breaker: process aborts after 5 restarts in 5 minutes 
(prevents resource exhaustion on unrecoverable errors) +- WAL checkpointing split: PASSIVE every 5m (non-blocking), TRUNCATE hourly (reclaims disk) +- `atomic stop`: kill -0 probe before SIGTERM to minimize PID reuse race window + ## Roadmap - [x] Identity (agent.json + Ed25519) From 1b65780ccf092db4f982a23acb0575df0d3ef9d0 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:17:07 +0530 Subject: [PATCH 12/49] =?UTF-8?q?=CE=BB-RLM=20iter=206:=20connection=20poo?= =?UTF-8?q?l,=20handler=20timeouts,=20credential=20zeroize,=20startup=20gu?= =?UTF-8?q?ards?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace Mutex with zero-dep channel-based DbPool (4 conns), unlocking WAL concurrent readers instead of serializing all requests - Wrap handler DB ops in tokio::time::timeout(5s) to prevent unbounded task accumulation from slow I/O or lock contention - Lower SQLite busy_timeout to 4s (below tokio 5s) for clean BUSY errors - Zeroize credential JSON buffer in save() via Zeroizing> to prevent private key material lingering in freed heap memory - Add RLIMIT_NOFILE check at server startup (warn if <4096) - Add log file size warning at startup (warn if >100MB) --- src/credentials.rs | 7 +- src/db.rs | 106 +++++++++++++++++++++++++----- src/server.rs | 156 +++++++++++++++++++++++++++++++++------------ 3 files changed, 211 insertions(+), 58 deletions(-) diff --git a/src/credentials.rs b/src/credentials.rs index 1858c95..0ec2de9 100644 --- a/src/credentials.rs +++ b/src/credentials.rs @@ -73,8 +73,11 @@ impl Credentials { } pub fn save(&self, path: &Path) -> Result<()> { - let json = serde_json::to_string_pretty(self).context("Failed to serialize credentials")?; - crate::config::write_secure(path, json.as_bytes()) + // Write directly into a Zeroizing> so the private key material + // in the serialized JSON is zeroed on drop, not left in freed heap memory. 
+ let mut buf = zeroize::Zeroizing::new(Vec::new()); + serde_json::to_writer_pretty(&mut *buf, self).context("Failed to serialize credentials")?; + crate::config::write_secure(path, &buf) } pub fn load(path: &Path) -> Result { diff --git a/src/db.rs b/src/db.rs index 57bdc5f..d3abc93 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,12 +1,55 @@ use anyhow::{Context, Result}; use rusqlite::Connection; +use std::path::Path; +use std::sync::mpsc; use crate::config; -pub fn open() -> Result { - let db_path = config::atomic_dir()?.join("atomic.db"); +/// Zero-dependency connection pool for SQLite. +/// Uses a bounded channel to distribute pre-opened connections. +/// WAL mode allows concurrent readers; the pool prevents serialization +/// behind a single Mutex. +pub struct DbPool { + sender: mpsc::Sender, + receiver: std::sync::Mutex>, +} + +/// RAII guard that returns the connection to the pool on drop. +pub struct PooledConn<'a> { + pool: &'a DbPool, + conn: Option, +} + +impl<'a> std::ops::Deref for PooledConn<'a> { + type Target = Connection; + fn deref(&self) -> &Connection { + self.conn.as_ref().expect("PooledConn used after take") + } +} - // Pre-create DB file with restricted permissions (0600) before SQLite opens it +impl Drop for PooledConn<'_> { + fn drop(&mut self) { + if let Some(c) = self.conn.take() { + let _ = self.pool.sender.send(c); + } + } +} + +impl DbPool { + /// Get a connection from the pool, blocking up to 5 seconds. 
+ pub fn get(&self) -> Result> { + let rx = self.receiver.lock().unwrap_or_else(|e| e.into_inner()); + let conn = rx + .recv_timeout(std::time::Duration::from_secs(5)) + .map_err(|_| anyhow::anyhow!("DB pool exhausted (5s timeout)"))?; + Ok(PooledConn { + pool: self, + conn: Some(conn), + }) + } +} + +fn ensure_db_file(db_path: &Path) -> Result<()> { #[cfg(unix)] if !db_path.exists() { use std::os::unix::fs::OpenOptionsExt; @@ -14,26 +57,59 @@ pub fn open() -> Result { .write(true) .create_new(true) .mode(0o600) - .open(&db_path) + .open(db_path) .with_context(|| format!("Failed to create database at {}", db_path.display()))?; } + Ok(()) +} - let conn = Connection::open(&db_path) +fn open_connection(db_path: &Path) -> Result { + let conn = Connection::open(db_path) .with_context(|| format!("Failed to open database at {}", db_path.display()))?; // WAL mode: fast reads, lets multiple processes access the file conn.pragma_update(None, "journal_mode", "WAL")?; - conn.pragma_update(None, "synchronous", "NORMAL")?; // NORMAL is safe with WAL mode - conn.pragma_update(None, "cache_size", "-64000")?; // 64MB page cache - conn.pragma_update(None, "busy_timeout", "5000")?; // Wait 5s for locks under contention - conn.pragma_update(None, "temp_store", "MEMORY")?; // Temp tables/indexes in memory - conn.pragma_update(None, "journal_size_limit", "67108864")?; // Cap WAL at 64MB - conn.pragma_update(None, "wal_autocheckpoint", "1000")?; // Checkpoint every 1000 pages - conn.pragma_update(None, "mmap_size", "67108864")?; // 64MB memory-mapped I/O for reads - - // CREATE TABLE IF NOT EXISTS is idempotent — safe to run every time - migrate(&conn)?; + conn.pragma_update(None, "synchronous", "NORMAL")?; + conn.pragma_update(None, "cache_size", "-64000")?; + // 4s busy_timeout: lower than the 5s tokio::time::timeout on handlers, + // so SQLite returns BUSY cleanly before the task gets cancelled. 
+ conn.pragma_update(None, "busy_timeout", "4000")?; + conn.pragma_update(None, "temp_store", "MEMORY")?; + conn.pragma_update(None, "journal_size_limit", "67108864")?; + conn.pragma_update(None, "wal_autocheckpoint", "1000")?; + conn.pragma_update(None, "mmap_size", "67108864")?; + + Ok(conn) +} + +/// Open a connection pool with `size` connections, each configured for WAL mode. +/// Migrations run once on the first connection. +pub fn open_pool(size: usize) -> Result { + let db_path = config::atomic_dir()?.join("atomic.db"); + ensure_db_file(&db_path)?; + + let first = open_connection(&db_path)?; + migrate(&first)?; + + let (tx, rx) = mpsc::channel(); + tx.send(first).expect("channel just created"); + for _ in 1..size { + tx.send(open_connection(&db_path)?).expect("channel just created"); + } + + Ok(DbPool { + sender: tx, + receiver: std::sync::Mutex::new(rx), + }) +} + +/// Open a single connection (for CLI commands that don't need a pool). +pub fn open() -> Result { + let db_path = config::atomic_dir()?.join("atomic.db"); + ensure_db_file(&db_path)?; + let conn = open_connection(&db_path)?; + migrate(&conn)?; Ok(conn) } diff --git a/src/server.rs b/src/server.rs index a8b9370..2cccd63 100644 --- a/src/server.rs +++ b/src/server.rs @@ -25,6 +25,10 @@ const SUPERVISOR_MAX_RESTARTS: u32 = 5; /// Circuit breaker window: if SUPERVISOR_MAX_RESTARTS occur within this duration, fail-fast. const SUPERVISOR_WINDOW_SECS: u64 = 300; // 5 minutes +/// Timeout for DB operations in HTTP handlers. Must exceed SQLite busy_timeout (4s) +/// so that SQLite returns BUSY cleanly before the task gets force-cancelled. +const DB_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); + /// Spawn a supervised background task that restarts on panic/error with exponential backoff. /// Circuit breaker: if 5 restarts occur within 5 minutes, abort the process to prevent /// resource exhaustion on unrecoverable errors (e.g., SQLite corruption). 
@@ -83,8 +87,8 @@ pub struct AppState { pub verifying_key: ed25519_dalek::VerifyingKey, /// Zeroized on drop. Derived from the private key via HKDF. vault_key: Zeroizing<[u8; 32]>, - /// Only lock inside `spawn_blocking` — never hold across an `.await`. - pub db: std::sync::Mutex, + /// Connection pool — use `db_pool.get()` inside `spawn_blocking`. + pub db_pool: crate::db::DbPool, pub tls_active: bool, pub behind_proxy: bool, rate_limiter: std::sync::Mutex>, @@ -133,6 +137,42 @@ struct MagicLinkResponse { const MAX_BODY_SIZE: usize = 1024 * 1024; pub async fn run_server(credentials: Credentials) -> Result<()> { + // --- Startup checks --- + + // Fix 6: Warn if file descriptor limit is too low for a long-lived TLS server + #[cfg(unix)] + { + if let Ok(output) = std::process::Command::new("sh") + .args(["-c", "ulimit -n"]) + .output() + { + if let Ok(s) = std::str::from_utf8(&output.stdout) { + if let Ok(n) = s.trim().parse::() { + if n < 4096 { + tracing::warn!( + "RLIMIT_NOFILE is {n}, recommended minimum 4096 for production" + ); + } + } + } + } + } + + // Fix 8: Warn if log file is getting large (risk of disk-full on vault writes) + if let Ok(log_path) = config::log_path() { + if let Ok(metadata) = std::fs::metadata(&log_path) { + let size_mb = metadata.len() / (1024 * 1024); + if size_mb > 100 { + tracing::warn!( + "Log file is {size_mb}MB ({}), consider rotating", + log_path.display() + ); + } + } + } + + // --- State setup --- + let agent_json_path = config::agent_json_path()?; let agent_json_cached: bytes::Bytes = std::fs::read_to_string(&agent_json_path) .with_context(|| format!("Failed to read agent.json at {}", agent_json_path.display()))? 
@@ -144,7 +184,9 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { let vault_key = crate::crypto::vault::derive_vault_key(&sk_bytes)?; sk_bytes.iter_mut().for_each(|b| *b = 0); // belt-and-suspenders drop(sk_bytes); - let db_conn = crate::db::open()?; + + // Connection pool with 4 connections — WAL mode allows concurrent readers + let db_pool = crate::db::open_pool(4)?; let addr = SocketAddr::from(([0, 0, 0, 0], credentials.port)); let tls_mode = TlsMode::from_credentials(&credentials); @@ -157,7 +199,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { agent_json_cached, verifying_key, vault_key, - db: std::sync::Mutex::new(db_conn), + db_pool, tls_active, behind_proxy, rate_limiter: std::sync::Mutex::new(HashMap::with_capacity(256)), @@ -173,9 +215,13 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { tokio::time::sleep(std::time::Duration::from_secs(300)).await; let db_ref = st.clone(); let _ = tokio::task::spawn_blocking(move || { - let conn = db_ref.db.lock().unwrap_or_else(|e| e.into_inner()); - if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(PASSIVE);") { - tracing::warn!("WAL checkpoint failed: {e}"); + match db_ref.db_pool.get() { + Ok(conn) => { + if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(PASSIVE);") { + tracing::warn!("WAL checkpoint failed: {e}"); + } + } + Err(e) => tracing::warn!("WAL checkpoint: pool exhausted: {e}"), } }).await; } @@ -191,7 +237,13 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { tokio::time::sleep(std::time::Duration::from_secs(3600)).await; let db_ref = st.clone(); let _ = tokio::task::spawn_blocking(move || { - let conn = db_ref.db.lock().unwrap_or_else(|e| e.into_inner()); + let conn = match db_ref.db_pool.get() { + Ok(c) => c, + Err(e) => { + tracing::warn!("DB cleanup: pool exhausted: {e}"); + return; + } + }; let now = chrono::Utc::now().timestamp(); if let Err(e) = conn.execute("DELETE FROM magic_links WHERE expires_at <= 
?1", [now]) { tracing::warn!("Failed to clean expired magic links: {e}"); @@ -209,6 +261,8 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { tracing::warn!("Hourly WAL TRUNCATE checkpoint failed: {e}"); } + // Drop connection back to pool before locking rate limiter + drop(conn); // Evict stale rate limiter entries let mut map = db_ref.rate_limiter.lock().unwrap_or_else(|e| e.into_inner()); let now = chrono::Utc::now().timestamp(); @@ -312,10 +366,13 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { // Final WAL checkpoint before exit to ensure all data is merged let _ = tokio::task::spawn_blocking(move || { - if let Ok(conn) = shutdown_state.db.lock() { - if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { - tracing::warn!("Final WAL checkpoint failed: {e}"); + match shutdown_state.db_pool.get() { + Ok(conn) => { + if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { + tracing::warn!("Final WAL checkpoint failed: {e}"); + } } + Err(e) => tracing::warn!("Final WAL checkpoint skipped: {e}"), } }).await; @@ -390,22 +447,28 @@ async fn handle_deposit( .unwrap_or("") .to_string(); - // DB operations in spawn_blocking. + // DB operations in spawn_blocking with timeout to prevent unbounded task accumulation. // Access vault_key via the Arc reference inside the closure // to avoid copying the key out of its Zeroizing wrapper. 
let state_clone = state.clone(); let body_clone = body; - let deposit_result = tokio::task::spawn_blocking(move || { - let conn = state_clone.db.lock() - .map_err(|e| anyhow::anyhow!("DB mutex poisoned: {e}"))?; - crate::deposit::claim_nonce_with_conn(&payload, &conn)?; - crate::vault::vault_set_with_conn(&conn, &payload.label, &body_clone, state_clone.vault_key())?; - crate::deposit::log_deposit(&conn, &payload.label, &source_ip, &user_agent)?; - Ok::<_, anyhow::Error>(payload.label) - }).await; + let deposit_result = tokio::time::timeout( + DB_TIMEOUT, + tokio::task::spawn_blocking(move || { + let conn = state_clone.db_pool.get()?; + crate::deposit::claim_nonce_with_conn(&payload, &conn)?; + crate::vault::vault_set_with_conn(&conn, &payload.label, &body_clone, state_clone.vault_key())?; + crate::deposit::log_deposit(&conn, &payload.label, &source_ip, &user_agent)?; + Ok::<_, anyhow::Error>(payload.label) + }) + ).await; match deposit_result { - Ok(Ok(label)) => { + Err(_elapsed) => { + tracing::error!("Deposit handler timed out"); + StatusCode::NOT_FOUND.into_response() + } + Ok(Ok(Ok(label))) => { info!("Deposit received: '{label}'"); let resp = DepositResponse { status: "deposited", label }; match serde_json::to_string(&resp) { @@ -413,7 +476,7 @@ async fn handle_deposit( Err(_) => StatusCode::NOT_FOUND.into_response(), } } - Ok(Err(e)) => { + Ok(Ok(Err(e))) => { let msg = e.to_string(); if msg.contains("Nonce already used") { tracing::debug!("Deposit replay rejected"); @@ -422,7 +485,7 @@ async fn handle_deposit( } StatusCode::NOT_FOUND.into_response() } - Err(e) => { + Ok(Err(e)) => { tracing::error!("Deposit task panicked: {e}"); StatusCode::NOT_FOUND.into_response() } @@ -446,26 +509,32 @@ async fn handle_magic_link( let state_clone = state.clone(); let code_clone = code; - let result = tokio::task::spawn_blocking(move || { - let conn = state_clone.db.lock() - .map_err(|e| anyhow::anyhow!("DB mutex poisoned: {e}"))?; - Ok::<_, 
anyhow::Error>(crate::magic_link::claim_with_conn(&code_clone, &conn)) - }).await; + let result = tokio::time::timeout( + DB_TIMEOUT, + tokio::task::spawn_blocking(move || { + let conn = state_clone.db_pool.get()?; + Ok::<_, anyhow::Error>(crate::magic_link::claim_with_conn(&code_clone, &conn)) + }) + ).await; match result { - Ok(Ok(Some(_))) => { + Err(_elapsed) => { + tracing::error!("Magic link handler timed out"); + StatusCode::NOT_FOUND.into_response() + } + Ok(Ok(Ok(Some(_)))) => { let resp = MagicLinkResponse { status: "verified" }; match serde_json::to_string(&resp) { Ok(json) => (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], json).into_response(), Err(_) => StatusCode::NOT_FOUND.into_response(), } } - Ok(Ok(None)) => StatusCode::NOT_FOUND.into_response(), - Ok(Err(e)) => { + Ok(Ok(Ok(None))) => StatusCode::NOT_FOUND.into_response(), + Ok(Ok(Err(e))) => { tracing::error!("Magic link DB error: {e}"); StatusCode::NOT_FOUND.into_response() } - Err(e) => { + Ok(Err(e)) => { tracing::error!("Magic link task panicked: {e}"); StatusCode::NOT_FOUND.into_response() } @@ -473,16 +542,21 @@ async fn handle_magic_link( } async fn handle_health(State(state): State>) -> Response { - // Check DB is responsive - let db_ok = { - let st = state.clone(); - tokio::task::spawn_blocking(move || { - let conn = st.db.lock().unwrap_or_else(|e| e.into_inner()); - conn.execute_batch("SELECT 1").is_ok() + // Check DB is responsive (with timeout) + let db_ok = tokio::time::timeout( + std::time::Duration::from_secs(3), + tokio::task::spawn_blocking({ + let st = state.clone(); + move || match st.db_pool.get() { + Ok(conn) => conn.execute_batch("SELECT 1").is_ok(), + Err(_) => false, + } }) - .await - .unwrap_or(false) - }; + ) + .await + .ok() + .and_then(|r| r.ok()) + .unwrap_or(false); // Check agent.json is valid JSON let agent_ok = serde_json::from_slice::(&state.agent_json_cached).is_ok(); From e612ebd2fd904b5c591dc6c5dd722e44e06fa6f9 Mon Sep 17 00:00:00 2001 From: 
Hitesh Joshi Date: Tue, 31 Mar 2026 06:17:31 +0530 Subject: [PATCH 13/49] Update README with changelog for 1b65780 --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 6292a79..ffdfbd7 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,14 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi - WAL checkpointing split: PASSIVE every 5m (non-blocking), TRUNCATE hourly (reclaims disk) - `atomic stop`: kill -0 probe before SIGTERM to minimize PID reuse race window +**1b65780** — Connection pool, handler timeouts, credential zeroize, startup guards +- Replace `Mutex` with zero-dep channel-based `DbPool` (4 conns), unlocking WAL concurrent readers +- Wrap handler DB ops in `tokio::time::timeout(5s)` to prevent unbounded task accumulation +- Lower SQLite `busy_timeout` to 4s (below tokio 5s) for clean BUSY errors before task cancellation +- Zeroize credential JSON buffer in `save()` via `Zeroizing>` — private key material no longer lingers in freed heap +- Startup guard: warn if `RLIMIT_NOFILE < 4096` (prevents cryptic fd exhaustion under TLS load) +- Startup guard: warn if log file exceeds 100MB (prevents disk-full failures on vault atomic writes) + ## Roadmap - [x] Identity (agent.json + Ed25519) From 5fc281e34ad34a1a5a9dd29279298e42a8cdbde1 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:22:35 +0530 Subject: [PATCH 14/49] =?UTF-8?q?=CE=BB-RLM=20iter=207:=20AES-GCM=20zeroiz?= =?UTF-8?q?e,=20dynamic=20pool=20sizing,=20health=20check=20WAL=20monitor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 1 + Cargo.toml | 2 +- src/server.rs | 21 ++++++++++++++++----- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6e04180..322cf8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -35,6 +35,7 @@ dependencies = [ "ctr", "ghash", "subtle", + "zeroize", ] [[package]] diff --git 
a/Cargo.toml b/Cargo.toml index a2b1c38..908adf2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,7 +30,7 @@ serde_json = "1" ed25519-dalek = { version = "2", features = ["rand_core"] } # Crypto - AES-256-GCM -aes-gcm = "0.10" +aes-gcm = { version = "0.10", features = ["zeroize"] } # Key derivation hkdf = "0.12" diff --git a/src/server.rs b/src/server.rs index 2cccd63..314c291 100644 --- a/src/server.rs +++ b/src/server.rs @@ -185,8 +185,11 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { sk_bytes.iter_mut().for_each(|b| *b = 0); // belt-and-suspenders drop(sk_bytes); - // Connection pool with 4 connections — WAL mode allows concurrent readers - let db_pool = crate::db::open_pool(4)?; + // Size pool to available parallelism (capped 2..8) — WAL mode allows concurrent readers + let pool_size = std::thread::available_parallelism() + .map(|n| n.get().clamp(2, 8)) + .unwrap_or(4); + let db_pool = crate::db::open_pool(pool_size)?; let addr = SocketAddr::from(([0, 0, 0, 0], credentials.port)); let tls_mode = TlsMode::from_credentials(&credentials); @@ -561,12 +564,20 @@ async fn handle_health(State(state): State>) -> Response { // Check agent.json is valid JSON let agent_ok = serde_json::from_slice::(&state.agent_json_cached).is_ok(); - if db_ok && agent_ok { + // Check WAL size is under control (< 50MB) + let wal_ok = crate::config::atomic_dir() + .map(|d| d.join("atomic.db-wal")) + .ok() + .and_then(|p| std::fs::metadata(&p).ok()) + .map(|m| m.len() < 50 * 1024 * 1024) + .unwrap_or(true); // WAL not existing is fine + + if db_ok && agent_ok && wal_ok { (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], r#"{"status":"ok"}"#).into_response() } else { let detail = format!( - r#"{{"status":"degraded","db":{},"agent_json":{}}}"#, - db_ok, agent_ok + r#"{{"status":"degraded","db":{},"agent_json":{},"wal_size_ok":{}}}"#, + db_ok, agent_ok, wal_ok ); (StatusCode::SERVICE_UNAVAILABLE, [(header::CONTENT_TYPE, "application/json")], 
detail).into_response() } From c67614751e05f5f646eccec1057f46d95f8fa6a9 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:22:57 +0530 Subject: [PATCH 15/49] Update README with changelog for 5fc281e --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index ffdfbd7..c3fabe6 100644 --- a/README.md +++ b/README.md @@ -250,6 +250,11 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi - Startup guard: warn if `RLIMIT_NOFILE < 4096` (prevents cryptic fd exhaustion under TLS load) - Startup guard: warn if log file exceeds 100MB (prevents disk-full failures on vault atomic writes) +**5fc281e** — AES-GCM zeroize, dynamic pool sizing, health check WAL monitor +- Enable `zeroize` feature on `aes-gcm`: AES key schedule is now wiped from memory on cipher drop +- DB connection pool sized dynamically via `available_parallelism()` (clamped 2..8) instead of hardcoded 4 +- Health endpoint (`/_/health`) now reports WAL file size, returns degraded if WAL exceeds 50MB + ## Roadmap - [x] Identity (agent.json + Ed25519) From 1ba345e7e933be12c2e73681f1381a745452c6cc Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:30:11 +0530 Subject: [PATCH 16/49] =?UTF-8?q?=CE=BB-RLM=20iter=208:=20DashMap=20rate?= =?UTF-8?q?=20limiter,=20flock=20PID=20locking,=20jemalloc=20opt-in,=20con?= =?UTF-8?q?nection=20leak=20detection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace Mutex rate limiter with DashMap (sharded locks, no global contention under high concurrency) - PID file locking via flock prevents double-start races; lock auto-released on crash by kernel - Optional jemalloc allocator (--features jemalloc) for long-lived Linux deployments to prevent RSS bloat - PooledConn tracks hold time, warns on connections held >30s (leak detection) - Panic hook cleans up PID file on fatal panic - PRAGMA optimize added to hourly cleanup for SQLite 
query planner stats - Rate limiter eviction moved from hot path to background cleanup (DashMap len() check replaces per-request retain) --- Cargo.lock | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 6 +++ src/config.rs | 25 +++++++++++ src/db.rs | 8 ++++ src/main.rs | 13 ++++++ src/server.rs | 29 +++++++------ 6 files changed, 185 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 322cf8b..d9121ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -133,8 +133,10 @@ dependencies = [ "bytes", "chrono", "clap", + "dashmap", "dirs", "ed25519-dalek", + "fs2", "hex", "hkdf", "rand 0.8.5", @@ -144,6 +146,7 @@ dependencies = [ "serde_json", "sha2", "thiserror", + "tikv-jemallocator", "tokio", "tower-http", "tracing", @@ -422,6 +425,12 @@ dependencies = [ "libc", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crypto-common" version = "0.1.7" @@ -469,6 +478,20 @@ dependencies = [ "syn", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "der" version = "0.7.10" @@ -630,6 +653,16 @@ dependencies = [ "tokio", ] +[[package]] +name = "fs2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "fs_extra" version = "1.3.0" @@ -741,6 +774,12 @@ dependencies = [ "tracing", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "hashbrown" version = "0.15.5" @@ -1134,6 +1173,15 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + [[package]] name = "log" version = "0.4.29" @@ -1226,6 +1274,19 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + [[package]] name = "percent-encoding" version = "2.3.2" @@ -1428,6 +1489,15 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + [[package]] name = "redox_users" version = "0.5.2" @@ -1586,6 +1656,12 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "semver" version = "1.0.27" @@ -1813,6 +1889,26 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "tikv-jemalloc-sys" 
+version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + [[package]] name = "tinystr" version = "0.8.2" @@ -2184,6 +2280,28 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.62.2" diff --git a/Cargo.toml b/Cargo.toml index 908adf2..5a12148 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,9 @@ repository = "https://github.com/ploton/atomic" name = "atomic" path = "src/main.rs" +[features] +jemalloc = ["dep:tikv-jemallocator"] + [dependencies] # CLI clap = { version = "4", features = ["derive"] } @@ -62,6 +65,9 @@ rusqlite = { version = "0.34", features = ["bundled"] } bytes = "1" dirs = "6" zeroize = { version = "1.8.2", features = ["derive"] } +dashmap = "6" +fs2 = "0.4" +tikv-jemallocator = { version = "0.6", optional = true } [profile.release] opt-level = 3 diff 
--git a/src/config.rs b/src/config.rs index e8cd41a..ac3dfcf 100644 --- a/src/config.rs +++ b/src/config.rs @@ -68,6 +68,31 @@ pub fn log_path() -> Result { Ok(atomic_dir()?.join("atomic.log")) } +/// Acquire an exclusive flock on the PID file to prevent double-start races. +/// Returns the open File handle — caller must keep it alive for the duration of the process. +/// The lock is automatically released by the kernel when the process exits (even on crash). +pub fn acquire_pid_lock(path: &Path) -> Result { + use fs2::FileExt; + use std::io::Write; + + let file = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(false) + .open(path) + .with_context(|| format!("Failed to open PID file at {}", path.display()))?; + + file.try_lock_exclusive() + .context("Another atomic process is already running (PID file locked)")?; + + // Write PID after lock acquired + let mut f = &file; + file.set_len(0)?; + f.write_all(std::process::id().to_string().as_bytes())?; + file.sync_all()?; + Ok(file) +} + pub fn ensure_atomic_dir() -> Result { let dir = atomic_dir()?; if !dir.exists() { diff --git a/src/db.rs b/src/db.rs index d3abc93..914593c 100644 --- a/src/db.rs +++ b/src/db.rs @@ -2,6 +2,7 @@ use anyhow::{Context, Result}; use rusqlite::Connection; use std::path::Path; use std::sync::mpsc; +use std::time::{Duration, Instant}; use crate::config; @@ -15,9 +16,11 @@ pub struct DbPool { } /// RAII guard that returns the connection to the pool on drop. +/// Tracks hold time to detect potential connection leaks. 
pub struct PooledConn<'a> { pool: &'a DbPool, conn: Option, + acquired_at: Instant, } impl<'a> std::ops::Deref for PooledConn<'a> { @@ -30,6 +33,10 @@ impl<'a> std::ops::Deref for PooledConn<'a> { impl Drop for PooledConn<'_> { fn drop(&mut self) { if let Some(c) = self.conn.take() { + let held = self.acquired_at.elapsed(); + if held > Duration::from_secs(30) { + tracing::warn!("SQLite connection held for {:?}, possible leak", held); + } let _ = self.pool.sender.send(c); } } @@ -45,6 +52,7 @@ impl DbPool { Ok(PooledConn { pool: self, conn: Some(conn), + acquired_at: Instant::now(), }) } } diff --git a/src/main.rs b/src/main.rs index 105058c..43c5ce9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,6 +12,10 @@ mod sign; mod tls; mod vault; +#[cfg(feature = "jemalloc")] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + use anyhow::{Context, Result}; use clap::Parser; use tracing_subscriber::EnvFilter; @@ -24,6 +28,15 @@ async fn main() -> Result<()> { .with_env_filter(EnvFilter::from_default_env().add_directive("atomic=info".parse()?)) .init(); + // Clean up PID file on panic (best-effort). With panic=abort in release, + // the hook still runs before the process terminates. 
+ std::panic::set_hook(Box::new(|info| { + eprintln!("atomic: fatal panic: {info}"); + if let Ok(path) = config::pid_path() { + let _ = std::fs::remove_file(path); + } + })); + let cli = Cli::parse(); match cli.command { diff --git a/src/server.rs b/src/server.rs index 314c291..3167d12 100644 --- a/src/server.rs +++ b/src/server.rs @@ -7,8 +7,8 @@ use axum::{ routing::{get, post}, Router, }; +use dashmap::DashMap; use serde::Serialize; -use std::collections::HashMap; use std::future::Future; use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; @@ -91,7 +91,8 @@ pub struct AppState { pub db_pool: crate::db::DbPool, pub tls_active: bool, pub behind_proxy: bool, - rate_limiter: std::sync::Mutex>, + /// Sharded concurrent map — no global mutex contention under high concurrency. + rate_limiter: DashMap, } impl AppState { @@ -100,16 +101,14 @@ impl AppState { } /// Returns true if the request is within rate limits. + /// Uses DashMap (sharded locks) so concurrent requests don't serialize on a global mutex. 
pub fn check_rate_limit(&self, ip: IpAddr) -> bool { let now = chrono::Utc::now().timestamp(); - let mut map = self.rate_limiter.lock().unwrap_or_else(|e| e.into_inner()); - // Always evict stale entries to guarantee bounded memory regardless of attacker volume - map.retain(|_, (_, window_start)| now - *window_start <= RATE_LIMIT_WINDOW_SECS); - // Hard cap as defense-in-depth against clock skew or other edge cases - if map.len() >= RATE_LIMIT_MAX_ENTRIES { + // Hard cap as defense-in-depth (approximate len is fine for rate limiting) + if self.rate_limiter.len() >= RATE_LIMIT_MAX_ENTRIES { return false; } - let entry = map.entry(ip).or_insert((0, now)); + let mut entry = self.rate_limiter.entry(ip).or_insert((0, now)); if now - entry.1 > RATE_LIMIT_WINDOW_SECS { *entry = (1, now); true @@ -205,7 +204,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { db_pool, tls_active, behind_proxy, - rate_limiter: std::sync::Mutex::new(HashMap::with_capacity(256)), + rate_limiter: DashMap::with_capacity(256), }); // Background task: WAL checkpoint every 5 minutes (PASSIVE to avoid blocking writers; @@ -264,12 +263,13 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { tracing::warn!("Hourly WAL TRUNCATE checkpoint failed: {e}"); } - // Drop connection back to pool before locking rate limiter + // Let SQLite update its query planner statistics + let _ = conn.execute_batch("PRAGMA optimize;"); + // Drop connection back to pool before touching rate limiter drop(conn); - // Evict stale rate limiter entries - let mut map = db_ref.rate_limiter.lock().unwrap_or_else(|e| e.into_inner()); + // Evict stale rate limiter entries (DashMap::retain is lock-free per shard) let now = chrono::Utc::now().timestamp(); - map.retain(|_, (_, window_start)| now - *window_start <= RATE_LIMIT_WINDOW_SECS); + db_ref.rate_limiter.retain(|_, (_, window_start)| now - *window_start <= 
RATE_LIMIT_WINDOW_SECS); }).await; } } @@ -303,7 +303,8 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { .with_state(state); let pid_path = config::pid_path()?; - config::write_secure(&pid_path, std::process::id().to_string().as_bytes())?; + // Acquire flock — prevents double-start races. Lock released on process exit (even crash). + let _pid_lock = config::acquire_pid_lock(&pid_path)?; match tls_mode { TlsMode::None => { From 6b451bf33cb6e0a30bd4d2294e537a10d6524122 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:30:21 +0530 Subject: [PATCH 17/49] Update README with changelog for 1ba345e --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index c3fabe6..64ec749 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,15 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi ## Changelog +**1ba345e** — DashMap rate limiter, flock PID locking, jemalloc opt-in, connection leak detection +- Replace `Mutex` rate limiter with `DashMap` (sharded locks, no global contention under high concurrency) +- PID file locking via `flock` prevents double-start races; lock auto-released on crash by kernel +- Optional jemalloc allocator (`--features jemalloc`) for long-lived Linux deployments to prevent RSS bloat +- `PooledConn` tracks hold time, warns on connections held >30s (leak detection) +- Panic hook cleans up PID file on fatal panic +- `PRAGMA optimize` added to hourly cleanup for SQLite query planner stats +- Rate limiter eviction moved from hot path to background cleanup (`DashMap::len()` check replaces per-request `retain`) + **a609534** — Exponential backoff for supervised tasks, deposit_log retention - `spawn_supervised` uses exponential backoff (5s → 10s → ... → 320s cap) instead of fixed 5s delay to prevent spin loops on persistent failures (e.g. 
disk full) - Cleanup task purges `deposit_log` entries older than 90 days to prevent unbounded disk growth on long-lived servers From 7b45baf43d214667276f9a564d7699f4466946d3 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:36:15 +0530 Subject: [PATCH 18/49] =?UTF-8?q?=CE=BB-RLM=20iter=209:=20replace=20exit(1?= =?UTF-8?q?)=20circuit=20breaker=20with=20max=20backoff,=20monotonic=20rat?= =?UTF-8?q?e=20limiter,=20bounded=20sync=5Fchannel=20pool?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/db.rs | 8 ++++---- src/server.rs | 56 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/src/db.rs b/src/db.rs index 914593c..cf7becb 100644 --- a/src/db.rs +++ b/src/db.rs @@ -7,11 +7,11 @@ use std::time::{Duration, Instant}; use crate::config; /// Zero-dependency connection pool for SQLite. -/// Uses a bounded channel to distribute pre-opened connections. +/// Uses a bounded sync_channel to distribute pre-opened connections. /// WAL mode allows concurrent readers; the pool prevents serialization /// behind a single Mutex. 
pub struct DbPool { - sender: mpsc::Sender, + sender: mpsc::SyncSender, receiver: std::sync::Mutex>, } @@ -37,7 +37,7 @@ impl Drop for PooledConn<'_> { if held > Duration::from_secs(30) { tracing::warn!("SQLite connection held for {:?}, possible leak", held); } - let _ = self.pool.sender.send(c); + let _ = self.pool.sender.try_send(c); } } } @@ -99,7 +99,7 @@ pub fn open_pool(size: usize) -> Result { let first = open_connection(&db_path)?; migrate(&first)?; - let (tx, rx) = mpsc::channel(); + let (tx, rx) = mpsc::sync_channel(size); tx.send(first).expect("channel just created"); for _ in 1..size { diff --git a/src/server.rs b/src/server.rs index 3167d12..fd145be 100644 --- a/src/server.rs +++ b/src/server.rs @@ -30,8 +30,9 @@ const SUPERVISOR_WINDOW_SECS: u64 = 300; // 5 minutes const DB_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); /// Spawn a supervised background task that restarts on panic/error with exponential backoff. -/// Circuit breaker: if 5 restarts occur within 5 minutes, abort the process to prevent -/// resource exhaustion on unrecoverable errors (e.g., SQLite corruption). +/// Circuit breaker: if 5 restarts occur within 5 minutes, enter max backoff (320s) instead +/// of killing the process — process::exit skips destructors, preventing Zeroizing from +/// wiping vault keys and the final WAL checkpoint from running. 
fn spawn_supervised(name: &'static str, make_task: F) where F: Fn() -> Fut + Send + 'static, @@ -41,6 +42,7 @@ where let mut retries: u32 = 0; let mut window_start = std::time::Instant::now(); let mut window_restarts: u32 = 0; + let mut circuit_open = false; loop { let result = tokio::spawn(make_task()).await; match result { @@ -51,16 +53,25 @@ where if now.duration_since(window_start).as_secs() > SUPERVISOR_WINDOW_SECS { window_start = now; window_restarts = 0; + if circuit_open { + tracing::info!("{name}: circuit breaker reset after quiet period"); + circuit_open = false; + retries = 0; + } } window_restarts += 1; - if window_restarts >= SUPERVISOR_MAX_RESTARTS { + if window_restarts >= SUPERVISOR_MAX_RESTARTS && !circuit_open { tracing::error!( - "{name} task failed {SUPERVISOR_MAX_RESTARTS} times in {SUPERVISOR_WINDOW_SECS}s — aborting process" + "{name}: circuit breaker OPEN — {SUPERVISOR_MAX_RESTARTS} failures in {SUPERVISOR_WINDOW_SECS}s, entering max backoff" ); - std::process::exit(1); + circuit_open = true; } - let delay_secs = 5_u64.saturating_mul(1u64 << retries.min(6)); // 5s, 10s, 20s, ..., 320s (capped) + let delay_secs = if circuit_open { + 320 // Max backoff while circuit is open + } else { + 5_u64.saturating_mul(1u64 << retries.min(6)) // 5s, 10s, 20s, ..., 320s + }; tracing::error!("{name} task panicked: {e}. Restarting in {delay_secs}s ({window_restarts}/{SUPERVISOR_MAX_RESTARTS} in window)..."); tokio::time::sleep(std::time::Duration::from_secs(delay_secs)).await; retries = retries.saturating_add(1); @@ -70,7 +81,7 @@ where }); } -const RATE_LIMIT_WINDOW_SECS: i64 = 60; +const RATE_LIMIT_WINDOW_SECS: u64 = 60; const RATE_LIMIT_MAX_REQUESTS: u32 = 10; const RATE_LIMIT_MAX_ENTRIES: usize = 10_000; const MAX_INPUT_LEN: usize = 256; @@ -92,7 +103,8 @@ pub struct AppState { pub tls_active: bool, pub behind_proxy: bool, /// Sharded concurrent map — no global mutex contention under high concurrency. 
- rate_limiter: DashMap, + /// Uses monotonic Instant (not wall clock) to prevent clock-skew manipulation. + rate_limiter: DashMap, } impl AppState { @@ -102,14 +114,16 @@ impl AppState { /// Returns true if the request is within rate limits. /// Uses DashMap (sharded locks) so concurrent requests don't serialize on a global mutex. + /// Monotonic Instant prevents clock-skew attacks from resetting windows. pub fn check_rate_limit(&self, ip: IpAddr) -> bool { - let now = chrono::Utc::now().timestamp(); + let now = std::time::Instant::now(); + let window = std::time::Duration::from_secs(RATE_LIMIT_WINDOW_SECS); // Hard cap as defense-in-depth (approximate len is fine for rate limiting) if self.rate_limiter.len() >= RATE_LIMIT_MAX_ENTRIES { return false; } let mut entry = self.rate_limiter.entry(ip).or_insert((0, now)); - if now - entry.1 > RATE_LIMIT_WINDOW_SECS { + if now.duration_since(entry.1) > window { *entry = (1, now); true } else if entry.0 >= RATE_LIMIT_MAX_REQUESTS { @@ -230,7 +244,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { } }); - // Background task: clean expired magic links, old deposit nonces, and stale rate limiter entries + // Background task: clean expired magic links, old deposit nonces, and rate limiter entries let cleanup_state = state.clone(); spawn_supervised("db-cleanup", move || { let st = cleanup_state.clone(); @@ -265,16 +279,26 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { } // Let SQLite update its query planner statistics let _ = conn.execute_batch("PRAGMA optimize;"); - // Drop connection back to pool before touching rate limiter - drop(conn); - // Evict stale rate limiter entries (DashMap::retain is lock-free per shard) - let now = chrono::Utc::now().timestamp(); - db_ref.rate_limiter.retain(|_, (_, window_start)| now - *window_start <= RATE_LIMIT_WINDOW_SECS); }).await; } } }); + // Background task: evict stale rate limiter entries every 5 minutes. 
+ // More aggressive than hourly to bound DashMap memory under sustained attack. + let rl_state = state.clone(); + spawn_supervised("rate-limiter-evict", move || { + let st = rl_state.clone(); + async move { + loop { + tokio::time::sleep(std::time::Duration::from_secs(300)).await; + let cutoff = std::time::Duration::from_secs(RATE_LIMIT_WINDOW_SECS); + let now = std::time::Instant::now(); + st.rate_limiter.retain(|_, (_, window_start)| now.duration_since(*window_start) <= cutoff); + } + } + }); + // CORS only on the public agent.json endpoint; deposit and magic link // endpoints are called by servers, not browsers, and don't need CORS. let cors = CorsLayer::new() From 6c81242fe95ce3e1c496a8936d4f9b9f115d2d2c Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:36:36 +0530 Subject: [PATCH 19/49] Update README with changelog for 7b45baf --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 64ec749..8b640af 100644 --- a/README.md +++ b/README.md @@ -259,6 +259,12 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi - Startup guard: warn if `RLIMIT_NOFILE < 4096` (prevents cryptic fd exhaustion under TLS load) - Startup guard: warn if log file exceeds 100MB (prevents disk-full failures on vault atomic writes) +**7b45baf** — Replace exit(1) circuit breaker with max backoff, monotonic rate limiter, bounded pool +- Circuit breaker no longer calls `process::exit(1)` — enters 320s max backoff instead, so destructors run (Zeroizing wipes vault keys, final WAL checkpoint completes) +- Rate limiter uses monotonic `Instant` instead of wall-clock `i64` timestamps, preventing clock-skew manipulation of rate windows +- Rate limiter eviction split into dedicated 5-minute task (was hourly), bounding DashMap memory under sustained attack +- Connection pool uses bounded `sync_channel(size)` instead of unbounded `channel()` for explicit capacity enforcement + **5fc281e** — AES-GCM zeroize, 
dynamic pool sizing, health check WAL monitor - Enable `zeroize` feature on `aes-gcm`: AES key schedule is now wiped from memory on cipher drop - DB connection pool sized dynamically via `available_parallelism()` (clamped 2..8) instead of hardcoded 4 From e3eb87e6b78e55157e55828b8690e6577453ba3c Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:39:58 +0530 Subject: [PATCH 20/49] =?UTF-8?q?=CE=BB-RLM=20iter=2010:=20hard=20conn=20l?= =?UTF-8?q?ifetime,=20WAL=20checkpoint=20timeout,=20vault=20label=20valida?= =?UTF-8?q?tion,=20jemalloc=20default?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.toml | 1 + src/db.rs | 6 ++++++ src/server.rs | 25 ++++++++++++++++--------- src/vault.rs | 11 +++++++++-- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5a12148..ef95a84 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ name = "atomic" path = "src/main.rs" [features] +default = ["jemalloc"] jemalloc = ["dep:tikv-jemallocator"] [dependencies] diff --git a/src/db.rs b/src/db.rs index cf7becb..521a4e8 100644 --- a/src/db.rs +++ b/src/db.rs @@ -34,6 +34,12 @@ impl Drop for PooledConn<'_> { fn drop(&mut self) { if let Some(c) = self.conn.take() { let held = self.acquired_at.elapsed(); + if held > Duration::from_secs(60) { + // Force-close stale connections rather than returning to pool. + // Prevents connection leaks from panicked threads or stuck queries. 
+ tracing::warn!("SQLite connection held for {:?}, dropping instead of returning to pool", held); + return; + } if held > Duration::from_secs(30) { tracing::warn!("SQLite connection held for {:?}, possible leak", held); } diff --git a/src/server.rs b/src/server.rs index fd145be..b81a0d6 100644 --- a/src/server.rs +++ b/src/server.rs @@ -392,17 +392,24 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { } } - // Final WAL checkpoint before exit to ensure all data is merged - let _ = tokio::task::spawn_blocking(move || { - match shutdown_state.db_pool.get() { - Ok(conn) => { - if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { - tracing::warn!("Final WAL checkpoint failed: {e}"); + // Final WAL checkpoint before exit to ensure all data is merged. + // Timeout prevents indefinite hang if the DB is stuck. + let checkpoint_result = tokio::time::timeout( + std::time::Duration::from_secs(5), + tokio::task::spawn_blocking(move || { + match shutdown_state.db_pool.get() { + Ok(conn) => { + if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { + tracing::warn!("Final WAL checkpoint failed: {e}"); + } } + Err(e) => tracing::warn!("Final WAL checkpoint skipped: {e}"), } - Err(e) => tracing::warn!("Final WAL checkpoint skipped: {e}"), - } - }).await; + }) + ).await; + if checkpoint_result.is_err() { + tracing::error!("Final WAL checkpoint timed out (5s) — data is consistent but may remain in WAL file"); + } let _ = std::fs::remove_file(&pid_path); info!("Server stopped"); diff --git a/src/vault.rs b/src/vault.rs index 4c7927a..924e05e 100644 --- a/src/vault.rs +++ b/src/vault.rs @@ -74,9 +74,16 @@ pub fn vault_count() -> Result { Ok(count as usize) } +/// Reject labels with non-printable characters or excessive length. 
+fn is_valid_label(s: &str) -> bool { + !s.is_empty() + && s.len() <= 256 + && s.bytes().all(|b| b >= 0x20 && b != 0x7F) +} + pub fn cmd_set(label: &str, value: &str, vault_key: &[u8; 32]) -> Result<()> { - if label.is_empty() || label.len() > 256 { - bail!("Label must be non-empty and at most 256 characters"); + if !is_valid_label(label) { + bail!("Label must be non-empty, at most 256 printable characters"); } vault_set(label, value, vault_key)?; println!("Stored '{label}'"); From 7f6806ea9341877bc6064d9e28e046750c17042c Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:40:22 +0530 Subject: [PATCH 21/49] Update README with changelog for e3eb87e --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 8b640af..25c566c 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,12 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi ## Changelog +**e3eb87e** — Hard conn lifetime, WAL checkpoint timeout, vault label validation, jemalloc default +- `PooledConn::drop` force-closes connections held >60s instead of returning to pool (prevents leaks from panicked threads or stuck queries) +- Final WAL checkpoint wrapped in 5s `tokio::time::timeout` to prevent indefinite hang on shutdown if DB is stuck +- `vault::cmd_set` validates labels with printable ASCII check (defense-in-depth, matching server-side `is_valid_input`) +- jemalloc is now the default feature (`cargo build` enables it; `--no-default-features` to opt out) + **1ba345e** — DashMap rate limiter, flock PID locking, jemalloc opt-in, connection leak detection - Replace `Mutex` rate limiter with `DashMap` (sharded locks, no global contention under high concurrency) - PID file locking via `flock` prevents double-start races; lock auto-released on crash by kernel From 756eadbdcf2ad572a01d8696a26408a2e09bd3d1 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 06:59:48 +0530 Subject: [PATCH 22/49] 
=?UTF-8?q?=CE=BB-RLM=20iter=2011:=20cooperative=20c?= =?UTF-8?q?onn=20interrupt,=20prepared=20statement=20cache,=20request=20ti?= =?UTF-8?q?meout,=20global=20magic=20link=20rate=20limit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - db.rs: call interrupt() before force-closing stale SQLite connections to allow clean WAL rollback instead of undefined state from aborted transactions - deposit.rs, magic_link.rs, vault.rs: switch hot-path DB queries to prepare_cached() to eliminate repeated SQL parse overhead under load - server.rs: add 30s global request timeout middleware as defense-in-depth - server.rs: make pool size configurable via ATOMIC_POOL_SIZE env var (1-64) - server.rs: improve rate limiter eviction — when DashMap is full, evict one expired entry inline instead of blanket-denying all new IPs - server.rs: add global per-second rate limit (20/s) on magic link claims to prevent distributed brute-force across many source IPs --- src/db.rs | 8 +++--- src/deposit.rs | 6 +++-- src/magic_link.rs | 6 ++--- src/server.rs | 62 ++++++++++++++++++++++++++++++++++++++++++----- src/vault.rs | 6 ++--- 5 files changed, 70 insertions(+), 18 deletions(-) diff --git a/src/db.rs b/src/db.rs index 521a4e8..41543fa 100644 --- a/src/db.rs +++ b/src/db.rs @@ -35,9 +35,11 @@ impl Drop for PooledConn<'_> { if let Some(c) = self.conn.take() { let held = self.acquired_at.elapsed(); if held > Duration::from_secs(60) { - // Force-close stale connections rather than returning to pool. - // Prevents connection leaks from panicked threads or stuck queries. - tracing::warn!("SQLite connection held for {:?}, dropping instead of returning to pool", held); + // Cooperatively interrupt any in-flight query before closing. + // This lets SQLite roll back cleanly instead of leaving the WAL + // in an undefined state from an aborted transaction. 
+ c.get_interrupt_handle().interrupt(); + tracing::warn!("SQLite connection held for {:?}, interrupted and dropping", held); return; } if held > Duration::from_secs(30) { diff --git a/src/deposit.rs b/src/deposit.rs index fa06f69..47ff36d 100644 --- a/src/deposit.rs +++ b/src/deposit.rs @@ -89,8 +89,9 @@ pub fn claim_nonce_with_conn( conn: &rusqlite::Connection, ) -> Result<()> { let now = chrono::Utc::now().timestamp(); - let inserted = conn.execute( + let inserted = conn.prepare_cached( "INSERT OR IGNORE INTO used_deposits (nonce, label, used_at) VALUES (?1, ?2, ?3)", + )?.execute( rusqlite::params![payload.nonce, payload.label, now], )?; @@ -109,8 +110,9 @@ pub fn log_deposit( user_agent: &str, ) -> Result<()> { let now = chrono::Utc::now().timestamp(); - conn.execute( + conn.prepare_cached( "INSERT INTO deposit_log (label, source_ip, user_agent, deposited_at) VALUES (?1, ?2, ?3, ?4)", + )?.execute( rusqlite::params![label, source_ip, user_agent, now], )?; Ok(()) diff --git a/src/magic_link.rs b/src/magic_link.rs index 4a13603..8646634 100644 --- a/src/magic_link.rs +++ b/src/magic_link.rs @@ -72,10 +72,8 @@ pub fn claim_with_conn(code: &str, conn: &rusqlite::Connection) -> Option ?2", - rusqlite::params![code_hash, now], - ) + .prepare_cached("DELETE FROM magic_links WHERE code_hash = ?1 AND expires_at > ?2") + .and_then(|mut stmt| stmt.execute(rusqlite::params![code_hash, now])) .ok()?; if deleted > 0 { diff --git a/src/server.rs b/src/server.rs index b81a0d6..135e1b7 100644 --- a/src/server.rs +++ b/src/server.rs @@ -84,6 +84,7 @@ where const RATE_LIMIT_WINDOW_SECS: u64 = 60; const RATE_LIMIT_MAX_REQUESTS: u32 = 10; const RATE_LIMIT_MAX_ENTRIES: usize = 10_000; +const MAGIC_LINK_GLOBAL_MAX_PER_SEC: u32 = 20; const MAX_INPUT_LEN: usize = 256; /// Reject inputs with non-printable characters or excessive length. @@ -105,6 +106,10 @@ pub struct AppState { /// Sharded concurrent map — no global mutex contention under high concurrency. 
/// Uses monotonic Instant (not wall clock) to prevent clock-skew manipulation. rate_limiter: DashMap, + /// Global rate limit for magic link claims (epoch second of current window). + magic_link_window: std::sync::atomic::AtomicI64, + /// Count of magic link claims in the current 1-second window. + magic_link_count: std::sync::atomic::AtomicU32, } impl AppState { @@ -118,9 +123,16 @@ impl AppState { pub fn check_rate_limit(&self, ip: IpAddr) -> bool { let now = std::time::Instant::now(); let window = std::time::Duration::from_secs(RATE_LIMIT_WINDOW_SECS); - // Hard cap as defense-in-depth (approximate len is fine for rate limiting) + // Hard cap: if full, try to evict one expired entry before rejecting. + // Prevents attackers from permanently blocking legitimate IPs by filling the map. if self.rate_limiter.len() >= RATE_LIMIT_MAX_ENTRIES { - return false; + let stale_key = self.rate_limiter.iter() + .find(|entry| now.duration_since(entry.value().1) > window) + .map(|entry| *entry.key()); + match stale_key { + Some(key) => { self.rate_limiter.remove(&key); } + None => return false, + } } let mut entry = self.rate_limiter.entry(ip).or_insert((0, now)); if now.duration_since(entry.1) > window { @@ -133,6 +145,20 @@ impl AppState { true } } + + /// Global rate limit for magic link claim attempts (across all IPs). + /// Prevents distributed brute-force even if each IP stays under per-IP limits. 
+ fn check_magic_link_global_rate(&self) -> bool { + let now = chrono::Utc::now().timestamp(); + let window = self.magic_link_window.load(std::sync::atomic::Ordering::Relaxed); + if window != now { + self.magic_link_window.store(now, std::sync::atomic::Ordering::Relaxed); + self.magic_link_count.store(1, std::sync::atomic::Ordering::Relaxed); + return true; + } + let count = self.magic_link_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + count < MAGIC_LINK_GLOBAL_MAX_PER_SEC + } } #[derive(Serialize)] @@ -198,10 +224,16 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { sk_bytes.iter_mut().for_each(|b| *b = 0); // belt-and-suspenders drop(sk_bytes); - // Size pool to available parallelism (capped 2..8) — WAL mode allows concurrent readers - let pool_size = std::thread::available_parallelism() - .map(|n| n.get().clamp(2, 8)) - .unwrap_or(4); + // Pool size: env override or auto-detect from available parallelism (capped 2..8) + let pool_size = std::env::var("ATOMIC_POOL_SIZE") + .ok() + .and_then(|s| s.parse::().ok()) + .filter(|&n| (1..=64).contains(&n)) + .unwrap_or_else(|| { + std::thread::available_parallelism() + .map(|n| n.get().clamp(2, 8)) + .unwrap_or(4) + }); let db_pool = crate::db::open_pool(pool_size)?; let addr = SocketAddr::from(([0, 0, 0, 0], credentials.port)); @@ -219,6 +251,8 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { tls_active, behind_proxy, rate_limiter: DashMap::with_capacity(256), + magic_link_window: std::sync::atomic::AtomicI64::new(0), + magic_link_count: std::sync::atomic::AtomicU32::new(0), }); // Background task: WAL checkpoint every 5 minutes (PASSIVE to avoid blocking writers; @@ -319,6 +353,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { .route("/m/{code}", get(handle_magic_link)) .route("/_/health", get(handle_health)) .fallback(handle_404) + .layer(middleware::from_fn(request_timeout)) .layer(DefaultBodyLimit::max(MAX_BODY_SIZE)) 
.layer(middleware::from_fn_with_state( state.clone(), @@ -536,6 +571,10 @@ async fn handle_magic_link( if !state.check_rate_limit(addr.ip()) { return StatusCode::TOO_MANY_REQUESTS.into_response(); } + // Global rate limit: cap total claim attempts across all IPs + if !state.check_magic_link_global_rate() { + return StatusCode::TOO_MANY_REQUESTS.into_response(); + } // Reject obviously short codes or codes with non-printable chars before touching the DB if code.len() < 20 || !is_valid_input(&code) { @@ -619,6 +658,17 @@ async fn handle_404() -> StatusCode { StatusCode::NOT_FOUND } +/// Global request timeout (30s) — defense-in-depth against slow clients or stuck handlers. +async fn request_timeout( + req: axum::http::Request, + next: Next, +) -> Response { + match tokio::time::timeout(std::time::Duration::from_secs(30), next.run(req)).await { + Ok(resp) => resp, + Err(_) => StatusCode::REQUEST_TIMEOUT.into_response(), + } +} + async fn security_headers( State(state): State>, req: axum::http::Request, diff --git a/src/vault.rs b/src/vault.rs index 924e05e..fa7d23a 100644 --- a/src/vault.rs +++ b/src/vault.rs @@ -5,11 +5,11 @@ use crate::db; pub fn vault_set_with_conn(conn: &rusqlite::Connection, label: &str, value: &str, vault_key: &[u8; 32]) -> Result<()> { let encrypted = crypto_vault::encrypt(vault_key, value.as_bytes())?; - conn.execute( + conn.prepare_cached( "INSERT OR REPLACE INTO vault_secrets (label, value) VALUES (?1, ?2)", + )?.execute( rusqlite::params![label, encrypted], - ) - .context("Failed to store secret")?; + ).context("Failed to store secret")?; Ok(()) } From 14d80940218e7674ba650b8ccbb91b16f97280e4 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 07:00:08 +0530 Subject: [PATCH 23/49] Update README with changelog for 756eadb --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 25c566c..8ae9b19 100644 --- a/README.md +++ b/README.md @@ -271,6 +271,14 @@ Cross-compiles to 
`x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi - Rate limiter eviction split into dedicated 5-minute task (was hourly), bounding DashMap memory under sustained attack - Connection pool uses bounded `sync_channel(size)` instead of unbounded `channel()` for explicit capacity enforcement +**756eadb** — Cooperative conn interrupt, prepared statement cache, request timeout, global magic link rate limit +- `db.rs`: call `interrupt()` before force-closing stale SQLite connections for clean WAL rollback +- Hot-path DB queries (`deposit`, `magic_link`, `vault`) switched to `prepare_cached()` to eliminate repeated SQL parse overhead +- 30s global request timeout middleware as defense-in-depth against slow clients or stuck handlers +- Pool size configurable via `ATOMIC_POOL_SIZE` env var (1–64, default auto-detect) +- Rate limiter evicts one expired entry inline when DashMap is full instead of blanket-denying new IPs +- Global per-second rate limit (20/s) on magic link claims prevents distributed brute-force + **5fc281e** — AES-GCM zeroize, dynamic pool sizing, health check WAL monitor - Enable `zeroize` feature on `aes-gcm`: AES key schedule is now wiped from memory on cipher drop - DB connection pool sized dynamically via `available_parallelism()` (clamped 2..8) instead of hardcoded 4 From 64d8ea48ad9a1da7743dd1b53dc3a1c434d3666a Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 07:09:38 +0530 Subject: [PATCH 24/49] =?UTF-8?q?=CE=BB-RLM=20iter=2017:=20zeroize=20vault?= =?UTF-8?q?=20plaintext=20return,=20cleanup=20indexes,=20paginated=20delet?= =?UTF-8?q?es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - vault_get returns Zeroizing so decrypted plaintext is wiped from heap on drop instead of left in freed memory - Add indexes on expires_at, used_at, deposited_at columns used by hourly cleanup task to avoid full table scans under write lock - Paginate cleanup DELETEs (1000 rows/batch via rowid 
subquery) to prevent long WAL write locks under heavy load --- src/db.rs | 9 ++++++++- src/server.rs | 38 ++++++++++++++++++++++++++++++++------ src/vault.rs | 16 ++++++++++------ 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/src/db.rs b/src/db.rs index 41543fa..eab7623 100644 --- a/src/db.rs +++ b/src/db.rs @@ -164,7 +164,14 @@ fn migrate(conn: &Connection) -> Result<()> { source_ip TEXT, user_agent TEXT, deposited_at INTEGER NOT NULL - );", + ); + + -- Indexes on time columns used by the hourly cleanup task. + -- Without these, DELETE ... WHERE expires_at/used_at/deposited_at < ? + -- does a full table scan, holding a write lock longer than necessary. + CREATE INDEX IF NOT EXISTS idx_magic_links_expires ON magic_links(expires_at); + CREATE INDEX IF NOT EXISTS idx_used_deposits_used_at ON used_deposits(used_at); + CREATE INDEX IF NOT EXISTS idx_deposit_log_deposited_at ON deposit_log(deposited_at);", ) .context("Failed to run migrations")?; Ok(()) diff --git a/src/server.rs b/src/server.rs index 135e1b7..94299f1 100644 --- a/src/server.rs +++ b/src/server.rs @@ -295,17 +295,43 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { } }; let now = chrono::Utc::now().timestamp(); - if let Err(e) = conn.execute("DELETE FROM magic_links WHERE expires_at <= ?1", [now]) { - tracing::warn!("Failed to clean expired magic links: {e}"); + // Paginated deletes: batch 1000 rows at a time to avoid holding + // the WAL write lock for extended periods under heavy load. 
+ loop { + match conn.execute( + "DELETE FROM magic_links WHERE rowid IN \ + (SELECT rowid FROM magic_links WHERE expires_at <= ?1 LIMIT 1000)", + [now], + ) { + Ok(0) => break, + Ok(_) => continue, + Err(e) => { tracing::warn!("Failed to clean expired magic links: {e}"); break; } + } } let cutoff = now - 7 * 86400; - if let Err(e) = conn.execute("DELETE FROM used_deposits WHERE used_at < ?1", [cutoff]) { - tracing::warn!("Failed to clean old deposit nonces: {e}"); + loop { + match conn.execute( + "DELETE FROM used_deposits WHERE rowid IN \ + (SELECT rowid FROM used_deposits WHERE used_at < ?1 LIMIT 1000)", + [cutoff], + ) { + Ok(0) => break, + Ok(_) => continue, + Err(e) => { tracing::warn!("Failed to clean old deposit nonces: {e}"); break; } + } } // Purge deposit log entries older than 90 days to prevent unbounded disk growth let log_cutoff = now - 90 * 86400; - if let Err(e) = conn.execute("DELETE FROM deposit_log WHERE deposited_at < ?1", [log_cutoff]) { - tracing::warn!("Failed to clean old deposit log entries: {e}"); + loop { + match conn.execute( + "DELETE FROM deposit_log WHERE rowid IN \ + (SELECT rowid FROM deposit_log WHERE deposited_at < ?1 LIMIT 1000)", + [log_cutoff], + ) { + Ok(0) => break, + Ok(_) => continue, + Err(e) => { tracing::warn!("Failed to clean old deposit log entries: {e}"); break; } + } } // TRUNCATE checkpoint hourly to reclaim WAL disk space if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { diff --git a/src/vault.rs b/src/vault.rs index fa7d23a..af78ad2 100644 --- a/src/vault.rs +++ b/src/vault.rs @@ -1,4 +1,5 @@ use anyhow::{bail, Context, Result}; +use zeroize::Zeroizing; use crate::crypto::vault as crypto_vault; use crate::db; @@ -18,7 +19,7 @@ pub fn vault_set(label: &str, value: &str, vault_key: &[u8; 32]) -> Result<()> { vault_set_with_conn(&conn, label, value, vault_key) } -pub fn vault_get(label: &str, vault_key: &[u8; 32]) -> Result> { +pub fn vault_get(label: &str, vault_key: &[u8; 32]) -> Result>> { 
let conn = db::open()?; let mut stmt = conn .prepare("SELECT value FROM vault_secrets WHERE label = ?1") @@ -31,10 +32,13 @@ pub fn vault_get(label: &str, vault_key: &[u8; 32]) -> Result> { match result { Some(encrypted) => { let plaintext = crypto_vault::decrypt(vault_key, &encrypted)?; - // plaintext is Zeroizing> — borrow, convert, then let it drop (zeroing memory) - let value = std::str::from_utf8(&plaintext) - .context("Vault value is not valid UTF-8")? - .to_string(); + // Convert to String and wrap in Zeroizing so the heap copy is wiped on drop, + // not left as cleartext in freed memory. + let value = Zeroizing::new( + std::str::from_utf8(&plaintext) + .context("Vault value is not valid UTF-8")? + .to_string(), + ); Ok(Some(value)) } None => Ok(None), @@ -93,7 +97,7 @@ pub fn cmd_set(label: &str, value: &str, vault_key: &[u8; 32]) -> Result<()> { pub fn cmd_get(label: &str, vault_key: &[u8; 32]) -> Result<()> { match vault_get(label, vault_key)? { Some(value) => { - print!("{value}"); // no trailing newline, so it works in $() + print!("{}", &*value); // no trailing newline, so it works in $() Ok(()) } None => bail!("Label '{label}' not found in vault"), From b168a7f5a64a8145ae8e10fd48a8aeb68e9e8669 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 07:09:58 +0530 Subject: [PATCH 25/49] Update README with changelog for 64d8ea4 --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 8ae9b19..a0c45c4 100644 --- a/README.md +++ b/README.md @@ -271,6 +271,11 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi - Rate limiter eviction split into dedicated 5-minute task (was hourly), bounding DashMap memory under sustained attack - Connection pool uses bounded `sync_channel(size)` instead of unbounded `channel()` for explicit capacity enforcement +**64d8ea4** — Zeroize vault plaintext return, cleanup indexes, paginated deletes +- `vault_get` returns `Zeroizing` so decrypted 
plaintext is wiped from heap on drop instead of left in freed memory +- Added indexes on `expires_at`, `used_at`, `deposited_at` columns to eliminate full table scans during hourly cleanup +- Paginated cleanup DELETEs (1000 rows/batch via rowid subquery) to prevent long WAL write locks under heavy load + **756eadb** — Cooperative conn interrupt, prepared statement cache, request timeout, global magic link rate limit - `db.rs`: call `interrupt()` before force-closing stale SQLite connections for clean WAL rollback - Hot-path DB queries (`deposit`, `magic_link`, `vault`) switched to `prepare_cached()` to eliminate repeated SQL parse overhead From 033e801899a4b890eda9fde516527670a297e3b6 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 07:17:15 +0530 Subject: [PATCH 26/49] =?UTF-8?q?=CE=BB-RLM=20iter=2018:=20connection=20ma?= =?UTF-8?q?x-lifetime=20recycling,=20hard=20heap=20limit,=20zero-copy=20si?= =?UTF-8?q?g=20decode,=20safer=20shutdown=20checkpoint?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - db: recycle pooled connections after 30 min to reset SQLite allocator fragmentation - db: track connection creation time (created_at) alongside checkout time (acquired_at) - db: PRAGMA hard_heap_limit=128MB to prevent OOM under sustained load - db: increase prepared statement cache capacity to 100 (from default 16) - server: shutdown checkpoint uses RESTART with PASSIVE fallback instead of TRUNCATE - server: health endpoint checks disk space (>100MB free) via fs2::available_space - deposit: stack-allocated [u8;64] for Ed25519 signature base64 decode (zero heap alloc) - main: panic hook cleans up orphaned .tmp.* files from write_secure --- src/db.rs | 44 ++++++++++++++++++++++++++++++++++++-------- src/deposit.rs | 11 ++++++++--- src/main.rs | 16 ++++++++++++++-- src/server.rs | 22 +++++++++++++++++----- 4 files changed, 75 insertions(+), 18 deletions(-) diff --git a/src/db.rs b/src/db.rs index 
eab7623..3c68d5d 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,18 +1,23 @@ use anyhow::{Context, Result}; use rusqlite::Connection; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::sync::mpsc; use std::time::{Duration, Instant}; use crate::config; +/// Maximum lifetime for a pooled connection before it's recycled. +/// Prevents SQLite page cache fragmentation from accumulating over hours/days. +const CONN_MAX_LIFETIME: Duration = Duration::from_secs(1800); // 30 minutes + /// Zero-dependency connection pool for SQLite. /// Uses a bounded sync_channel to distribute pre-opened connections. /// WAL mode allows concurrent readers; the pool prevents serialization /// behind a single Mutex. pub struct DbPool { - sender: mpsc::SyncSender, - receiver: std::sync::Mutex>, + sender: mpsc::SyncSender<(Connection, Instant)>, + receiver: std::sync::Mutex>, + db_path: PathBuf, } /// RAII guard that returns the connection to the pool on drop. @@ -20,6 +25,7 @@ pub struct DbPool { pub struct PooledConn<'a> { pool: &'a DbPool, conn: Option, + created_at: Instant, acquired_at: Instant, } @@ -45,21 +51,37 @@ impl Drop for PooledConn<'_> { if held > Duration::from_secs(30) { tracing::warn!("SQLite connection held for {:?}, possible leak", held); } - let _ = self.pool.sender.try_send(c); + let _ = self.pool.sender.try_send((c, self.created_at)); } } } impl DbPool { /// Get a connection from the pool, blocking up to 5 seconds. + /// Connections older than 30 minutes are recycled to reset SQLite's + /// internal allocator and prevent page cache fragmentation. 
pub fn get(&self) -> Result> { let rx = self.receiver.lock().unwrap_or_else(|e| e.into_inner()); - let conn = rx - .recv_timeout(std::time::Duration::from_secs(5)) + let (conn, created_at) = rx + .recv_timeout(Duration::from_secs(5)) .map_err(|_| anyhow::anyhow!("DB pool exhausted (5s timeout)"))?; + + // Recycle stale connections to reset SQLite's internal allocator + if created_at.elapsed() > CONN_MAX_LIFETIME { + drop(conn); + let fresh = open_connection(&self.db_path)?; + return Ok(PooledConn { + pool: self, + conn: Some(fresh), + created_at: Instant::now(), + acquired_at: Instant::now(), + }); + } + Ok(PooledConn { pool: self, conn: Some(conn), + created_at, acquired_at: Instant::now(), }) } @@ -94,6 +116,10 @@ fn open_connection(db_path: &Path) -> Result { conn.pragma_update(None, "journal_size_limit", "67108864")?; conn.pragma_update(None, "wal_autocheckpoint", "1000")?; conn.pragma_update(None, "mmap_size", "67108864")?; + // Process-wide SQLite memory limit to prevent OOM under sustained load + let _ = conn.pragma_update(None, "hard_heap_limit", "134217728"); // 128MB + // Increase prepared statement cache for hot query paths (default is 16) + conn.set_prepared_statement_cache_capacity(100); Ok(conn) } @@ -107,16 +133,18 @@ pub fn open_pool(size: usize) -> Result { let first = open_connection(&db_path)?; migrate(&first)?; + let now = Instant::now(); let (tx, rx) = mpsc::sync_channel(size); - tx.send(first).expect("channel just created"); + tx.send((first, now)).expect("channel just created"); for _ in 1..size { - tx.send(open_connection(&db_path)?).expect("channel just created"); + tx.send((open_connection(&db_path)?, now)).expect("channel just created"); } Ok(DbPool { sender: tx, receiver: std::sync::Mutex::new(rx), + db_path, }) } diff --git a/src/deposit.rs b/src/deposit.rs index 47ff36d..40e4ba9 100644 --- a/src/deposit.rs +++ b/src/deposit.rs @@ -63,9 +63,14 @@ fn try_verify_signature( .split_once('.') .ok_or_else(|| anyhow::anyhow!("No '.' 
separator in token"))?; - let sig_bytes = B64URL.decode(sig_b64)?; - let sig = ed25519_dalek::Signature::from_slice(&sig_bytes) - .map_err(|e| anyhow::anyhow!("Bad signature bytes: {e}"))?; + // Decode signature into stack-allocated buffer (zero heap allocation) + let mut sig_buf = [0u8; 64]; + let sig_len = B64URL.decode_slice(sig_b64, &mut sig_buf) + .map_err(|e| anyhow::anyhow!("Bad signature: {e}"))?; + if sig_len != 64 { + anyhow::bail!("Signature must be 64 bytes, got {sig_len}"); + } + let sig = ed25519_dalek::Signature::from_bytes(&sig_buf); use ed25519_dalek::Verifier; verifying_key diff --git a/src/main.rs b/src/main.rs index 43c5ce9..14bd3bd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -28,13 +28,25 @@ async fn main() -> Result<()> { .with_env_filter(EnvFilter::from_default_env().add_directive("atomic=info".parse()?)) .init(); - // Clean up PID file on panic (best-effort). With panic=abort in release, - // the hook still runs before the process terminates. + // Clean up PID file and temp files on panic (best-effort). + // With panic=abort in release, the hook still runs before the process terminates. 
std::panic::set_hook(Box::new(|info| { eprintln!("atomic: fatal panic: {info}"); if let Ok(path) = config::pid_path() { let _ = std::fs::remove_file(path); } + // Clean temp files left by write_secure (atomic write pattern) + if let Ok(dir) = config::atomic_dir() { + if let Ok(entries) = std::fs::read_dir(&dir) { + for entry in entries.flatten() { + if let Some(name) = entry.file_name().to_str() { + if name.contains(".tmp.") { + let _ = std::fs::remove_file(entry.path()); + } + } + } + } + } })); let cli = Cli::parse(); diff --git a/src/server.rs b/src/server.rs index 94299f1..53b6467 100644 --- a/src/server.rs +++ b/src/server.rs @@ -460,8 +460,13 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { tokio::task::spawn_blocking(move || { match shutdown_state.db_pool.get() { Ok(conn) => { - if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { - tracing::warn!("Final WAL checkpoint failed: {e}"); + // RESTART checkpoints and resets the WAL header without truncating. + // Safer than TRUNCATE which can block indefinitely if readers exist. 
+ if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(RESTART);") { + tracing::warn!("Final WAL RESTART checkpoint failed: {e}, falling back to PASSIVE"); + if let Err(e2) = conn.execute_batch("PRAGMA wal_checkpoint(PASSIVE);") { + tracing::warn!("Final WAL PASSIVE checkpoint also failed: {e2}"); + } } } Err(e) => tracing::warn!("Final WAL checkpoint skipped: {e}"), @@ -669,12 +674,19 @@ async fn handle_health(State(state): State>) -> Response { .map(|m| m.len() < 50 * 1024 * 1024) .unwrap_or(true); // WAL not existing is fine - if db_ok && agent_ok && wal_ok { + // Check disk space (>100MB free) to prevent SQLite "disk full" corruption + let disk_ok = crate::config::atomic_dir() + .ok() + .and_then(|d| fs2::available_space(&d).ok()) + .map(|avail| avail > 100 * 1024 * 1024) + .unwrap_or(true); // If we can't check, assume ok + + if db_ok && agent_ok && wal_ok && disk_ok { (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], r#"{"status":"ok"}"#).into_response() } else { let detail = format!( - r#"{{"status":"degraded","db":{},"agent_json":{},"wal_size_ok":{}}}"#, - db_ok, agent_ok, wal_ok + r#"{{"status":"degraded","db":{},"agent_json":{},"wal_size_ok":{},"disk_space_ok":{}}}"#, + db_ok, agent_ok, wal_ok, disk_ok ); (StatusCode::SERVICE_UNAVAILABLE, [(header::CONTENT_TYPE, "application/json")], detail).into_response() } From b3e7b5f031424c3839d12118f066daedf2749281 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 07:17:40 +0530 Subject: [PATCH 27/49] Update README with changelog for 033e801 --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index a0c45c4..23d48c9 100644 --- a/README.md +++ b/README.md @@ -284,6 +284,15 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi - Rate limiter evicts one expired entry inline when DashMap is full instead of blanket-denying new IPs - Global per-second rate limit (20/s) on magic link claims prevents distributed 
brute-force +**033e801** — Connection max-lifetime recycling, hard heap limit, zero-copy sig decode, safer shutdown +- Pooled SQLite connections recycled after 30 min to reset allocator fragmentation (new `created_at` tracking) +- `PRAGMA hard_heap_limit=128MB` caps process-wide SQLite memory to prevent OOM +- Prepared statement cache capacity increased to 100 (from default 16) +- Shutdown WAL checkpoint uses RESTART with PASSIVE fallback instead of TRUNCATE (avoids indefinite blocking) +- Health endpoint checks disk space (>100MB free via `fs2::available_space`) to prevent disk-full corruption +- Ed25519 signature base64 decode uses stack-allocated `[u8; 64]` instead of heap `Vec` (zero-alloc hot path) +- Panic hook cleans orphaned `.tmp.*` files from `write_secure` atomic write pattern + **5fc281e** — AES-GCM zeroize, dynamic pool sizing, health check WAL monitor - Enable `zeroize` feature on `aes-gcm`: AES key schedule is now wiped from memory on cipher drop - DB connection pool sized dynamically via `available_parallelism()` (clamped 2..8) instead of hardcoded 4 From 4bc32f4112dea7756e547df24bd113ba7b679e45 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 07:25:55 +0530 Subject: [PATCH 28/49] =?UTF-8?q?=CE=BB-RLM=20iter=2019:=20pool=20poison?= =?UTF-8?q?=20detection,=20DB=20circuit=20breaker,=20RLIMIT=5FNOFILE=20enf?= =?UTF-8?q?orcement,=20proactive=20WAL=20truncation,=20X-Frame-Options?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - db.rs: detect active transactions in PooledConn::drop via is_autocommit(), roll back before returning to pool to prevent poisoned connections - server.rs: add per-request DB circuit breaker (AtomicU32/AtomicU64) that opens after 5 consecutive failures, returns 503+Retry-After for 30s - server.rs: enforce RLIMIT_NOFILE ≥4096 at startup via libc::setrlimit, fail-fast if hard limit insufficient - server.rs: proactive WAL truncation when file exceeds 40MB in the 5-minute 
checkpoint task (escalates PASSIVE → TRUNCATE → RESTART) - server.rs: offload rate limiter DashMap::retain() to spawn_blocking to avoid stalling async executor under high IP count - server.rs: add X-Frame-Options: DENY security header --- Cargo.lock | 1 + Cargo.toml | 1 + src/db.rs | 7 +++ src/server.rs | 132 ++++++++++++++++++++++++++++++++++++++++++++------ 4 files changed, 127 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d9121ec..a084fce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -139,6 +139,7 @@ dependencies = [ "fs2", "hex", "hkdf", + "libc", "rand 0.8.5", "reqwest", "rusqlite", diff --git a/Cargo.toml b/Cargo.toml index ef95a84..d201001 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,6 +69,7 @@ zeroize = { version = "1.8.2", features = ["derive"] } dashmap = "6" fs2 = "0.4" tikv-jemallocator = { version = "0.6", optional = true } +libc = "0.2" [profile.release] opt-level = 3 diff --git a/src/db.rs b/src/db.rs index 3c68d5d..56c4ebd 100644 --- a/src/db.rs +++ b/src/db.rs @@ -51,6 +51,13 @@ impl Drop for PooledConn<'_> { if held > Duration::from_secs(30) { tracing::warn!("SQLite connection held for {:?}, possible leak", held); } + // Poison detection: if a transaction is still active (e.g., handler panicked + // mid-write), roll it back before returning to the pool. Returning a dirty + // connection could corrupt subsequent operations on that pooled handle. + if !c.is_autocommit() { + tracing::warn!("Returning connection with active transaction, rolling back"); + let _ = c.execute_batch("ROLLBACK"); + } let _ = self.pool.sender.try_send((c, self.created_at)); } } diff --git a/src/server.rs b/src/server.rs index 53b6467..4a00a20 100644 --- a/src/server.rs +++ b/src/server.rs @@ -87,6 +87,18 @@ const RATE_LIMIT_MAX_ENTRIES: usize = 10_000; const MAGIC_LINK_GLOBAL_MAX_PER_SEC: u32 = 20; const MAX_INPUT_LEN: usize = 256; +/// Consecutive DB failures before the circuit breaker opens. 
+const DB_CIRCUIT_THRESHOLD: u32 = 5; +/// How long (seconds) the circuit stays open before allowing a probe request. +const DB_CIRCUIT_OPEN_SECS: u64 = 30; + +fn epoch_secs() -> u64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() +} + /// Reject inputs with non-printable characters or excessive length. fn is_valid_input(s: &str) -> bool { !s.is_empty() @@ -110,6 +122,10 @@ pub struct AppState { magic_link_window: std::sync::atomic::AtomicI64, /// Count of magic link claims in the current 1-second window. magic_link_count: std::sync::atomic::AtomicU32, + /// Circuit breaker: consecutive DB operation failure count. + db_fail_count: std::sync::atomic::AtomicU32, + /// Circuit breaker: epoch second when circuit opened (0 = closed). + db_circuit_opened_at: std::sync::atomic::AtomicU64, } impl AppState { @@ -146,6 +162,38 @@ impl AppState { } } + /// Record a DB operation failure. Opens the circuit after DB_CIRCUIT_THRESHOLD consecutive failures. + fn record_db_failure(&self) { + let count = self.db_fail_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed) + 1; + if count >= DB_CIRCUIT_THRESHOLD { + let now = epoch_secs(); + let prev = self.db_circuit_opened_at.load(std::sync::atomic::Ordering::Relaxed); + if prev == 0 { + self.db_circuit_opened_at.store(now, std::sync::atomic::Ordering::Relaxed); + tracing::error!("DB circuit breaker OPEN after {count} consecutive failures"); + } + } + } + + /// Record a successful DB operation. Resets the circuit breaker. + fn record_db_success(&self) { + self.db_fail_count.store(0, std::sync::atomic::Ordering::Relaxed); + if self.db_circuit_opened_at.load(std::sync::atomic::Ordering::Relaxed) != 0 { + self.db_circuit_opened_at.store(0, std::sync::atomic::Ordering::Relaxed); + tracing::info!("DB circuit breaker closed after successful operation"); + } + } + + /// Returns true if the circuit is open (DB operations should be rejected). 
+ fn is_db_circuit_open(&self) -> bool { + let opened_at = self.db_circuit_opened_at.load(std::sync::atomic::Ordering::Relaxed); + if opened_at == 0 { + return false; + } + // Allow a probe request after the cool-down period (half-open state) + epoch_secs().saturating_sub(opened_at) < DB_CIRCUIT_OPEN_SECS + } + /// Global rate limit for magic link claim attempts (across all IPs). /// Prevents distributed brute-force even if each IP stays under per-IP limits. fn check_magic_link_global_rate(&self) -> bool { @@ -178,20 +226,29 @@ const MAX_BODY_SIZE: usize = 1024 * 1024; pub async fn run_server(credentials: Credentials) -> Result<()> { // --- Startup checks --- - // Fix 6: Warn if file descriptor limit is too low for a long-lived TLS server + // Enforce file descriptor limit: raise soft limit toward 65535 if below 4096. + // Without enough fds, a connection spike causes "too many open files" crashes. #[cfg(unix)] { - if let Ok(output) = std::process::Command::new("sh") - .args(["-c", "ulimit -n"]) - .output() - { - if let Ok(s) = std::str::from_utf8(&output.stdout) { - if let Ok(n) = s.trim().parse::() { - if n < 4096 { + let mut rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 }; + if unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlim) } == 0 { + let current = rlim.rlim_cur; + if current < 4096 { + let target = rlim.rlim_max.min(65535); + if target >= 4096 { + rlim.rlim_cur = target; + if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim) } == 0 { + tracing::info!("Raised RLIMIT_NOFILE soft limit: {current} → {target}"); + } else { tracing::warn!( - "RLIMIT_NOFILE is {n}, recommended minimum 4096 for production" + "Failed to raise RLIMIT_NOFILE from {current} to {target}" ); } + } else { + anyhow::bail!( + "RLIMIT_NOFILE hard limit is {} (need ≥4096). 
Raise with: ulimit -n 4096", + rlim.rlim_max + ); } } } @@ -253,6 +310,8 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { rate_limiter: DashMap::with_capacity(256), magic_link_window: std::sync::atomic::AtomicI64::new(0), magic_link_count: std::sync::atomic::AtomicU32::new(0), + db_fail_count: std::sync::atomic::AtomicU32::new(0), + db_circuit_opened_at: std::sync::atomic::AtomicU64::new(0), }); // Background task: WAL checkpoint every 5 minutes (PASSIVE to avoid blocking writers; @@ -265,9 +324,24 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { tokio::time::sleep(std::time::Duration::from_secs(300)).await; let db_ref = st.clone(); let _ = tokio::task::spawn_blocking(move || { + // Check WAL file size: escalate from PASSIVE to TRUNCATE if >40MB + // to prevent unbounded WAL growth under heavy reader load. + let wal_large = crate::config::atomic_dir() + .map(|d| d.join("atomic.db-wal")) + .ok() + .and_then(|p| std::fs::metadata(&p).ok()) + .map(|m| m.len() > 40 * 1024 * 1024) + .unwrap_or(false); + match db_ref.db_pool.get() { Ok(conn) => { - if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(PASSIVE);") { + if wal_large { + tracing::warn!("WAL exceeds 40MB, forcing TRUNCATE checkpoint"); + if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { + tracing::warn!("WAL TRUNCATE checkpoint failed: {e}, falling back to RESTART"); + let _ = conn.execute_batch("PRAGMA wal_checkpoint(RESTART);"); + } + } else if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(PASSIVE);") { tracing::warn!("WAL checkpoint failed: {e}"); } } @@ -352,9 +426,14 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { async move { loop { tokio::time::sleep(std::time::Duration::from_secs(300)).await; - let cutoff = std::time::Duration::from_secs(RATE_LIMIT_WINDOW_SECS); - let now = std::time::Instant::now(); - st.rate_limiter.retain(|_, (_, window_start)| now.duration_since(*window_start) <= cutoff); + // Offload 
O(n) DashMap scan to the blocking pool to avoid stalling + // the async executor under 100k+ entries. + let st2 = st.clone(); + let _ = tokio::task::spawn_blocking(move || { + let cutoff = std::time::Duration::from_secs(RATE_LIMIT_WINDOW_SECS); + let now = std::time::Instant::now(); + st2.rate_limiter.retain(|_, (_, window_start)| now.duration_since(*window_start) <= cutoff); + }).await; } } }); @@ -516,6 +595,11 @@ async fn handle_deposit( return StatusCode::NOT_FOUND.into_response(); } + // Circuit breaker: reject immediately if DB is known-broken (disk full, corrupt, etc.) + if state.is_db_circuit_open() { + return (StatusCode::SERVICE_UNAVAILABLE, [("retry-after", "30")]).into_response(); + } + // Only trust X-Forwarded-For when running behind a known reverse proxy. // Parse the rightmost entry as IpAddr to reject spoofed non-IP values. let source_ip = if state.behind_proxy { @@ -567,9 +651,11 @@ async fn handle_deposit( match deposit_result { Err(_elapsed) => { tracing::error!("Deposit handler timed out"); + state.record_db_failure(); StatusCode::NOT_FOUND.into_response() } Ok(Ok(Ok(label))) => { + state.record_db_success(); info!("Deposit received: '{label}'"); let resp = DepositResponse { status: "deposited", label }; match serde_json::to_string(&resp) { @@ -583,11 +669,13 @@ async fn handle_deposit( tracing::debug!("Deposit replay rejected"); } else { tracing::error!("Deposit failed: {e}"); + state.record_db_failure(); } StatusCode::NOT_FOUND.into_response() } Ok(Err(e)) => { tracing::error!("Deposit task panicked: {e}"); + state.record_db_failure(); StatusCode::NOT_FOUND.into_response() } } @@ -612,6 +700,11 @@ async fn handle_magic_link( return StatusCode::NOT_FOUND.into_response(); } + // Circuit breaker: reject immediately if DB is known-broken + if state.is_db_circuit_open() { + return (StatusCode::SERVICE_UNAVAILABLE, [("retry-after", "30")]).into_response(); + } + let state_clone = state.clone(); let code_clone = code; let result = 
tokio::time::timeout( @@ -625,22 +718,29 @@ async fn handle_magic_link( match result { Err(_elapsed) => { tracing::error!("Magic link handler timed out"); + state.record_db_failure(); StatusCode::NOT_FOUND.into_response() } Ok(Ok(Ok(Some(_)))) => { + state.record_db_success(); let resp = MagicLinkResponse { status: "verified" }; match serde_json::to_string(&resp) { Ok(json) => (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], json).into_response(), Err(_) => StatusCode::NOT_FOUND.into_response(), } } - Ok(Ok(Ok(None))) => StatusCode::NOT_FOUND.into_response(), + Ok(Ok(Ok(None))) => { + state.record_db_success(); + StatusCode::NOT_FOUND.into_response() + } Ok(Ok(Err(e))) => { tracing::error!("Magic link DB error: {e}"); + state.record_db_failure(); StatusCode::NOT_FOUND.into_response() } Ok(Err(e)) => { tracing::error!("Magic link task panicked: {e}"); + state.record_db_failure(); StatusCode::NOT_FOUND.into_response() } } @@ -730,6 +830,10 @@ async fn security_headers( header::CONTENT_SECURITY_POLICY, HeaderValue::from_static("default-src 'none'"), ); + headers.insert( + header::X_FRAME_OPTIONS, + HeaderValue::from_static("DENY"), + ); if state.tls_active { headers.insert( header::STRICT_TRANSPORT_SECURITY, From 4c321f16c588913d215b8dc2e0b115ad8a96dbef Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 07:26:18 +0530 Subject: [PATCH 29/49] Update README with changelog for 4bc32f4 --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 23d48c9..bf8e23c 100644 --- a/README.md +++ b/README.md @@ -284,6 +284,14 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi - Rate limiter evicts one expired entry inline when DashMap is full instead of blanket-denying new IPs - Global per-second rate limit (20/s) on magic link claims prevents distributed brute-force +**4bc32f4** — Pool poison detection, DB circuit breaker, RLIMIT_NOFILE enforcement, proactive WAL truncation +- 
`PooledConn::drop` checks `is_autocommit()` and rolls back active transactions before returning to pool (prevents poisoned connections) +- Per-request DB circuit breaker: opens after 5 consecutive failures, returns 503+Retry-After:30 for 30s cool-down +- Startup enforces RLIMIT_NOFILE ≥4096 via `libc::setrlimit`, raises soft limit toward 65535 (prevents "too many open files" under load) +- WAL checkpoint task escalates from PASSIVE to TRUNCATE when WAL exceeds 40MB (prevents unbounded WAL growth) +- Rate limiter `DashMap::retain()` offloaded to `spawn_blocking` to avoid stalling async executor under 100k+ IP entries +- `X-Frame-Options: DENY` security header added to all responses + **033e801** — Connection max-lifetime recycling, hard heap limit, zero-copy sig decode, safer shutdown - Pooled SQLite connections recycled after 30 min to reset allocator fragmentation (new `created_at` tracking) - `PRAGMA hard_heap_limit=128MB` caps process-wide SQLite memory to prevent OOM From 72f83056fb0d43a61f254360c0521e380a69fe70 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 07:36:28 +0530 Subject: [PATCH 30/49] =?UTF-8?q?=CE=BB-RLM=20iter=2020:=20constant-time?= =?UTF-8?q?=20magic=20link,=20fs=20cert=20watcher,=20FxHasher=20rate=20lim?= =?UTF-8?q?iter,=20Acquire/Release=20circuit=20breaker,=20in-flight=20drai?= =?UTF-8?q?n,=20panic-safe=20pool=20return?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - magic_link.rs: SELECT + subtle::ConstantTimeEq before DELETE to prevent timing side-channels on code existence - main.rs: enable jemalloc background thread for aggressive memory purging of zeroed key material - tls.rs: replace 12h polling with notify filesystem watcher (kqueue/inotify) + polling fallback - server.rs: DashMap rate limiter uses FxHasher (~2-3x faster for IP keys), circuit breaker atomics upgraded from Relaxed to Acquire/Release for ARM correctness, added in-flight request counter with 30s drain before 
WAL checkpoint, reduced MAX_BODY_SIZE from 1MB to 64KB - db.rs: catch_unwind in PooledConn::Drop to prevent double-panic abort skipping Zeroizing destructors --- Cargo.lock | 179 ++++++++++++++++++++++++++++++++++++++++++++-- Cargo.toml | 6 +- src/db.rs | 11 ++- src/magic_link.rs | 35 +++++++-- src/main.rs | 10 +++ src/server.rs | 75 ++++++++++++++----- src/tls.rs | 116 +++++++++++++++++++++++++----- 7 files changed, 386 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a084fce..be75f37 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -140,13 +140,17 @@ dependencies = [ "hex", "hkdf", "libc", + "notify", "rand 0.8.5", "reqwest", "rusqlite", + "rustc-hash", "serde", "serde_json", "sha2", + "subtle", "thiserror", + "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", "tower-http", @@ -275,6 +279,12 @@ version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.11.0" @@ -617,6 +627,17 @@ version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" +[[package]] +name = "filetime" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" +dependencies = [ + "cfg-if", + "libc", + "libredox", +] + [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -670,6 +691,15 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" +[[package]] +name = "fsevent-sys" +version = "4.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" +dependencies = [ + "libc", +] + [[package]] name = "futures-channel" version = "0.3.32" @@ -1079,6 +1109,26 @@ dependencies = [ "hashbrown 0.16.1", ] +[[package]] +name = "inotify" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc" +dependencies = [ + "bitflags 1.3.2", + "inotify-sys", + "libc", +] + +[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", +] + [[package]] name = "inout" version = "0.1.4" @@ -1088,6 +1138,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + [[package]] name = "ipnet" version = "2.12.0" @@ -1136,6 +1195,26 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kqueue" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac30106d7dce88daf4a3fcb4879ea939476d5074a9b7ddd0fb97fa4bed5596a" +dependencies = [ + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" +dependencies = [ + "bitflags 1.3.2", + "libc", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -1154,7 +1233,10 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" dependencies = [ + "bitflags 2.11.0", "libc", 
+ "plain", + "redox_syscall 0.7.3", ] [[package]] @@ -1229,10 +1311,39 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", + "log", "wasi", "windows-sys 0.61.2", ] +[[package]] +name = "notify" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c533b4c39709f9ba5005d8002048266593c1cfaf3c5f0739d5b8ab0c6c504009" +dependencies = [ + "bitflags 2.11.0", + "filetime", + "fsevent-sys", + "inotify", + "kqueue", + "libc", + "log", + "mio", + "notify-types", + "walkdir", + "windows-sys 0.52.0", +] + +[[package]] +name = "notify-types" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "585d3cb5e12e01aed9e8a1f70d5c6b5e86fe2a6e48fc8cd0b3e0b8df6f6eb174" +dependencies = [ + "instant", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -1283,11 +1394,17 @@ checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.5.18", "smallvec", "windows-link", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "percent-encoding" version = "2.3.2" @@ -1322,6 +1439,12 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + [[package]] name = "polyval" version = "0.6.2" @@ -1496,7 +1619,16 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags", + "bitflags 2.11.0", +] + +[[package]] +name = "redox_syscall" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16" +dependencies = [ + "bitflags 2.11.0", ] [[package]] @@ -1585,7 +1717,7 @@ version = "0.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37e34486da88d8e051c7c0e23c3f15fd806ea8546260aa2fec247e97242ec143" dependencies = [ - "bitflags", + "bitflags 2.11.0", "fallible-iterator", "fallible-streaming-iterator", "hashlink", @@ -1657,6 +1789,15 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -1890,6 +2031,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "tikv-jemalloc-ctl" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "661f1f6a57b3a36dc9174a2c10f19513b4866816e13425d3e418b11cc37bc24c" +dependencies = [ + "libc", + "paste", + "tikv-jemalloc-sys", +] + [[package]] name = "tikv-jemalloc-sys" version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" @@ -2007,7 +2159,7 @@ version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ - "bitflags", + "bitflags 2.11.0", "bytes", "futures-util", "http", @@ -2169,6 +2321,16 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -2297,6 +2459,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index d201001..d69b014 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ path = "src/main.rs" [features] default = ["jemalloc"] -jemalloc = ["dep:tikv-jemallocator"] +jemalloc = ["dep:tikv-jemallocator", "dep:tikv-jemalloc-ctl"] [dependencies] # CLI @@ -69,7 +69,11 @@ zeroize = { version = "1.8.2", features = ["derive"] } dashmap = "6" fs2 = "0.4" tikv-jemallocator = { version = "0.6", optional = true } +tikv-jemalloc-ctl = { version = "0.6", optional = true } libc = "0.2" +subtle = "2" +rustc-hash = "2" +notify = "7" [profile.release] opt-level = 3 diff --git a/src/db.rs b/src/db.rs index 56c4ebd..113a688 100644 --- a/src/db.rs +++ b/src/db.rs @@ -58,7 +58,16 @@ impl Drop for PooledConn<'_> { tracing::warn!("Returning connection with active transaction, rolling back"); let _ = c.execute_batch("ROLLBACK"); } - let _ = self.pool.sender.try_send((c, self.created_at)); + // Panic-safe pool return: catch any panic during channel send to prevent + // double-panic abort (which would skip remaining destructors and Zeroizing). 
+ let sender = &self.pool.sender; + let created = self.created_at; + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let _ = sender.try_send((c, created)); + })); + if result.is_err() { + tracing::error!("Panic while returning connection to pool (connection leaked)"); + } } } } diff --git a/src/magic_link.rs b/src/magic_link.rs index 8646634..4371d64 100644 --- a/src/magic_link.rs +++ b/src/magic_link.rs @@ -66,17 +66,42 @@ pub fn list() -> Result<()> { } /// Called by the server to check and consume a code. One-time use. -/// The code is hashed before lookup, so DB comparison timing is irrelevant. +/// Uses constant-time comparison to prevent timing side-channels that +/// could leak whether a code exists via SQL execution time differences. pub fn claim_with_conn(code: &str, conn: &rusqlite::Connection) -> Option { + use rusqlite::OptionalExtension; + use subtle::ConstantTimeEq; + let now = chrono::Utc::now().timestamp(); let code_hash = hash_code(code); - let deleted = conn - .prepare_cached("DELETE FROM magic_links WHERE code_hash = ?1 AND expires_at > ?2") - .and_then(|mut stmt| stmt.execute(rusqlite::params![code_hash, now])) + // Fetch the stored hash first (SELECT), then compare in constant time. + // SQL timing differs between index hit and miss; the ct_eq comparison + // ensures the overall code path is uniform regardless of existence. 
+ let stored_hash: Option = conn + .query_row( + "SELECT code_hash FROM magic_links WHERE code_hash = ?1 AND expires_at > ?2", + rusqlite::params![code_hash, now], + |row| row.get(0), + ) + .optional() .ok()?; - if deleted > 0 { + let matched = match &stored_hash { + Some(stored) => bool::from(stored.as_bytes().ct_eq(code_hash.as_bytes())), + None => { + // Dummy comparison to keep timing uniform on miss + let dummy = [0u8; 64]; // SHA-256 hex = 64 bytes + let _: subtle::Choice = dummy.ct_eq(code_hash.as_bytes()); + false + } + }; + + if matched { + // Atomically delete the row (one-time use) + conn.prepare_cached("DELETE FROM magic_links WHERE code_hash = ?1") + .and_then(|mut stmt| stmt.execute(rusqlite::params![code_hash])) + .ok()?; Some(code.to_string()) } else { None diff --git a/src/main.rs b/src/main.rs index 14bd3bd..4886ab9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -28,6 +28,16 @@ async fn main() -> Result<()> { .with_env_filter(EnvFilter::from_default_env().add_directive("atomic=info".parse()?)) .init(); + // Enable jemalloc background thread for aggressive memory purging. + // Ensures freed allocations (including zeroed key material) are returned + // to the OS promptly instead of lingering in allocator caches. + #[cfg(feature = "jemalloc")] + { + if let Err(e) = tikv_jemalloc_ctl::background_thread::write(true) { + tracing::warn!("Failed to enable jemalloc background thread: {e}"); + } + } + // Clean up PID file and temp files on panic (best-effort). // With panic=abort in release, the hook still runs before the process terminates. 
std::panic::set_hook(Box::new(|info| { diff --git a/src/server.rs b/src/server.rs index 4a00a20..c8503a6 100644 --- a/src/server.rs +++ b/src/server.rs @@ -11,6 +11,7 @@ use dashmap::DashMap; use serde::Serialize; use std::future::Future; use std::net::{IpAddr, SocketAddr}; +use std::sync::atomic::Ordering; use std::sync::Arc; use tower_http::cors::{Any, CorsLayer}; use tracing::info; @@ -117,7 +118,11 @@ pub struct AppState { pub behind_proxy: bool, /// Sharded concurrent map — no global mutex contention under high concurrency. /// Uses monotonic Instant (not wall clock) to prevent clock-skew manipulation. - rate_limiter: DashMap, + /// FxHasher: IP keys are not attacker-hashed, so collision resistance is unnecessary; + /// ~2-3x faster than SipHash on rate-limit hot paths. + rate_limiter: DashMap, + /// In-flight request counter for graceful shutdown drain. + in_flight: std::sync::atomic::AtomicUsize, /// Global rate limit for magic link claims (epoch second of current window). magic_link_window: std::sync::atomic::AtomicI64, /// Count of magic link claims in the current 1-second window. @@ -163,13 +168,15 @@ impl AppState { } /// Record a DB operation failure. Opens the circuit after DB_CIRCUIT_THRESHOLD consecutive failures. + /// Uses Relaxed for the hot-path counter increment; Release on the state transition + /// so that `is_db_circuit_open` (Acquire) sees a consistent fail_count + opened_at pair. 
fn record_db_failure(&self) { - let count = self.db_fail_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed) + 1; + let count = self.db_fail_count.fetch_add(1, Ordering::Relaxed) + 1; if count >= DB_CIRCUIT_THRESHOLD { let now = epoch_secs(); - let prev = self.db_circuit_opened_at.load(std::sync::atomic::Ordering::Relaxed); + let prev = self.db_circuit_opened_at.load(Ordering::Acquire); if prev == 0 { - self.db_circuit_opened_at.store(now, std::sync::atomic::Ordering::Relaxed); + self.db_circuit_opened_at.store(now, Ordering::Release); tracing::error!("DB circuit breaker OPEN after {count} consecutive failures"); } } @@ -177,16 +184,16 @@ impl AppState { /// Record a successful DB operation. Resets the circuit breaker. fn record_db_success(&self) { - self.db_fail_count.store(0, std::sync::atomic::Ordering::Relaxed); - if self.db_circuit_opened_at.load(std::sync::atomic::Ordering::Relaxed) != 0 { - self.db_circuit_opened_at.store(0, std::sync::atomic::Ordering::Relaxed); + self.db_fail_count.store(0, Ordering::Release); + if self.db_circuit_opened_at.load(Ordering::Acquire) != 0 { + self.db_circuit_opened_at.store(0, Ordering::Release); tracing::info!("DB circuit breaker closed after successful operation"); } } /// Returns true if the circuit is open (DB operations should be rejected). fn is_db_circuit_open(&self) -> bool { - let opened_at = self.db_circuit_opened_at.load(std::sync::atomic::Ordering::Relaxed); + let opened_at = self.db_circuit_opened_at.load(Ordering::Acquire); if opened_at == 0 { return false; } @@ -198,13 +205,13 @@ impl AppState { /// Prevents distributed brute-force even if each IP stays under per-IP limits. 
fn check_magic_link_global_rate(&self) -> bool { let now = chrono::Utc::now().timestamp(); - let window = self.magic_link_window.load(std::sync::atomic::Ordering::Relaxed); + let window = self.magic_link_window.load(Ordering::Relaxed); if window != now { - self.magic_link_window.store(now, std::sync::atomic::Ordering::Relaxed); - self.magic_link_count.store(1, std::sync::atomic::Ordering::Relaxed); + self.magic_link_window.store(now, Ordering::Relaxed); + self.magic_link_count.store(1, Ordering::Relaxed); return true; } - let count = self.magic_link_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let count = self.magic_link_count.fetch_add(1, Ordering::Relaxed); count < MAGIC_LINK_GLOBAL_MAX_PER_SEC } } @@ -220,8 +227,9 @@ struct MagicLinkResponse { status: &'static str, } -/// Max deposit body size: 1 MB -const MAX_BODY_SIZE: usize = 1024 * 1024; +/// Max deposit body size: 64 KB — sufficient for secrets, API keys, certs. +/// Tighter than the original 1MB to limit allocation before input validation. 
+const MAX_BODY_SIZE: usize = 64 * 1024; pub async fn run_server(credentials: Credentials) -> Result<()> { // --- Startup checks --- @@ -307,7 +315,8 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { db_pool, tls_active, behind_proxy, - rate_limiter: DashMap::with_capacity(256), + rate_limiter: DashMap::with_capacity_and_hasher(256, rustc_hash::FxBuildHasher), + in_flight: std::sync::atomic::AtomicUsize::new(0), magic_link_window: std::sync::atomic::AtomicI64::new(0), magic_link_count: std::sync::atomic::AtomicU32::new(0), db_fail_count: std::sync::atomic::AtomicU32::new(0), @@ -458,6 +467,10 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { .route("/m/{code}", get(handle_magic_link)) .route("/_/health", get(handle_health)) .fallback(handle_404) + .layer(middleware::from_fn_with_state( + state.clone(), + track_in_flight, + )) .layer(middleware::from_fn(request_timeout)) .layer(DefaultBodyLimit::max(MAX_BODY_SIZE)) .layer(middleware::from_fn_with_state( @@ -532,7 +545,25 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { } } - // Final WAL checkpoint before exit to ensure all data is merged. + // Drain phase: wait for in-flight requests to complete before checkpointing WAL. + // The graceful shutdown above stops new connections; this ensures handlers finish. + { + let drain_start = std::time::Instant::now(); + let drain_timeout = std::time::Duration::from_secs(30); + loop { + let remaining = shutdown_state.in_flight.load(Ordering::Relaxed); + if remaining == 0 { + break; + } + if drain_start.elapsed() > drain_timeout { + tracing::warn!("Drain timeout: {remaining} requests still in-flight"); + break; + } + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + } + } + + // Final WAL checkpoint after all handlers have drained. // Timeout prevents indefinite hang if the DB is stuck. 
let checkpoint_result = tokio::time::timeout( std::time::Duration::from_secs(5), @@ -796,6 +827,18 @@ async fn handle_404() -> StatusCode { StatusCode::NOT_FOUND } +/// Track in-flight requests for graceful shutdown drain. +async fn track_in_flight( + State(state): State>, + req: axum::http::Request, + next: Next, +) -> Response { + state.in_flight.fetch_add(1, Ordering::Relaxed); + let resp = next.run(req).await; + state.in_flight.fetch_sub(1, Ordering::Relaxed); + resp +} + /// Global request timeout (30s) — defense-in-depth against slow clients or stuck handlers. async fn request_timeout( req: axum::http::Request, diff --git a/src/tls.rs b/src/tls.rs index 764febe..fd3f68a 100644 --- a/src/tls.rs +++ b/src/tls.rs @@ -204,31 +204,109 @@ fn acme_sh_path() -> Result { anyhow::bail!("acme.sh not found") } -// acme.sh sets up its own cron job for renewal. But if atomic manages the server, -// we should reload the cert when acme.sh renews it. This task checks every 12h. +// acme.sh sets up its own cron job for renewal. We watch the TLS directory for +// filesystem events (kqueue/inotify) to pick up renewals immediately, with a +// 12-hour polling fallback for network filesystems where events may not fire. 
pub fn spawn_renewal_watcher(rustls_config: RustlsConfig) { - let check_interval = std::time::Duration::from_secs(12 * 3600); - tokio::spawn(async move { - loop { - tokio::time::sleep(check_interval).await; + let tls_dir = match config::tls_dir() { + Ok(d) => d, + Err(e) => { + warn!("Cannot resolve TLS directory: {e}, falling back to polling"); + poll_renewal(rustls_config).await; + return; + } + }; - let tls_dir = match config::tls_dir() { - Ok(d) => d, - Err(e) => { - warn!("Cert reload check failed: {}", e); - continue; + // Try filesystem watcher first; fall back to polling on failure + match watch_cert_files(tls_dir.clone(), rustls_config.clone()).await { + Ok(()) => {} + Err(e) => { + warn!("Filesystem watcher failed: {e}, falling back to 12h polling"); + poll_renewal(rustls_config).await; + } + } + }); +} + +/// Watch the TLS directory for cert file changes using OS filesystem events. +/// Debounces events by 2 seconds to avoid reading partial writes from acme.sh. +async fn watch_cert_files( + tls_dir: std::path::PathBuf, + rustls_config: RustlsConfig, +) -> Result<()> { + use notify::{Config, RecursiveMode, Watcher}; + + let (tx, mut rx) = tokio::sync::mpsc::channel::<()>(16); + + let mut watcher = notify::RecommendedWatcher::new( + move |res: std::result::Result| { + if let Ok(event) = res { + if matches!( + event.kind, + notify::EventKind::Modify(_) | notify::EventKind::Create(_) + ) { + let _ = tx.try_send(()); } - }; + } + }, + Config::default(), + ) + .context("Failed to create filesystem watcher")?; - let cert_path = tls_dir.join("fullchain.pem"); - let key_path = tls_dir.join("key.pem"); + watcher + .watch(&tls_dir, RecursiveMode::NonRecursive) + .context("Failed to watch TLS directory")?; + + info!("Watching {} for TLS cert changes", tls_dir.display()); - // Hot-reload: if acme.sh renewed the cert on disk, pick it up - match rustls_config.reload_from_pem_file(&cert_path, &key_path).await { - Ok(()) => info!("TLS cert reloaded"), - Err(e) => 
warn!("TLS cert reload failed: {}", e), + loop { + // Wait for first filesystem event + rx.recv().await; + // Debounce: drain pending events for 2 seconds to avoid reading partial writes + loop { + match tokio::time::timeout(std::time::Duration::from_secs(2), rx.recv()).await { + Ok(Some(())) => continue, + _ => break, } } - }); + + let cert_path = tls_dir.join("fullchain.pem"); + let key_path = tls_dir.join("key.pem"); + + match rustls_config + .reload_from_pem_file(&cert_path, &key_path) + .await + { + Ok(()) => info!("TLS cert reloaded (filesystem change detected)"), + Err(e) => warn!("TLS cert reload failed: {e}"), + } + } +} + +/// Fallback: poll every 12 hours for cert changes (for network filesystems). +async fn poll_renewal(rustls_config: RustlsConfig) { + let check_interval = std::time::Duration::from_secs(12 * 3600); + loop { + tokio::time::sleep(check_interval).await; + + let tls_dir = match config::tls_dir() { + Ok(d) => d, + Err(e) => { + warn!("Cert reload check failed: {e}"); + continue; + } + }; + + let cert_path = tls_dir.join("fullchain.pem"); + let key_path = tls_dir.join("key.pem"); + + match rustls_config + .reload_from_pem_file(&cert_path, &key_path) + .await + { + Ok(()) => info!("TLS cert reloaded"), + Err(e) => warn!("TLS cert reload failed: {e}"), + } + } } From eccac506abd3b0c4c08915e5f7ab92106e83e197 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 07:36:49 +0530 Subject: [PATCH 31/49] Update README with changelog for 72f8305 --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index bf8e23c..cd23607 100644 --- a/README.md +++ b/README.md @@ -284,6 +284,16 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi - Rate limiter evicts one expired entry inline when DashMap is full instead of blanket-denying new IPs - Global per-second rate limit (20/s) on magic link claims prevents distributed brute-force +**72f8305** — Constant-time magic link, fs 
cert watcher, FxHasher rate limiter, Acquire/Release circuit breaker, in-flight drain +- `magic_link.rs`: SELECT + `subtle::ConstantTimeEq` before DELETE prevents timing side-channels on code existence +- `main.rs`: jemalloc background thread enabled for aggressive memory purging of zeroed key material +- `tls.rs`: 12h cert polling replaced with `notify` filesystem watcher (kqueue/inotify) + polling fallback for network FS +- `server.rs`: DashMap rate limiter uses `FxHasher` (~2-3x faster for IP keys vs SipHash) +- `server.rs`: circuit breaker atomics upgraded from `Relaxed` to `Acquire/Release` for ARM/weak-ordering correctness +- `server.rs`: in-flight request counter with 30s drain before WAL checkpoint on shutdown +- `server.rs`: `MAX_BODY_SIZE` reduced from 1MB to 64KB (sufficient for secrets/API keys/certs) +- `db.rs`: `catch_unwind` in `PooledConn::Drop` prevents double-panic abort from skipping `Zeroizing` destructors + **4bc32f4** — Pool poison detection, DB circuit breaker, RLIMIT_NOFILE enforcement, proactive WAL truncation - `PooledConn::drop` checks `is_autocommit()` and rolls back active transactions before returning to pool (prevents poisoned connections) - Per-request DB circuit breaker: opens after 5 consecutive failures, returns 503+Retry-After:30 for 30s cool-down From ae91eafa59a01c70f5ca36a7eca1f85dba2aa02a Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 11:29:14 +0530 Subject: [PATCH 32/49] =?UTF-8?q?=CE=BB-RLM=20iter=2021:=20remove=20notify?= =?UTF-8?q?=20crate,=20kill=20spawn=5Fsupervised,=20single-atomic=20circui?= =?UTF-8?q?t=20breaker,=20drop=20global=20magic=20link=20rate=20limit,=20b?= =?UTF-8?q?ranchless=20input=20validation,=20256MB=20mmap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove `notify` crate (fs watcher): cert polling every 6h + SIGHUP for immediate reload. One less crate, zero extra threads/OS handles for a cert that changes every 60 days. 
- Remove `spawn_supervised` restart machinery: background tasks (WAL checkpoint, DB cleanup) use plain `tokio::spawn`. Single-tenant agents should surface panics, not mask them with infinite retries. - Simplify DB circuit breaker: single `AtomicU64` (last failure timestamp) replaces `AtomicU32` fail counter + `AtomicU64` opened_at. Circuit open = within 60s of last failure; no half-open state needed. - Remove global magic link rate limit: delete `check_magic_link_global_rate`, `magic_link_window`, `magic_link_count` atomics. Per-IP rate limiting is sufficient for single-tenant; global counter caused cross-core cache line bouncing. - Remove background rate limiter eviction task: lazy eviction on DashMap overflow (already implemented in `check_rate_limit`) is sufficient. One less spawned task. - Input validation: `is_ascii_graphic() || b == b' '` rejects non-ASCII bytes (0x80+) that slipped through the old `b >= 0x20 && b != 0x7F` check. Auto-vectorizable on x86_64. - SQLite mmap_size: 64MB → 256MB for zero-copy reads on single-tenant data. 
--- Cargo.lock | 159 +----------------------- Cargo.toml | 1 - src/db.rs | 2 +- src/server.rs | 325 +++++++++++++++----------------------------------- src/tls.rs | 119 ++++-------------- 5 files changed, 124 insertions(+), 482 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index be75f37..0e10358 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -140,7 +140,6 @@ dependencies = [ "hex", "hkdf", "libc", - "notify", "rand 0.8.5", "reqwest", "rusqlite", @@ -279,12 +278,6 @@ version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.11.0" @@ -627,17 +620,6 @@ version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" -[[package]] -name = "filetime" -version = "0.2.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" -dependencies = [ - "cfg-if", - "libc", - "libredox", -] - [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -691,15 +673,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" -[[package]] -name = "fsevent-sys" -version = "4.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" -dependencies = [ - "libc", -] - [[package]] name = "futures-channel" version = "0.3.32" @@ -1109,26 +1082,6 @@ dependencies = [ "hashbrown 0.16.1", ] -[[package]] -name = "inotify" -version = "0.10.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc" -dependencies = [ - "bitflags 1.3.2", - "inotify-sys", - "libc", -] - -[[package]] -name = "inotify-sys" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" -dependencies = [ - "libc", -] - [[package]] name = "inout" version = "0.1.4" @@ -1138,15 +1091,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "instant" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" -dependencies = [ - "cfg-if", -] - [[package]] name = "ipnet" version = "2.12.0" @@ -1195,26 +1139,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "kqueue" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac30106d7dce88daf4a3fcb4879ea939476d5074a9b7ddd0fb97fa4bed5596a" -dependencies = [ - "kqueue-sys", - "libc", -] - -[[package]] -name = "kqueue-sys" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" -dependencies = [ - "bitflags 1.3.2", - "libc", -] - [[package]] name = "lazy_static" version = "1.5.0" @@ -1233,10 +1157,7 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" dependencies = [ - "bitflags 2.11.0", "libc", - "plain", - "redox_syscall 0.7.3", ] [[package]] @@ -1311,39 +1232,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", - "log", "wasi", "windows-sys 0.61.2", ] -[[package]] -name = "notify" -version = "7.0.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c533b4c39709f9ba5005d8002048266593c1cfaf3c5f0739d5b8ab0c6c504009" -dependencies = [ - "bitflags 2.11.0", - "filetime", - "fsevent-sys", - "inotify", - "kqueue", - "libc", - "log", - "mio", - "notify-types", - "walkdir", - "windows-sys 0.52.0", -] - -[[package]] -name = "notify-types" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585d3cb5e12e01aed9e8a1f70d5c6b5e86fe2a6e48fc8cd0b3e0b8df6f6eb174" -dependencies = [ - "instant", -] - [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -1394,7 +1286,7 @@ checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.5.18", + "redox_syscall", "smallvec", "windows-link", ] @@ -1439,12 +1331,6 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" -[[package]] -name = "plain" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" - [[package]] name = "polyval" version = "0.6.2" @@ -1619,16 +1505,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.11.0", -] - -[[package]] -name = "redox_syscall" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16" -dependencies = [ - "bitflags 2.11.0", + "bitflags", ] [[package]] @@ -1717,7 +1594,7 @@ version = "0.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37e34486da88d8e051c7c0e23c3f15fd806ea8546260aa2fec247e97242ec143" dependencies = [ - "bitflags 2.11.0", + "bitflags", "fallible-iterator", 
"fallible-streaming-iterator", "hashlink", @@ -1789,15 +1666,6 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - [[package]] name = "scopeguard" version = "1.2.0" @@ -2159,7 +2027,7 @@ version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ - "bitflags 2.11.0", + "bitflags", "bytes", "futures-util", "http", @@ -2321,16 +2189,6 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" -[[package]] -name = "walkdir" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" -dependencies = [ - "same-file", - "winapi-util", -] - [[package]] name = "want" version = "0.3.1" @@ -2459,15 +2317,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -[[package]] -name = "winapi-util" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" -dependencies = [ - "windows-sys 0.61.2", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index d69b014..f19e9ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,7 +73,6 @@ tikv-jemalloc-ctl = { version = "0.6", optional = true } libc = "0.2" subtle = "2" rustc-hash = "2" -notify = "7" 
[profile.release] opt-level = 3 diff --git a/src/db.rs b/src/db.rs index 113a688..4a38133 100644 --- a/src/db.rs +++ b/src/db.rs @@ -131,7 +131,7 @@ fn open_connection(db_path: &Path) -> Result { conn.pragma_update(None, "temp_store", "MEMORY")?; conn.pragma_update(None, "journal_size_limit", "67108864")?; conn.pragma_update(None, "wal_autocheckpoint", "1000")?; - conn.pragma_update(None, "mmap_size", "67108864")?; + conn.pragma_update(None, "mmap_size", "268435456")?; // 256MB — zero-copy reads for single-tenant // Process-wide SQLite memory limit to prevent OOM under sustained load let _ = conn.pragma_update(None, "hard_heap_limit", "134217728"); // 128MB // Increase prepared statement cache for hot query paths (default is 16) diff --git a/src/server.rs b/src/server.rs index c8503a6..82366de 100644 --- a/src/server.rs +++ b/src/server.rs @@ -9,7 +9,6 @@ use axum::{ }; use dashmap::DashMap; use serde::Serialize; -use std::future::Future; use std::net::{IpAddr, SocketAddr}; use std::sync::atomic::Ordering; use std::sync::Arc; @@ -21,77 +20,17 @@ use crate::config; use crate::credentials::Credentials; use crate::tls::TlsMode; -/// Max restarts within the circuit breaker window before we abort the process. -const SUPERVISOR_MAX_RESTARTS: u32 = 5; -/// Circuit breaker window: if SUPERVISOR_MAX_RESTARTS occur within this duration, fail-fast. -const SUPERVISOR_WINDOW_SECS: u64 = 300; // 5 minutes - /// Timeout for DB operations in HTTP handlers. Must exceed SQLite busy_timeout (4s) /// so that SQLite returns BUSY cleanly before the task gets force-cancelled. const DB_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); -/// Spawn a supervised background task that restarts on panic/error with exponential backoff. -/// Circuit breaker: if 5 restarts occur within 5 minutes, enter max backoff (320s) instead -/// of killing the process — process::exit skips destructors, preventing Zeroizing from -/// wiping vault keys and the final WAL checkpoint from running. 
-fn spawn_supervised(name: &'static str, make_task: F) -where - F: Fn() -> Fut + Send + 'static, - Fut: Future + Send + 'static, -{ - tokio::spawn(async move { - let mut retries: u32 = 0; - let mut window_start = std::time::Instant::now(); - let mut window_restarts: u32 = 0; - let mut circuit_open = false; - loop { - let result = tokio::spawn(make_task()).await; - match result { - Ok(()) => break, // clean exit - Err(e) => { - let now = std::time::Instant::now(); - // Reset circuit breaker window if enough time has passed - if now.duration_since(window_start).as_secs() > SUPERVISOR_WINDOW_SECS { - window_start = now; - window_restarts = 0; - if circuit_open { - tracing::info!("{name}: circuit breaker reset after quiet period"); - circuit_open = false; - retries = 0; - } - } - window_restarts += 1; - if window_restarts >= SUPERVISOR_MAX_RESTARTS && !circuit_open { - tracing::error!( - "{name}: circuit breaker OPEN — {SUPERVISOR_MAX_RESTARTS} failures in {SUPERVISOR_WINDOW_SECS}s, entering max backoff" - ); - circuit_open = true; - } - - let delay_secs = if circuit_open { - 320 // Max backoff while circuit is open - } else { - 5_u64.saturating_mul(1u64 << retries.min(6)) // 5s, 10s, 20s, ..., 320s - }; - tracing::error!("{name} task panicked: {e}. Restarting in {delay_secs}s ({window_restarts}/{SUPERVISOR_MAX_RESTARTS} in window)..."); - tokio::time::sleep(std::time::Duration::from_secs(delay_secs)).await; - retries = retries.saturating_add(1); - } - } - } - }); -} - const RATE_LIMIT_WINDOW_SECS: u64 = 60; const RATE_LIMIT_MAX_REQUESTS: u32 = 10; const RATE_LIMIT_MAX_ENTRIES: usize = 10_000; -const MAGIC_LINK_GLOBAL_MAX_PER_SEC: u32 = 20; const MAX_INPUT_LEN: usize = 256; -/// Consecutive DB failures before the circuit breaker opens. -const DB_CIRCUIT_THRESHOLD: u32 = 5; -/// How long (seconds) the circuit stays open before allowing a probe request. 
-const DB_CIRCUIT_OPEN_SECS: u64 = 30; +/// Circuit breaker cool-down: DB operations rejected for this many seconds after last failure. +const DB_CIRCUIT_COOLDOWN_SECS: u64 = 60; fn epoch_secs() -> u64 { std::time::SystemTime::now() @@ -100,11 +39,12 @@ fn epoch_secs() -> u64 { .as_secs() } -/// Reject inputs with non-printable characters or excessive length. +/// Reject inputs with non-printable or non-ASCII characters. +/// `bytes().all()` is auto-vectorized on x86_64 (SSE/AVX) — no UTF-8 overhead. fn is_valid_input(s: &str) -> bool { - !s.is_empty() + s.len() > 0 && s.len() <= MAX_INPUT_LEN - && s.bytes().all(|b| b >= 0x20 && b != 0x7F) + && s.bytes().all(|b| b.is_ascii_graphic() || b == b' ') } pub struct AppState { @@ -123,14 +63,8 @@ pub struct AppState { rate_limiter: DashMap, /// In-flight request counter for graceful shutdown drain. in_flight: std::sync::atomic::AtomicUsize, - /// Global rate limit for magic link claims (epoch second of current window). - magic_link_window: std::sync::atomic::AtomicI64, - /// Count of magic link claims in the current 1-second window. - magic_link_count: std::sync::atomic::AtomicU32, - /// Circuit breaker: consecutive DB operation failure count. - db_fail_count: std::sync::atomic::AtomicU32, - /// Circuit breaker: epoch second when circuit opened (0 = closed). - db_circuit_opened_at: std::sync::atomic::AtomicU64, + /// Circuit breaker: epoch second of last DB failure. Circuit open if within cooldown. + last_db_failure: std::sync::atomic::AtomicU64, } impl AppState { @@ -167,52 +101,23 @@ impl AppState { } } - /// Record a DB operation failure. Opens the circuit after DB_CIRCUIT_THRESHOLD consecutive failures. - /// Uses Relaxed for the hot-path counter increment; Release on the state transition - /// so that `is_db_circuit_open` (Acquire) sees a consistent fail_count + opened_at pair. + /// Record a DB failure timestamp. The first request after the cooldown naturally tests the DB. 
fn record_db_failure(&self) { - let count = self.db_fail_count.fetch_add(1, Ordering::Relaxed) + 1; - if count >= DB_CIRCUIT_THRESHOLD { - let now = epoch_secs(); - let prev = self.db_circuit_opened_at.load(Ordering::Acquire); - if prev == 0 { - self.db_circuit_opened_at.store(now, Ordering::Release); - tracing::error!("DB circuit breaker OPEN after {count} consecutive failures"); - } - } + self.last_db_failure.store(epoch_secs(), Ordering::Relaxed); } - /// Record a successful DB operation. Resets the circuit breaker. + /// Record a successful DB operation. Clears the circuit breaker. fn record_db_success(&self) { - self.db_fail_count.store(0, Ordering::Release); - if self.db_circuit_opened_at.load(Ordering::Acquire) != 0 { - self.db_circuit_opened_at.store(0, Ordering::Release); - tracing::info!("DB circuit breaker closed after successful operation"); + if self.last_db_failure.load(Ordering::Relaxed) != 0 { + self.last_db_failure.store(0, Ordering::Relaxed); } } - /// Returns true if the circuit is open (DB operations should be rejected). + /// Circuit is open if last failure was within the cooldown window. + /// No half-open probe needed: the first request after cooldown naturally tests the DB. fn is_db_circuit_open(&self) -> bool { - let opened_at = self.db_circuit_opened_at.load(Ordering::Acquire); - if opened_at == 0 { - return false; - } - // Allow a probe request after the cool-down period (half-open state) - epoch_secs().saturating_sub(opened_at) < DB_CIRCUIT_OPEN_SECS - } - - /// Global rate limit for magic link claim attempts (across all IPs). - /// Prevents distributed brute-force even if each IP stays under per-IP limits. 
- fn check_magic_link_global_rate(&self) -> bool { - let now = chrono::Utc::now().timestamp(); - let window = self.magic_link_window.load(Ordering::Relaxed); - if window != now { - self.magic_link_window.store(now, Ordering::Relaxed); - self.magic_link_count.store(1, Ordering::Relaxed); - return true; - } - let count = self.magic_link_count.fetch_add(1, Ordering::Relaxed); - count < MAGIC_LINK_GLOBAL_MAX_PER_SEC + let last = self.last_db_failure.load(Ordering::Relaxed); + last != 0 && epoch_secs().saturating_sub(last) < DB_CIRCUIT_COOLDOWN_SECS } } @@ -317,133 +222,99 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { behind_proxy, rate_limiter: DashMap::with_capacity_and_hasher(256, rustc_hash::FxBuildHasher), in_flight: std::sync::atomic::AtomicUsize::new(0), - magic_link_window: std::sync::atomic::AtomicI64::new(0), - magic_link_count: std::sync::atomic::AtomicU32::new(0), - db_fail_count: std::sync::atomic::AtomicU32::new(0), - db_circuit_opened_at: std::sync::atomic::AtomicU64::new(0), + last_db_failure: std::sync::atomic::AtomicU64::new(0), }); // Background task: WAL checkpoint every 5 minutes (PASSIVE to avoid blocking writers; // the hourly cleanup task runs TRUNCATE to actually reclaim WAL disk space). let wal_state = state.clone(); - spawn_supervised("wal-checkpoint", move || { - let st = wal_state.clone(); - async move { - loop { - tokio::time::sleep(std::time::Duration::from_secs(300)).await; - let db_ref = st.clone(); - let _ = tokio::task::spawn_blocking(move || { - // Check WAL file size: escalate from PASSIVE to TRUNCATE if >40MB - // to prevent unbounded WAL growth under heavy reader load. 
- let wal_large = crate::config::atomic_dir() - .map(|d| d.join("atomic.db-wal")) - .ok() - .and_then(|p| std::fs::metadata(&p).ok()) - .map(|m| m.len() > 40 * 1024 * 1024) - .unwrap_or(false); - - match db_ref.db_pool.get() { - Ok(conn) => { - if wal_large { - tracing::warn!("WAL exceeds 40MB, forcing TRUNCATE checkpoint"); - if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { - tracing::warn!("WAL TRUNCATE checkpoint failed: {e}, falling back to RESTART"); - let _ = conn.execute_batch("PRAGMA wal_checkpoint(RESTART);"); - } - } else if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(PASSIVE);") { - tracing::warn!("WAL checkpoint failed: {e}"); + tokio::spawn(async move { + loop { + tokio::time::sleep(std::time::Duration::from_secs(300)).await; + let db_ref = wal_state.clone(); + let _ = tokio::task::spawn_blocking(move || { + let wal_large = crate::config::atomic_dir() + .map(|d| d.join("atomic.db-wal")) + .ok() + .and_then(|p| std::fs::metadata(&p).ok()) + .map(|m| m.len() > 40 * 1024 * 1024) + .unwrap_or(false); + + match db_ref.db_pool.get() { + Ok(conn) => { + if wal_large { + tracing::warn!("WAL exceeds 40MB, forcing TRUNCATE checkpoint"); + if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { + tracing::warn!("WAL TRUNCATE checkpoint failed: {e}, falling back to RESTART"); + let _ = conn.execute_batch("PRAGMA wal_checkpoint(RESTART);"); } + } else if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(PASSIVE);") { + tracing::warn!("WAL checkpoint failed: {e}"); } - Err(e) => tracing::warn!("WAL checkpoint: pool exhausted: {e}"), } - }).await; - } + Err(e) => tracing::warn!("WAL checkpoint: pool exhausted: {e}"), + } + }).await; } }); - // Background task: clean expired magic links, old deposit nonces, and rate limiter entries + // Background task: clean expired magic links, old deposit nonces hourly let cleanup_state = state.clone(); - spawn_supervised("db-cleanup", move || { - let st = 
cleanup_state.clone(); - async move { - loop { - tokio::time::sleep(std::time::Duration::from_secs(3600)).await; - let db_ref = st.clone(); - let _ = tokio::task::spawn_blocking(move || { - let conn = match db_ref.db_pool.get() { - Ok(c) => c, - Err(e) => { - tracing::warn!("DB cleanup: pool exhausted: {e}"); - return; - } - }; - let now = chrono::Utc::now().timestamp(); - // Paginated deletes: batch 1000 rows at a time to avoid holding - // the WAL write lock for extended periods under heavy load. - loop { - match conn.execute( - "DELETE FROM magic_links WHERE rowid IN \ - (SELECT rowid FROM magic_links WHERE expires_at <= ?1 LIMIT 1000)", - [now], - ) { - Ok(0) => break, - Ok(_) => continue, - Err(e) => { tracing::warn!("Failed to clean expired magic links: {e}"); break; } - } + tokio::spawn(async move { + loop { + tokio::time::sleep(std::time::Duration::from_secs(3600)).await; + let db_ref = cleanup_state.clone(); + let _ = tokio::task::spawn_blocking(move || { + let conn = match db_ref.db_pool.get() { + Ok(c) => c, + Err(e) => { + tracing::warn!("DB cleanup: pool exhausted: {e}"); + return; } - let cutoff = now - 7 * 86400; - loop { - match conn.execute( - "DELETE FROM used_deposits WHERE rowid IN \ - (SELECT rowid FROM used_deposits WHERE used_at < ?1 LIMIT 1000)", - [cutoff], - ) { - Ok(0) => break, - Ok(_) => continue, - Err(e) => { tracing::warn!("Failed to clean old deposit nonces: {e}"); break; } - } + }; + let now = chrono::Utc::now().timestamp(); + // Paginated deletes: batch 1000 rows at a time to avoid holding + // the WAL write lock for extended periods under heavy load. 
+ loop { + match conn.execute( + "DELETE FROM magic_links WHERE rowid IN \ + (SELECT rowid FROM magic_links WHERE expires_at <= ?1 LIMIT 1000)", + [now], + ) { + Ok(0) => break, + Ok(_) => continue, + Err(e) => { tracing::warn!("Failed to clean expired magic links: {e}"); break; } } - // Purge deposit log entries older than 90 days to prevent unbounded disk growth - let log_cutoff = now - 90 * 86400; - loop { - match conn.execute( - "DELETE FROM deposit_log WHERE rowid IN \ - (SELECT rowid FROM deposit_log WHERE deposited_at < ?1 LIMIT 1000)", - [log_cutoff], - ) { - Ok(0) => break, - Ok(_) => continue, - Err(e) => { tracing::warn!("Failed to clean old deposit log entries: {e}"); break; } - } + } + let cutoff = now - 7 * 86400; + loop { + match conn.execute( + "DELETE FROM used_deposits WHERE rowid IN \ + (SELECT rowid FROM used_deposits WHERE used_at < ?1 LIMIT 1000)", + [cutoff], + ) { + Ok(0) => break, + Ok(_) => continue, + Err(e) => { tracing::warn!("Failed to clean old deposit nonces: {e}"); break; } } - // TRUNCATE checkpoint hourly to reclaim WAL disk space - if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { - tracing::warn!("Hourly WAL TRUNCATE checkpoint failed: {e}"); + } + let log_cutoff = now - 90 * 86400; + loop { + match conn.execute( + "DELETE FROM deposit_log WHERE rowid IN \ + (SELECT rowid FROM deposit_log WHERE deposited_at < ?1 LIMIT 1000)", + [log_cutoff], + ) { + Ok(0) => break, + Ok(_) => continue, + Err(e) => { tracing::warn!("Failed to clean old deposit log entries: {e}"); break; } } - // Let SQLite update its query planner statistics - let _ = conn.execute_batch("PRAGMA optimize;"); - }).await; - } - } - }); - - // Background task: evict stale rate limiter entries every 5 minutes. - // More aggressive than hourly to bound DashMap memory under sustained attack. 
- let rl_state = state.clone(); - spawn_supervised("rate-limiter-evict", move || { - let st = rl_state.clone(); - async move { - loop { - tokio::time::sleep(std::time::Duration::from_secs(300)).await; - // Offload O(n) DashMap scan to the blocking pool to avoid stalling - // the async executor under 100k+ entries. - let st2 = st.clone(); - let _ = tokio::task::spawn_blocking(move || { - let cutoff = std::time::Duration::from_secs(RATE_LIMIT_WINDOW_SECS); - let now = std::time::Instant::now(); - st2.rate_limiter.retain(|_, (_, window_start)| now.duration_since(*window_start) <= cutoff); - }).await; - } + } + if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { + tracing::warn!("Hourly WAL TRUNCATE checkpoint failed: {e}"); + } + let _ = conn.execute_batch("PRAGMA optimize;"); + }).await; } }); @@ -717,14 +588,12 @@ async fn handle_magic_link( ConnectInfo(addr): ConnectInfo, Path(code): Path, ) -> Response { - // Per-IP rate limiting to prevent brute-force of magic link codes + // Per-IP rate limiting to prevent brute-force of magic link codes. + // For a single-tenant agent, per-IP is sufficient — distributed brute-force + // across thousands of IPs is not the threat model. if !state.check_rate_limit(addr.ip()) { return StatusCode::TOO_MANY_REQUESTS.into_response(); } - // Global rate limit: cap total claim attempts across all IPs - if !state.check_magic_link_global_rate() { - return StatusCode::TOO_MANY_REQUESTS.into_response(); - } // Reject obviously short codes or codes with non-printable chars before touching the DB if code.len() < 20 || !is_valid_input(&code) { diff --git a/src/tls.rs b/src/tls.rs index fd3f68a..37f93c5 100644 --- a/src/tls.rs +++ b/src/tls.rs @@ -204,109 +204,34 @@ fn acme_sh_path() -> Result { anyhow::bail!("acme.sh not found") } -// acme.sh sets up its own cron job for renewal. 
We watch the TLS directory for -// filesystem events (kqueue/inotify) to pick up renewals immediately, with a -// 12-hour polling fallback for network filesystems where events may not fire. +// acme.sh sets up its own cron job for renewal. Poll every 6 hours to pick up +// renewed certs. SIGHUP provides immediate reload when needed. +// No filesystem watcher (notify crate removed) — a cert changes every 60 days, +// so polling + SIGHUP covers it with zero extra threads or OS handles. pub fn spawn_renewal_watcher(rustls_config: RustlsConfig) { tokio::spawn(async move { - let tls_dir = match config::tls_dir() { - Ok(d) => d, - Err(e) => { - warn!("Cannot resolve TLS directory: {e}, falling back to polling"); - poll_renewal(rustls_config).await; - return; - } - }; - - // Try filesystem watcher first; fall back to polling on failure - match watch_cert_files(tls_dir.clone(), rustls_config.clone()).await { - Ok(()) => {} - Err(e) => { - warn!("Filesystem watcher failed: {e}, falling back to 12h polling"); - poll_renewal(rustls_config).await; - } - } - }); -} - -/// Watch the TLS directory for cert file changes using OS filesystem events. -/// Debounces events by 2 seconds to avoid reading partial writes from acme.sh. 
-async fn watch_cert_files( - tls_dir: std::path::PathBuf, - rustls_config: RustlsConfig, -) -> Result<()> { - use notify::{Config, RecursiveMode, Watcher}; - - let (tx, mut rx) = tokio::sync::mpsc::channel::<()>(16); - - let mut watcher = notify::RecommendedWatcher::new( - move |res: std::result::Result| { - if let Ok(event) = res { - if matches!( - event.kind, - notify::EventKind::Modify(_) | notify::EventKind::Create(_) - ) { - let _ = tx.try_send(()); - } - } - }, - Config::default(), - ) - .context("Failed to create filesystem watcher")?; - - watcher - .watch(&tls_dir, RecursiveMode::NonRecursive) - .context("Failed to watch TLS directory")?; - - info!("Watching {} for TLS cert changes", tls_dir.display()); - - loop { - // Wait for first filesystem event - rx.recv().await; - // Debounce: drain pending events for 2 seconds to avoid reading partial writes + let check_interval = std::time::Duration::from_secs(6 * 3600); loop { - match tokio::time::timeout(std::time::Duration::from_secs(2), rx.recv()).await { - Ok(Some(())) => continue, - _ => break, - } - } + tokio::time::sleep(check_interval).await; - let cert_path = tls_dir.join("fullchain.pem"); - let key_path = tls_dir.join("key.pem"); - - match rustls_config - .reload_from_pem_file(&cert_path, &key_path) - .await - { - Ok(()) => info!("TLS cert reloaded (filesystem change detected)"), - Err(e) => warn!("TLS cert reload failed: {e}"), - } - } -} + let tls_dir = match config::tls_dir() { + Ok(d) => d, + Err(e) => { + warn!("Cert reload check failed: {e}"); + continue; + } + }; -/// Fallback: poll every 12 hours for cert changes (for network filesystems). 
-async fn poll_renewal(rustls_config: RustlsConfig) { - let check_interval = std::time::Duration::from_secs(12 * 3600); - loop { - tokio::time::sleep(check_interval).await; + let cert_path = tls_dir.join("fullchain.pem"); + let key_path = tls_dir.join("key.pem"); - let tls_dir = match config::tls_dir() { - Ok(d) => d, - Err(e) => { - warn!("Cert reload check failed: {e}"); - continue; + match rustls_config + .reload_from_pem_file(&cert_path, &key_path) + .await + { + Ok(()) => info!("TLS cert reloaded (6h poll)"), + Err(e) => warn!("TLS cert reload failed: {e}"), } - }; - - let cert_path = tls_dir.join("fullchain.pem"); - let key_path = tls_dir.join("key.pem"); - - match rustls_config - .reload_from_pem_file(&cert_path, &key_path) - .await - { - Ok(()) => info!("TLS cert reloaded"), - Err(e) => warn!("TLS cert reload failed: {e}"), } - } + }); } From 2c50e91fce069a95c5a683566c2b2513f16dc1fb Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 11:29:28 +0530 Subject: [PATCH 33/49] Update README with changelog for ae91eaf --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index cd23607..ebf9498 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,16 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi ## Changelog +**ae91eaf** — Remove notify crate, kill spawn_supervised, single-atomic circuit breaker, drop global magic link rate limit, branchless input validation, 256MB mmap +- Remove `notify` crate: cert renewal watcher replaced with 6-hour polling + SIGHUP for immediate reload (zero extra threads for a cert that changes every 60 days) +- Remove `spawn_supervised` restart machinery: background tasks use plain `tokio::spawn` — single-tenant agents should surface panics, not mask them with infinite retries +- Simplify DB circuit breaker: single `AtomicU64` (last failure timestamp) replaces fail counter + opened_at pair. 
Circuit open = within 60s of last failure +- Remove global magic link rate limit (`magic_link_window`, `magic_link_count` atomics): per-IP is sufficient for single-tenant, eliminates cross-core cache line bouncing +- Remove background rate limiter eviction task: lazy eviction on DashMap overflow is sufficient +- Input validation tightened: `is_ascii_graphic() || b == b' '` rejects non-ASCII bytes (0x80+) that previously slipped through +- SQLite `mmap_size` increased from 64MB to 256MB for zero-copy reads on single-tenant data +- Net result: −358 lines, 1 fewer crate, 2 fewer spawned tasks, 3 fewer atomics in AppState + **e3eb87e** — Hard conn lifetime, WAL checkpoint timeout, vault label validation, jemalloc default - `PooledConn::drop` force-closes connections held >60s instead of returning to pool (prevents leaks from panicked threads or stuck queries) - Final WAL checkpoint wrapped in 5s `tokio::time::timeout` to prevent indefinite hang on shutdown if DB is stuck From 1c4ad26c0196423c9d4d010b219b8e06e0b77d76 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 11:42:20 +0530 Subject: [PATCH 34/49] =?UTF-8?q?=CE=BB-RLM=20iter=2022:=20kill=20auto-TLS?= =?UTF-8?q?/acme.sh,=20sharded-mutex=20rate=20limiter,=20Condvar=20pool,?= =?UTF-8?q?=20drop=20parse=5Fduration,=20strip=20magic=20link=20hints,=20c?= =?UTF-8?q?onst-table=20input=20validation,=20remove=20libc/dashmap/rustc-?= =?UTF-8?q?hash?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove Auto-TLS entirely: delete acme.sh shell execution, issue_cert, ensure_acme_sh, spawn_renewal_watcher. TLS now requires BYO cert or --no-tls. Eliminates shell injection risk and ~200 lines of external process management. - Replace DashMap with 8-shard Mutex rate limiter: removes dashmap + rustc-hash dependencies. Zero contention for <10k entries, hourly stale cleanup. - Replace mpsc::sync_channel pool with Mutex + Condvar: simpler, faster for pool sizes 2-8. 
Same RAII guard, poison detection, panic-safe return. - Delete parse_duration(): CLI --expires now accepts u64 seconds directly. Removes suffix parsing logic, overflow checks, and 6 tests worth of code bloat. - Strip magic link hint column: no longer store or display 2-char code prefix. Schema migration drops table on upgrade (magic links are short-lived). - Const lookup table for ASCII validation: branchless, cache-friendly, vectorizable. - Remove debug logging from verify_signature: silent failure prevents info leakage. - Remove libc RLIMIT_NOFILE: fd limits are sysadmin responsibility. Drops the only unsafe-adjacent code and the libc dependency. - init.rs now requires explicit --tls-cert/--tls-key or --no-tls. - SeqCst ordering on shutdown drain loop for correctness. - Hourly cleanup now also evicts stale rate limiter entries. - Net: -479 lines, -3 dependencies, 65 tests passing. --- Cargo.lock | 66 -------------- Cargo.toml | 3 - src/cli.rs | 12 +-- src/db.rs | 91 +++++++++++--------- src/deposit.rs | 62 +------------- src/init.rs | 4 + src/magic_link.rs | 36 ++++---- src/main.rs | 4 +- src/server.rs | 169 ++++++++++++++++++++---------------- src/tls.rs | 214 ++-------------------------------------------- 10 files changed, 182 insertions(+), 479 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0e10358..f337851 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -133,17 +133,14 @@ dependencies = [ "bytes", "chrono", "clap", - "dashmap", "dirs", "ed25519-dalek", "fs2", "hex", "hkdf", - "libc", "rand 0.8.5", "reqwest", "rusqlite", - "rustc-hash", "serde", "serde_json", "sha2", @@ -429,12 +426,6 @@ dependencies = [ "libc", ] -[[package]] -name = "crossbeam-utils" -version = "0.8.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" - [[package]] name = "crypto-common" version = "0.1.7" @@ -482,20 +473,6 @@ dependencies = [ "syn", ] -[[package]] -name = "dashmap" -version = 
"6.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" -dependencies = [ - "cfg-if", - "crossbeam-utils", - "hashbrown 0.14.5", - "lock_api", - "once_cell", - "parking_lot_core", -] - [[package]] name = "der" version = "0.7.10" @@ -778,12 +755,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "hashbrown" -version = "0.14.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" - [[package]] name = "hashbrown" version = "0.15.5" @@ -1177,15 +1148,6 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" -[[package]] -name = "lock_api" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" -dependencies = [ - "scopeguard", -] - [[package]] name = "log" version = "0.4.29" @@ -1278,19 +1240,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" -[[package]] -name = "parking_lot_core" -version = "0.9.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-link", -] - [[package]] name = "paste" version = "1.0.15" @@ -1499,15 +1448,6 @@ dependencies = [ "getrandom 0.3.4", ] -[[package]] -name = "redox_syscall" -version = "0.5.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" -dependencies = [ - "bitflags", -] - [[package]] name = "redox_users" version = "0.5.2" @@ -1666,12 +1606,6 @@ 
version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - [[package]] name = "semver" version = "1.0.27" diff --git a/Cargo.toml b/Cargo.toml index f19e9ca..58f7ad8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,13 +66,10 @@ rusqlite = { version = "0.34", features = ["bundled"] } bytes = "1" dirs = "6" zeroize = { version = "1.8.2", features = ["derive"] } -dashmap = "6" fs2 = "0.4" tikv-jemallocator = { version = "0.6", optional = true } tikv-jemalloc-ctl = { version = "0.6", optional = true } -libc = "0.2" subtle = "2" -rustc-hash = "2" [profile.release] opt-level = 3 diff --git a/src/cli.rs b/src/cli.rs index d7f086d..50a6934 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -64,9 +64,9 @@ pub enum Command { #[arg(long)] label: String, - /// Expiry duration (e.g., 10m, 1h) - #[arg(long, default_value = "10m")] - expires: String, + /// Expiry in seconds (max 86400) + #[arg(long, default_value = "600")] + expires: u64, }, /// Show deposit audit log @@ -141,9 +141,9 @@ pub enum MagicLinkCommand { Host { /// The verification code to host code: String, - /// How long to host it (e.g., 5m, 10m) - #[arg(long, default_value = "5m")] - expires: String, + /// How long to host it in seconds (max 3600) + #[arg(long, default_value = "300")] + expires: u64, }, /// List active magic links List, diff --git a/src/db.rs b/src/db.rs index 4a38133..b5aa051 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,7 +1,7 @@ use anyhow::{Context, Result}; use rusqlite::Connection; use std::path::{Path, PathBuf}; -use std::sync::mpsc; +use std::sync::{Condvar, Mutex}; use std::time::{Duration, Instant}; use crate::config; @@ -11,12 +11,12 @@ use crate::config; const CONN_MAX_LIFETIME: Duration = 
Duration::from_secs(1800); // 30 minutes /// Zero-dependency connection pool for SQLite. -/// Uses a bounded sync_channel to distribute pre-opened connections. +/// Uses Mutex + Condvar — minimal overhead for small pool sizes (2-8). /// WAL mode allows concurrent readers; the pool prevents serialization /// behind a single Mutex. pub struct DbPool { - sender: mpsc::SyncSender<(Connection, Instant)>, - receiver: std::sync::Mutex>, + conns: Mutex>, + available: Condvar, db_path: PathBuf, } @@ -58,12 +58,12 @@ impl Drop for PooledConn<'_> { tracing::warn!("Returning connection with active transaction, rolling back"); let _ = c.execute_batch("ROLLBACK"); } - // Panic-safe pool return: catch any panic during channel send to prevent + // Panic-safe pool return: catch any panic during mutex lock to prevent // double-panic abort (which would skip remaining destructors and Zeroizing). - let sender = &self.pool.sender; - let created = self.created_at; let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { - let _ = sender.try_send((c, created)); + let mut conns = self.pool.conns.lock().unwrap_or_else(|e| e.into_inner()); + conns.push((c, self.created_at)); + self.pool.available.notify_one(); })); if result.is_err() { tracing::error!("Panic while returning connection to pool (connection leaked)"); @@ -77,29 +77,40 @@ impl DbPool { /// Connections older than 30 minutes are recycled to reset SQLite's /// internal allocator and prevent page cache fragmentation. 
pub fn get(&self) -> Result> { - let rx = self.receiver.lock().unwrap_or_else(|e| e.into_inner()); - let (conn, created_at) = rx - .recv_timeout(Duration::from_secs(5)) - .map_err(|_| anyhow::anyhow!("DB pool exhausted (5s timeout)"))?; - - // Recycle stale connections to reset SQLite's internal allocator - if created_at.elapsed() > CONN_MAX_LIFETIME { - drop(conn); - let fresh = open_connection(&self.db_path)?; - return Ok(PooledConn { - pool: self, - conn: Some(fresh), - created_at: Instant::now(), - acquired_at: Instant::now(), - }); + let deadline = Instant::now() + Duration::from_secs(5); + let mut conns = self.conns.lock().unwrap_or_else(|e| e.into_inner()); + loop { + if let Some((conn, created_at)) = conns.pop() { + if created_at.elapsed() > CONN_MAX_LIFETIME { + drop(conn); + drop(conns); // release lock during open_connection + let fresh = open_connection(&self.db_path)?; + return Ok(PooledConn { + pool: self, + conn: Some(fresh), + created_at: Instant::now(), + acquired_at: Instant::now(), + }); + } + return Ok(PooledConn { + pool: self, + conn: Some(conn), + created_at, + acquired_at: Instant::now(), + }); + } + let remaining = deadline.saturating_duration_since(Instant::now()); + if remaining.is_zero() { + anyhow::bail!("DB pool exhausted (5s timeout)"); + } + let (guard, result) = self.available + .wait_timeout(conns, remaining) + .unwrap_or_else(|e| e.into_inner()); + conns = guard; + if result.timed_out() && conns.is_empty() { + anyhow::bail!("DB pool exhausted (5s timeout)"); + } } - - Ok(PooledConn { - pool: self, - conn: Some(conn), - created_at, - acquired_at: Instant::now(), - }) } } @@ -150,16 +161,16 @@ pub fn open_pool(size: usize) -> Result { migrate(&first)?; let now = Instant::now(); - let (tx, rx) = mpsc::sync_channel(size); - tx.send((first, now)).expect("channel just created"); + let mut conns = Vec::with_capacity(size); + conns.push((first, now)); for _ in 1..size { - tx.send((open_connection(&db_path)?, now)).expect("channel just 
created"); + conns.push((open_connection(&db_path)?, now)); } Ok(DbPool { - sender: tx, - receiver: std::sync::Mutex::new(rx), + conns: Mutex::new(conns), + available: Condvar::new(), db_path, }) } @@ -174,12 +185,11 @@ pub fn open() -> Result { } fn migrate(conn: &Connection) -> Result<()> { - // Migrate magic_links from old schema (plaintext `code`) to new (hashed `code_hash`). + // Migrate magic_links from old schemas (plaintext `code` or `hint` column). // Magic links are short-lived, so dropping the table is safe. - let has_old_schema = conn - .prepare("SELECT code FROM magic_links LIMIT 0") - .is_ok(); - if has_old_schema { + let needs_recreate = conn.prepare("SELECT code FROM magic_links LIMIT 0").is_ok() + || conn.prepare("SELECT hint FROM magic_links LIMIT 0").is_ok(); + if needs_recreate { conn.execute_batch("DROP TABLE magic_links;") .context("Failed to migrate magic_links table")?; } @@ -187,7 +197,6 @@ fn migrate(conn: &Connection) -> Result<()> { conn.execute_batch( "CREATE TABLE IF NOT EXISTS magic_links ( code_hash TEXT PRIMARY KEY, - hint TEXT NOT NULL DEFAULT '', expires_at INTEGER NOT NULL ); diff --git a/src/deposit.rs b/src/deposit.rs index 40e4ba9..a48be36 100644 --- a/src/deposit.rs +++ b/src/deposit.rs @@ -42,17 +42,12 @@ pub fn create_signed_token( } /// Verify signature and expiry only (no DB access). Returns the payload if valid. +/// Silent on failure — no logging to prevent timing attacks and information leakage. pub fn verify_signature( token: &str, verifying_key: &ed25519_dalek::VerifyingKey, ) -> Option { - match try_verify_signature(token, verifying_key) { - Ok(p) => Some(p), - Err(e) => { - tracing::debug!("Deposit verify failed: {}", e); - None - } - } + try_verify_signature(token, verifying_key).ok() } fn try_verify_signature( @@ -170,52 +165,11 @@ fn generate_nonce() -> String { hex::encode(bytes) } -// Accepts "10m", "1h", "30s". 
-pub fn parse_duration(s: &str) -> anyhow::Result { - let s = s.trim(); - if s.is_empty() { - anyhow::bail!("Empty duration string"); - } - - let (num_str, multiplier) = if let Some(n) = s.strip_suffix('m') { - (n, 60) - } else if let Some(n) = s.strip_suffix('h') { - (n, 3600) - } else if let Some(n) = s.strip_suffix('s') { - (n, 1) - } else { - anyhow::bail!("Duration must end with 's' (seconds), 'm' (minutes), or 'h' (hours)"); - }; - - let num: u64 = num_str - .parse() - .map_err(|_| anyhow::anyhow!("Invalid number in duration: '{num_str}'"))?; - - Ok(Duration::from_secs( - num.checked_mul(multiplier) - .ok_or_else(|| anyhow::anyhow!("Duration too large"))?, - )) -} - #[cfg(test)] mod tests { use super::*; use crate::crypto::signing as s; - #[test] - fn parse_duration_values() { - assert_eq!(parse_duration("10m").unwrap(), Duration::from_secs(600)); - assert_eq!(parse_duration("1h").unwrap(), Duration::from_secs(3600)); - assert_eq!(parse_duration("30s").unwrap(), Duration::from_secs(30)); - } - - #[test] - fn parse_duration_rejects_garbage() { - assert!(parse_duration("").is_err()); - assert!(parse_duration("10x").is_err()); - assert!(parse_duration("abc").is_err()); - } - #[test] fn signed_token_roundtrip() { let (sk, _vk) = s::generate_keypair(); @@ -293,18 +247,6 @@ mod tests { assert!(payload.expires_at > now + 23 * 3600); // but at least ~23h } - #[test] - fn parse_duration_zero_values() { - assert_eq!(parse_duration("0s").unwrap(), Duration::from_secs(0)); - assert_eq!(parse_duration("0m").unwrap(), Duration::from_secs(0)); - assert_eq!(parse_duration("0h").unwrap(), Duration::from_secs(0)); - } - - #[test] - fn parse_duration_trims_whitespace() { - assert_eq!(parse_duration(" 10m ").unwrap(), Duration::from_secs(600)); - } - fn test_db() -> rusqlite::Connection { let conn = rusqlite::Connection::open_in_memory().unwrap(); conn.execute_batch( diff --git a/src/init.rs b/src/init.rs index 2454f49..494a0da 100644 --- a/src/init.rs +++ b/src/init.rs @@ -42,6 
+42,10 @@ pub fn run( (None, Some(_)) => bail!("--tls-key requires --tls-cert"), _ => {} } + // Require explicit TLS config: either BYO certs or --no-tls + if !no_tls && tls_cert.is_none() { + bail!("TLS requires --tls-cert and --tls-key. Use --no-tls for plain HTTP (e.g., behind a reverse proxy)."); + } if let Some(ref cert_path) = tls_cert { if !std::path::Path::new(cert_path).exists() { bail!("TLS certificate file not found: {cert_path}"); diff --git a/src/magic_link.rs b/src/magic_link.rs index 4371d64..b62056d 100644 --- a/src/magic_link.rs +++ b/src/magic_link.rs @@ -3,37 +3,34 @@ use sha2::{Digest, Sha256}; use crate::config; use crate::db; -use crate::deposit::parse_duration; fn hash_code(code: &str) -> String { let hash = Sha256::digest(code.as_bytes()); hex::encode(hash) } -pub fn host(code: &str, expires: &str) -> Result<()> { +pub fn host(code: &str, expires_secs: u64) -> Result<()> { if code.len() < 20 { anyhow::bail!("Code must be at least 20 characters (got {}). Use a longer code to prevent brute-force.", code.len()); } - let duration = parse_duration(expires)?; let max_secs: u64 = 3600; // 1 hour max for magic links - let capped_secs = duration.as_secs().min(max_secs); + let capped_secs = expires_secs.min(max_secs); let expires_at = chrono::Utc::now().timestamp() + i64::try_from(capped_secs).map_err(|_| anyhow::anyhow!("Duration too large"))?; let code_hash = hash_code(code); - let hint = &code[..2.min(code.len())]; let conn = db::open()?; conn.execute( - "INSERT OR REPLACE INTO magic_links (code_hash, hint, expires_at) VALUES (?1, ?2, ?3)", - rusqlite::params![code_hash, hint, expires_at], + "INSERT OR REPLACE INTO magic_links (code_hash, expires_at) VALUES (?1, ?2)", + rusqlite::params![code_hash, expires_at], )?; let creds = crate::credentials::Credentials::load(&config::credentials_path()?)?; let url = format!("{}/m/{}", creds.base_url(), code); println!("{url}"); - println!("Expires in {expires}. 
Service can verify at that URL."); + println!("Expires in {capped_secs}s. Service can verify at that URL."); Ok(()) } @@ -44,22 +41,20 @@ pub fn list() -> Result<()> { // clean expired conn.execute("DELETE FROM magic_links WHERE expires_at <= ?1", [now])?; - let mut stmt = conn.prepare("SELECT hint, expires_at FROM magic_links ORDER BY expires_at")?; - let rows = stmt.query_map([], |row| { - Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?)) - })?; + let mut stmt = conn.prepare("SELECT expires_at FROM magic_links ORDER BY expires_at")?; + let rows = stmt.query_map([], |row| row.get::<_, i64>(0))?; - let mut found = false; + let mut idx = 0u32; for row in rows { - let (hint, expires_at) = row?; + let expires_at = row?; let remaining = expires_at - now; let mins = remaining / 60; let secs = remaining % 60; - println!("{hint}**** (expires in {mins}m {secs}s)"); - found = true; + idx += 1; + println!(" #{idx} expires in {mins}m {secs}s"); } - if !found { + if idx == 0 { println!("No active magic links."); } Ok(()) @@ -115,17 +110,16 @@ mod tests { fn test_db() -> rusqlite::Connection { let conn = rusqlite::Connection::open_in_memory().unwrap(); conn.execute_batch( - "CREATE TABLE magic_links (code_hash TEXT PRIMARY KEY, hint TEXT NOT NULL DEFAULT '', expires_at INTEGER NOT NULL);" + "CREATE TABLE magic_links (code_hash TEXT PRIMARY KEY, expires_at INTEGER NOT NULL);" ).unwrap(); conn } fn insert_code(conn: &rusqlite::Connection, code: &str, expires_at: i64) { let code_hash = hash_code(code); - let hint = &code[..2.min(code.len())]; conn.execute( - "INSERT INTO magic_links (code_hash, hint, expires_at) VALUES (?1, ?2, ?3)", - rusqlite::params![code_hash, hint, expires_at], + "INSERT INTO magic_links (code_hash, expires_at) VALUES (?1, ?2)", + rusqlite::params![code_hash, expires_at], ).unwrap(); } diff --git a/src/main.rs b/src/main.rs index 4886ab9..d253ad1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -179,7 +179,7 @@ async fn main() -> Result<()> { let creds_path 
= config::credentials_path()?; let creds = credentials::Credentials::load(&creds_path)?; let signing_key = creds.signing_key()?; - let duration = deposit::parse_duration(&expires)?; + let duration = std::time::Duration::from_secs(expires); let token = deposit::create_signed_token(&label, duration, &signing_key)?; let url = format!("{}/d/{}", creds.base_url(), token); println!("{url}"); @@ -218,7 +218,7 @@ async fn main() -> Result<()> { Command::MagicLink { command } => match command { MagicLinkCommand::Host { code, expires } => { - magic_link::host(&code, &expires)?; + magic_link::host(&code, expires)?; } MagicLinkCommand::List => { magic_link::list()?; diff --git a/src/server.rs b/src/server.rs index 82366de..310ba7c 100644 --- a/src/server.rs +++ b/src/server.rs @@ -7,8 +7,8 @@ use axum::{ routing::{get, post}, Router, }; -use dashmap::DashMap; use serde::Serialize; +use std::collections::HashMap; use std::net::{IpAddr, SocketAddr}; use std::sync::atomic::Ordering; use std::sync::Arc; @@ -32,6 +32,8 @@ const MAX_INPUT_LEN: usize = 256; /// Circuit breaker cool-down: DB operations rejected for this many seconds after last failure. const DB_CIRCUIT_COOLDOWN_SECS: u64 = 60; +const RATE_SHARDS: usize = 8; + fn epoch_secs() -> u64 { std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) @@ -39,12 +41,89 @@ fn epoch_secs() -> u64 { .as_secs() } +/// Const lookup table for printable ASCII validation. +/// No branches per byte, cache-friendly, auto-vectorizes on x86_64. +const ASCII_OK: [bool; 256] = { + let mut table = [false; 256]; + let mut i: usize = 32; // space + while i <= 126 { + table[i] = true; + i += 1; + } + table +}; + /// Reject inputs with non-printable or non-ASCII characters. -/// `bytes().all()` is auto-vectorized on x86_64 (SSE/AVX) — no UTF-8 overhead. 
fn is_valid_input(s: &str) -> bool { s.len() > 0 && s.len() <= MAX_INPUT_LEN - && s.bytes().all(|b| b.is_ascii_graphic() || b == b' ') + && s.bytes().all(|b| ASCII_OK[b as usize]) +} + +/// Sharded mutex rate limiter — replaces DashMap for minimal overhead at <10k entries. +/// 8 shards eliminate contention without the DashMap dependency tree. +struct RateLimiter { + shards: [std::sync::Mutex>; RATE_SHARDS], +} + +impl RateLimiter { + fn new() -> Self { + Self { + shards: std::array::from_fn(|_| std::sync::Mutex::new(HashMap::new())), + } + } + + fn shard_index(ip: &IpAddr) -> usize { + let h = match ip { + IpAddr::V4(v4) => { + let o = v4.octets(); + (o[0] as usize).wrapping_mul(31) ^ (o[1] as usize).wrapping_mul(17) + ^ (o[2] as usize).wrapping_mul(7) ^ (o[3] as usize) + } + IpAddr::V6(v6) => { + v6.octets().iter().fold(0usize, |acc, &b| acc.wrapping_mul(31) ^ b as usize) + } + }; + h & (RATE_SHARDS - 1) + } + + fn check(&self, ip: IpAddr) -> bool { + let now = std::time::Instant::now(); + let window = std::time::Duration::from_secs(RATE_LIMIT_WINDOW_SECS); + let mut shard = self.shards[Self::shard_index(&ip)] + .lock() + .unwrap_or_else(|e| e.into_inner()); + // Hard cap per shard to prevent unbounded growth + if shard.len() >= RATE_LIMIT_MAX_ENTRIES / RATE_SHARDS { + let stale = shard.iter() + .find(|(_, (_, ts))| now.duration_since(*ts) > window) + .map(|(k, _)| *k); + match stale { + Some(key) => { shard.remove(&key); } + None => return false, + } + } + let entry = shard.entry(ip).or_insert((0, now)); + if now.duration_since(entry.1) > window { + *entry = (1, now); + true + } else if entry.0 >= RATE_LIMIT_MAX_REQUESTS { + false + } else { + entry.0 += 1; + true + } + } + + /// Evict stale entries from all shards. Called by hourly cleanup. 
+ fn clean_stale(&self) { + let now = std::time::Instant::now(); + let window = std::time::Duration::from_secs(RATE_LIMIT_WINDOW_SECS); + for shard in &self.shards { + let mut map = shard.lock().unwrap_or_else(|e| e.into_inner()); + map.retain(|_, (_, ts)| now.duration_since(*ts) <= window); + } + } } pub struct AppState { @@ -56,11 +135,8 @@ pub struct AppState { pub db_pool: crate::db::DbPool, pub tls_active: bool, pub behind_proxy: bool, - /// Sharded concurrent map — no global mutex contention under high concurrency. - /// Uses monotonic Instant (not wall clock) to prevent clock-skew manipulation. - /// FxHasher: IP keys are not attacker-hashed, so collision resistance is unnecessary; - /// ~2-3x faster than SipHash on rate-limit hot paths. - rate_limiter: DashMap, + /// Sharded mutex rate limiter — monotonic Instant prevents clock-skew attacks. + rate_limiter: RateLimiter, /// In-flight request counter for graceful shutdown drain. in_flight: std::sync::atomic::AtomicUsize, /// Circuit breaker: epoch second of last DB failure. Circuit open if within cooldown. @@ -73,32 +149,8 @@ impl AppState { } /// Returns true if the request is within rate limits. - /// Uses DashMap (sharded locks) so concurrent requests don't serialize on a global mutex. - /// Monotonic Instant prevents clock-skew attacks from resetting windows. pub fn check_rate_limit(&self, ip: IpAddr) -> bool { - let now = std::time::Instant::now(); - let window = std::time::Duration::from_secs(RATE_LIMIT_WINDOW_SECS); - // Hard cap: if full, try to evict one expired entry before rejecting. - // Prevents attackers from permanently blocking legitimate IPs by filling the map. 
- if self.rate_limiter.len() >= RATE_LIMIT_MAX_ENTRIES { - let stale_key = self.rate_limiter.iter() - .find(|entry| now.duration_since(entry.value().1) > window) - .map(|entry| *entry.key()); - match stale_key { - Some(key) => { self.rate_limiter.remove(&key); } - None => return false, - } - } - let mut entry = self.rate_limiter.entry(ip).or_insert((0, now)); - if now.duration_since(entry.1) > window { - *entry = (1, now); - true - } else if entry.0 >= RATE_LIMIT_MAX_REQUESTS { - false - } else { - entry.0 += 1; - true - } + self.rate_limiter.check(ip) } /// Record a DB failure timestamp. The first request after the cooldown naturally tests the DB. @@ -139,35 +191,7 @@ const MAX_BODY_SIZE: usize = 64 * 1024; pub async fn run_server(credentials: Credentials) -> Result<()> { // --- Startup checks --- - // Enforce file descriptor limit: raise soft limit toward 65535 if below 4096. - // Without enough fds, a connection spike causes "too many open files" crashes. - #[cfg(unix)] - { - let mut rlim = libc::rlimit { rlim_cur: 0, rlim_max: 0 }; - if unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlim) } == 0 { - let current = rlim.rlim_cur; - if current < 4096 { - let target = rlim.rlim_max.min(65535); - if target >= 4096 { - rlim.rlim_cur = target; - if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim) } == 0 { - tracing::info!("Raised RLIMIT_NOFILE soft limit: {current} → {target}"); - } else { - tracing::warn!( - "Failed to raise RLIMIT_NOFILE from {current} to {target}" - ); - } - } else { - anyhow::bail!( - "RLIMIT_NOFILE hard limit is {} (need ≥4096). 
Raise with: ulimit -n 4096", - rlim.rlim_max - ); - } - } - } - } - - // Fix 8: Warn if log file is getting large (risk of disk-full on vault writes) + // Warn if log file is getting large (risk of disk-full on vault writes) if let Ok(log_path) = config::log_path() { if let Ok(metadata) = std::fs::metadata(&log_path) { let size_mb = metadata.len() / (1024 * 1024); @@ -207,7 +231,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { let db_pool = crate::db::open_pool(pool_size)?; let addr = SocketAddr::from(([0, 0, 0, 0], credentials.port)); - let tls_mode = TlsMode::from_credentials(&credentials); + let tls_mode = TlsMode::from_credentials(&credentials)?; let tls_active = !matches!(tls_mode, TlsMode::None); @@ -220,7 +244,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { db_pool, tls_active, behind_proxy, - rate_limiter: DashMap::with_capacity_and_hasher(256, rustc_hash::FxBuildHasher), + rate_limiter: RateLimiter::new(), in_flight: std::sync::atomic::AtomicUsize::new(0), last_db_failure: std::sync::atomic::AtomicU64::new(0), }); @@ -258,7 +282,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { } }); - // Background task: clean expired magic links, old deposit nonces hourly + // Background task: clean expired magic links, old deposit nonces, stale rate limiter entries hourly let cleanup_state = state.clone(); tokio::spawn(async move { loop { @@ -315,6 +339,8 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { } let _ = conn.execute_batch("PRAGMA optimize;"); }).await; + // Clean stale rate limiter entries (non-blocking, quick lock per shard) + cleanup_state.rate_limiter.clean_stale(); } }); @@ -368,13 +394,9 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { .await .context("Server error")?; } - TlsMode::Auto { .. } | TlsMode::Custom { .. } => { - let is_auto = matches!(tls_mode, TlsMode::Auto { .. }); + TlsMode::Custom { .. 
} => { let rustls_config = crate::tls::resolve_rustls_config(&tls_mode).await?; - if is_auto { - crate::tls::spawn_renewal_watcher(rustls_config.clone()); - } - // Reload TLS cert on SIGHUP (works for both auto and custom certs) + // Reload TLS cert on SIGHUP #[cfg(unix)] { let sighup_config = rustls_config.clone(); @@ -406,8 +428,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { shutdown_signal().await; shutdown_handle.graceful_shutdown(Some(std::time::Duration::from_secs(30))); }); - let mode_label = if is_auto { "acme.sh" } else { "custom cert" }; - info!("Listening on {} (HTTPS/{}, PID {})", addr, mode_label, std::process::id()); + info!("Listening on {} (HTTPS, PID {})", addr, std::process::id()); axum_server::bind_rustls(addr, rustls_config) .handle(handle) .serve(app.into_make_service_with_connect_info::()) @@ -422,7 +443,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { let drain_start = std::time::Instant::now(); let drain_timeout = std::time::Duration::from_secs(30); loop { - let remaining = shutdown_state.in_flight.load(Ordering::Relaxed); + let remaining = shutdown_state.in_flight.load(Ordering::SeqCst); if remaining == 0 { break; } diff --git a/src/tls.rs b/src/tls.rs index 37f93c5..4032478 100644 --- a/src/tls.rs +++ b/src/tls.rs @@ -1,30 +1,25 @@ use anyhow::{Context, Result}; use axum_server::tls_rustls::RustlsConfig; -use std::path::Path; -use std::process::Command; -use tracing::{info, warn}; - -use crate::config; +use tracing::info; pub enum TlsMode { - Auto { domain: String }, Custom { cert_path: String, key_path: String }, None, } impl TlsMode { - pub fn from_credentials(creds: &crate::credentials::Credentials) -> Self { + pub fn from_credentials(creds: &crate::credentials::Credentials) -> Result { if creds.no_tls { - TlsMode::None + Ok(TlsMode::None) } else if let (Some(cert), Some(key)) = (&creds.tls_cert, &creds.tls_key) { - TlsMode::Custom { + Ok(TlsMode::Custom { cert_path: cert.clone(), key_path: 
key.clone(), - } + }) } else { - TlsMode::Auto { - domain: creds.domain.clone(), - } + anyhow::bail!( + "TLS requires --tls-cert and --tls-key, or use --no-tls for plain HTTP" + ) } } } @@ -37,201 +32,8 @@ pub async fn resolve_rustls_config(mode: &TlsMode) -> Result { .await .context("Failed to load TLS cert/key") } - TlsMode::Auto { domain } => { - let tls_dir = config::tls_dir()?; - std::fs::create_dir_all(&tls_dir)?; - let cert_path = tls_dir.join("fullchain.pem"); - let key_path = tls_dir.join("key.pem"); - - if !cert_path.exists() || !key_path.exists() { - issue_cert(domain, &tls_dir)?; - } - - RustlsConfig::from_pem_file(&cert_path, &key_path) - .await - .context("Failed to load TLS cert") - } TlsMode::None => { unreachable!("resolve_rustls_config called with TlsMode::None") } } } - -// Install acme.sh if not present, issue cert, install to ~/.atomic/tls/ -fn issue_cert(domain: &str, tls_dir: &Path) -> Result<()> { - // Defense-in-depth: validate domain even though init.rs already checks - if !domain.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '.') { - anyhow::bail!("Domain contains invalid characters: {domain}"); - } - - ensure_acme_sh()?; - let acme_sh = acme_sh_path()?; - - info!("Issuing TLS cert for {} via acme.sh", domain); - - // Issue using standalone mode (binds :80 for HTTP-01 challenge) - let output = Command::new(&acme_sh) - .args([ - "--issue", - "-d", domain, - "--standalone", - "--server", "letsencrypt", - ]) - .output() - .context("Failed to run acme.sh --issue")?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - let stdout = String::from_utf8_lossy(&output.stdout); - // acme.sh returns 2 if cert already exists and is still valid - if !stdout.contains("Cert success") && !stderr.contains("already") && output.status.code() != Some(2) { - anyhow::bail!( - "acme.sh --issue failed (exit {}):\n{}\n{}", - output.status, - stdout, - stderr - ); - } - } - - // Install cert files to our tls 
dir - let output = Command::new(&acme_sh) - .args([ - "--install-cert", - "-d", domain, - "--fullchain-file", &tls_dir.join("fullchain.pem").to_string_lossy(), - "--key-file", &tls_dir.join("key.pem").to_string_lossy(), - ]) - .output() - .context("Failed to run acme.sh --install-cert")?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - anyhow::bail!("acme.sh --install-cert failed: {stderr}"); - } - - info!("TLS cert installed to {}", tls_dir.display()); - Ok(()) -} - -fn ensure_acme_sh() -> Result<()> { - if acme_sh_path().is_ok() { - return Ok(()); - } - - info!("Installing acme.sh..."); - let home = dirs::home_dir().context("No home directory")?; - let acme_home = home.join(".acme.sh"); - - // Download the script first, then run it with proper argument separation - // to avoid shell injection via the home directory path. - let download = Command::new("curl") - .args(["-fsSL", "https://raw.githubusercontent.com/acmesh-official/acme.sh/master/acme.sh"]) - .output() - .context("Failed to download acme.sh. 
Is curl available?")?; - - if !download.status.success() { - let stderr = String::from_utf8_lossy(&download.stderr); - anyhow::bail!("Failed to download acme.sh: {stderr}"); - } - - let output = Command::new("sh") - .arg("-s") - .arg("--") - .arg("--install-online") - .arg("--home") - .arg(&acme_home) - .stdin(std::process::Stdio::piped()) - .stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()) - .spawn() - .and_then(|mut child| { - use std::io::Write; - if let Some(ref mut stdin) = child.stdin { - stdin.write_all(&download.stdout)?; - } - child.wait_with_output() - }) - .context("Failed to install acme.sh")?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - let stdout = String::from_utf8_lossy(&output.stdout); - anyhow::bail!( - "acme.sh install failed.\n\ - You can install it manually: curl https://get.acme.sh | sh\n\ - stdout: {stdout}\n\ - stderr: {stderr}" - ); - } - - // Verify it's there now - acme_sh_path().context( - "acme.sh installed but not found. Try installing manually: curl https://get.acme.sh | sh" - )?; - info!("acme.sh installed"); - Ok(()) -} - -fn acme_sh_path() -> Result { - // Check common locations - let home = dirs::home_dir().context("No home directory")?; - let candidates = [ - home.join(".acme.sh/acme.sh"), - std::path::PathBuf::from("/usr/local/bin/acme.sh"), - ]; - - for path in &candidates { - if path.exists() { - return Ok(path.clone()); - } - } - - // Try PATH - let output = Command::new("which") - .arg("acme.sh") - .output(); - - if let Ok(output) = output { - if output.status.success() { - let path = String::from_utf8_lossy(&output.stdout).trim().to_string(); - if !path.is_empty() { - return Ok(std::path::PathBuf::from(path)); - } - } - } - - anyhow::bail!("acme.sh not found") -} - -// acme.sh sets up its own cron job for renewal. Poll every 6 hours to pick up -// renewed certs. SIGHUP provides immediate reload when needed. 
-// No filesystem watcher (notify crate removed) — a cert changes every 60 days, -// so polling + SIGHUP covers it with zero extra threads or OS handles. -pub fn spawn_renewal_watcher(rustls_config: RustlsConfig) { - tokio::spawn(async move { - let check_interval = std::time::Duration::from_secs(6 * 3600); - loop { - tokio::time::sleep(check_interval).await; - - let tls_dir = match config::tls_dir() { - Ok(d) => d, - Err(e) => { - warn!("Cert reload check failed: {e}"); - continue; - } - }; - - let cert_path = tls_dir.join("fullchain.pem"); - let key_path = tls_dir.join("key.pem"); - - match rustls_config - .reload_from_pem_file(&cert_path, &key_path) - .await - { - Ok(()) => info!("TLS cert reloaded (6h poll)"), - Err(e) => warn!("TLS cert reload failed: {e}"), - } - } - }); -} From a0597d7bb6516c948a5bb39743814b5b62ebfdc6 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 11:43:00 +0530 Subject: [PATCH 35/49] Update README with changelog for 1c4ad26 --- README.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ebf9498..f4c7c9d 100644 --- a/README.md +++ b/README.md @@ -294,6 +294,20 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi - Rate limiter evicts one expired entry inline when DashMap is full instead of blanket-denying new IPs - Global per-second rate limit (20/s) on magic link claims prevents distributed brute-force +**1c4ad26** — Kill auto-TLS/acme.sh, sharded-mutex rate limiter, Condvar pool, drop parse_duration, strip magic link hints, const-table validation, remove libc/dashmap/rustc-hash +- `tls.rs`: remove Auto-TLS entirely — delete acme.sh shell execution, issue_cert, ensure_acme_sh, spawn_renewal_watcher. TLS now requires BYO cert (`--tls-cert`/`--tls-key`) or `--no-tls`. Eliminates shell injection risk and ~200 lines. 
+- `server.rs`: replace DashMap with 8-shard `Mutex` rate limiter — removes `dashmap` + `rustc-hash` deps, zero contention for <10k entries, hourly stale cleanup. +- `server.rs`: const lookup table (`ASCII_OK[256]`) for input validation — branchless, cache-friendly, auto-vectorizes. +- `server.rs`: remove `libc::setrlimit` RLIMIT_NOFILE block — fd limits are sysadmin responsibility. Drops `libc` dep and only unsafe-adjacent code. +- `server.rs`: SeqCst ordering on shutdown drain loop; hourly cleanup evicts stale rate limiter entries. +- `db.rs`: replace `mpsc::sync_channel` pool with `Mutex` + `Condvar` — simpler, faster for pool sizes 2-8. Same RAII guard, poison detection, panic-safe return. +- `db.rs`: schema migration drops `hint` column from `magic_links` table. +- `deposit.rs`: delete `parse_duration()` — CLI `--expires` now accepts u64 seconds directly. Removes suffix parsing, overflow checks, and 6 tests. +- `deposit.rs`: remove debug logging from `verify_signature` — silent failure prevents info leakage. +- `magic_link.rs`: strip 2-char hint from magic links — no longer store or display code prefix (metadata leak). +- `init.rs`: require explicit `--tls-cert`/`--tls-key` or `--no-tls` (no more implicit auto-TLS). +- Net: -479 lines, -3 dependencies (dashmap, rustc-hash, libc), 65 tests passing. 
+ **72f8305** — Constant-time magic link, fs cert watcher, FxHasher rate limiter, Acquire/Release circuit breaker, in-flight drain - `magic_link.rs`: SELECT + `subtle::ConstantTimeEq` before DELETE prevents timing side-channels on code existence - `main.rs`: jemalloc background thread enabled for aggressive memory purging of zeroed key material @@ -332,7 +346,7 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi - [x] Deposit box (signed URLs, encrypted vault, audit log) - [x] Magic links (domain verification) - [x] Request signing -- [x] Auto-TLS +- [x] TLS (BYO cert + SIGHUP reload) - [ ] NS delegation + hosted subdomains - [ ] Agent email - [ ] Capability declarations From e61756c01ce3a58abbca042c1be8792c751609ca Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 11:54:35 +0530 Subject: [PATCH 36/49] =?UTF-8?q?=CE=BB-RLM=20iter=2023:=20Box::leak=20App?= =?UTF-8?q?State=20(kill=20Arc=20refcount),=20drop=20chrono=20(epoch=5Fsec?= =?UTF-8?q?s=20+=20Hinnant=20RFC3339),=20Box=20vault=20secrets,=20sta?= =?UTF-8?q?tic=20magic-link=20JSON=20response?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 116 ---------------------------------------------- Cargo.toml | 3 -- src/agent_json.rs | 2 +- src/config.rs | 47 +++++++++++++++++++ src/deposit.rs | 18 ++++--- src/magic_link.rs | 14 +++--- src/server.rs | 66 ++++++++++---------------- src/sign.rs | 2 +- src/vault.rs | 9 ++-- 9 files changed, 93 insertions(+), 184 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f337851..c85b30c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -47,15 +47,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "android_system_properties" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" -dependencies = [ - "libc", -] - [[package]] name = "anstream" version = "0.6.21" @@ 
-131,7 +122,6 @@ dependencies = [ "axum-server", "base64", "bytes", - "chrono", "clap", "dirs", "ed25519-dalek", @@ -326,20 +316,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" -[[package]] -name = "chrono" -version = "0.4.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" -dependencies = [ - "iana-time-zone", - "js-sys", - "num-traits", - "serde", - "wasm-bindgen", - "windows-link", -] - [[package]] name = "cipher" version = "0.4.4" @@ -411,12 +387,6 @@ version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" -[[package]] -name = "core-foundation-sys" -version = "0.8.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" - [[package]] name = "cpufeatures" version = "0.2.17" @@ -917,30 +887,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "iana-time-zone" -version = "0.1.65" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" -dependencies = [ - "android_system_properties", - "core-foundation-sys", - "iana-time-zone-haiku", - "js-sys", - "log", - "wasm-bindgen", - "windows-core", -] - -[[package]] -name = "iana-time-zone-haiku" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" -dependencies = [ - "cc", -] - [[package]] name = "icu_collections" version = "2.1.1" @@ -1207,15 +1153,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "num-traits" -version = "0.2.19" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" -dependencies = [ - "autocfg", -] - [[package]] name = "once_cell" version = "1.21.3" @@ -2257,65 +2194,12 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -[[package]] -name = "windows-core" -version = "0.62.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" -dependencies = [ - "windows-implement", - "windows-interface", - "windows-link", - "windows-result", - "windows-strings", -] - -[[package]] -name = "windows-implement" -version = "0.60.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "windows-interface" -version = "0.59.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" -[[package]] -name = "windows-result" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" -dependencies = [ - "windows-link", -] - -[[package]] -name = "windows-strings" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" -dependencies = [ - "windows-link", -] - [[package]] name = "windows-sys" version = "0.52.0" diff --git a/Cargo.toml b/Cargo.toml index 
58f7ad8..aaf1ed9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,9 +48,6 @@ hex = "0.4" # HTTP client (for verify command) reqwest = { version = "0.12", features = ["json", "rustls-tls"], default-features = false } -# Time -chrono = { version = "0.4", features = ["serde"] } - # Logging tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } diff --git a/src/agent_json.rs b/src/agent_json.rs index 0e263b0..2ae30a8 100644 --- a/src/agent_json.rs +++ b/src/agent_json.rs @@ -22,7 +22,7 @@ impl AgentJson { public_key: public_key.to_string(), status: "active".to_string(), deposit: format!("{base_url}/d/"), - created_at: chrono::Utc::now().to_rfc3339(), + created_at: crate::config::format_rfc3339(crate::config::epoch_secs() as i64), } } diff --git a/src/config.rs b/src/config.rs index ac3dfcf..7700d2f 100644 --- a/src/config.rs +++ b/src/config.rs @@ -93,6 +93,35 @@ pub fn acquire_pid_lock(path: &Path) -> Result { Ok(file) } +/// Current UTC time as Unix epoch seconds. Single syscall, no allocation. +pub fn epoch_secs() -> u64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() +} + +/// Format a Unix timestamp as RFC 3339 UTC (e.g. "2024-01-15T12:30:00Z"). +/// Uses the Hinnant civil_from_days algorithm. No chrono dependency. 
+pub fn format_rfc3339(epoch: i64) -> String { + let secs = epoch.rem_euclid(86400) as u32; + let days = epoch.div_euclid(86400) as i32; + let z = days + 719468; + let era = if z >= 0 { z } else { z - 146097 } / 146097; + let doe = (z - era * 146097) as u32; + let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365; + let y = yoe as i32 + era * 400; + let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); + let mp = (5 * doy + 2) / 153; + let d = doy - (153 * mp + 2) / 5 + 1; + let m = if mp < 10 { mp + 3 } else { mp - 9 }; + let y = if m <= 2 { y + 1 } else { y }; + let h = secs / 3600; + let min = (secs % 3600) / 60; + let s = secs % 60; + format!("{y:04}-{m:02}-{d:02}T{h:02}:{min:02}:{s:02}Z") +} + pub fn ensure_atomic_dir() -> Result { let dir = atomic_dir()?; if !dir.exists() { @@ -144,4 +173,22 @@ mod tests { assert!(dir.starts_with(&home)); assert!(dir.ends_with(".atomic")); } + + #[test] + fn format_rfc3339_epoch_zero() { + assert_eq!(format_rfc3339(0), "1970-01-01T00:00:00Z"); + } + + #[test] + fn format_rfc3339_known_date() { + // 2024-01-01T00:00:00Z + assert_eq!(format_rfc3339(1704067200), "2024-01-01T00:00:00Z"); + } + + #[test] + fn epoch_secs_is_reasonable() { + let now = epoch_secs(); + assert!(now > 1_700_000_000); // after 2023 + assert!(now < 4_000_000_000); // before 2096 + } } diff --git a/src/deposit.rs b/src/deposit.rs index a48be36..d47a4ed 100644 --- a/src/deposit.rs +++ b/src/deposit.rs @@ -24,7 +24,7 @@ pub fn create_signed_token( let nonce = generate_nonce(); let max_expiry: u64 = 24 * 3600; // 24 hours let secs = expires_in.as_secs().min(max_expiry); - let expires_at = chrono::Utc::now().timestamp() + i64::try_from(secs) + let expires_at = crate::config::epoch_secs() as i64 + i64::try_from(secs) .map_err(|_| anyhow::anyhow!("Duration too large"))?; let payload = DepositPayload { @@ -75,7 +75,7 @@ fn try_verify_signature( let payload_json = B64URL.decode(payload_b64)?; let payload: DepositPayload = 
serde_json::from_slice(&payload_json)?; - let now = chrono::Utc::now().timestamp(); + let now = crate::config::epoch_secs() as i64; if payload.expires_at <= now { anyhow::bail!("Token expired (expires_at={}, now={})", payload.expires_at, now); } @@ -88,7 +88,7 @@ pub fn claim_nonce_with_conn( payload: &DepositPayload, conn: &rusqlite::Connection, ) -> Result<()> { - let now = chrono::Utc::now().timestamp(); + let now = crate::config::epoch_secs() as i64; let inserted = conn.prepare_cached( "INSERT OR IGNORE INTO used_deposits (nonce, label, used_at) VALUES (?1, ?2, ?3)", )?.execute( @@ -109,7 +109,7 @@ pub fn log_deposit( source_ip: &str, user_agent: &str, ) -> Result<()> { - let now = chrono::Utc::now().timestamp(); + let now = crate::config::epoch_secs() as i64; conn.prepare_cached( "INSERT INTO deposit_log (label, source_ip, user_agent, deposited_at) VALUES (?1, ?2, ?3, ?4)", )?.execute( @@ -140,9 +140,7 @@ pub fn list_deposits(label_filter: Option<&str>) -> Result<()> { let ip: String = row.get(1)?; let ua: String = row.get(2)?; let ts: i64 = row.get(3)?; - let time = chrono::DateTime::from_timestamp(ts, 0) - .map(|dt| dt.to_rfc3339()) - .unwrap_or_else(|| ts.to_string()); + let time = crate::config::format_rfc3339(ts); println!("{time} {label}"); println!(" IP: {ip}"); if !ua.is_empty() { @@ -241,7 +239,7 @@ mod tests { let (payload_b64, _) = token.split_once('.').unwrap(); let payload_json = B64URL.decode(payload_b64).unwrap(); let payload: DepositPayload = serde_json::from_slice(&payload_json).unwrap(); - let now = chrono::Utc::now().timestamp(); + let now = crate::config::epoch_secs() as i64; // Should be capped at ~24h, not 48h assert!(payload.expires_at <= now + 24 * 3600 + 5); assert!(payload.expires_at > now + 23 * 3600); // but at least ~23h @@ -261,7 +259,7 @@ mod tests { let payload = DepositPayload { label: "test".into(), nonce: "unique_nonce_123".into(), - expires_at: chrono::Utc::now().timestamp() + 300, + expires_at: crate::config::epoch_secs() 
as i64 + 300, }; assert!(claim_nonce_with_conn(&payload, &conn).is_ok()); } @@ -272,7 +270,7 @@ mod tests { let payload = DepositPayload { label: "test".into(), nonce: "replay_nonce".into(), - expires_at: chrono::Utc::now().timestamp() + 300, + expires_at: crate::config::epoch_secs() as i64 + 300, }; claim_nonce_with_conn(&payload, &conn).unwrap(); let err = claim_nonce_with_conn(&payload, &conn).unwrap_err(); diff --git a/src/magic_link.rs b/src/magic_link.rs index b62056d..cef18ce 100644 --- a/src/magic_link.rs +++ b/src/magic_link.rs @@ -15,7 +15,7 @@ pub fn host(code: &str, expires_secs: u64) -> Result<()> { } let max_secs: u64 = 3600; // 1 hour max for magic links let capped_secs = expires_secs.min(max_secs); - let expires_at = chrono::Utc::now().timestamp() + let expires_at = crate::config::epoch_secs() as i64 + i64::try_from(capped_secs).map_err(|_| anyhow::anyhow!("Duration too large"))?; let code_hash = hash_code(code); @@ -36,7 +36,7 @@ pub fn host(code: &str, expires_secs: u64) -> Result<()> { pub fn list() -> Result<()> { let conn = db::open()?; - let now = chrono::Utc::now().timestamp(); + let now = crate::config::epoch_secs() as i64; // clean expired conn.execute("DELETE FROM magic_links WHERE expires_at <= ?1", [now])?; @@ -67,7 +67,7 @@ pub fn claim_with_conn(code: &str, conn: &rusqlite::Connection) -> Option u64 { - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_secs() + config::epoch_secs() } /// Const lookup table for printable ASCII validation. @@ -179,11 +175,6 @@ struct DepositResponse { label: String, } -#[derive(Serialize)] -struct MagicLinkResponse { - status: &'static str, -} - /// Max deposit body size: 64 KB — sufficient for secrets, API keys, certs. /// Tighter than the original 1MB to limit allocation before input validation. 
const MAX_BODY_SIZE: usize = 64 * 1024; @@ -237,7 +228,9 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { let behind_proxy = credentials.proxy; - let state = Arc::new(AppState { + // Box::leak: AppState lives for the entire process, so leaking avoids + // Arc's atomic refcount increment/decrement on every request handler clone. + let state: &'static AppState = Box::leak(Box::new(AppState { agent_json_cached, verifying_key, vault_key, @@ -247,15 +240,13 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { rate_limiter: RateLimiter::new(), in_flight: std::sync::atomic::AtomicUsize::new(0), last_db_failure: std::sync::atomic::AtomicU64::new(0), - }); + })); // Background task: WAL checkpoint every 5 minutes (PASSIVE to avoid blocking writers; // the hourly cleanup task runs TRUNCATE to actually reclaim WAL disk space). - let wal_state = state.clone(); tokio::spawn(async move { loop { tokio::time::sleep(std::time::Duration::from_secs(300)).await; - let db_ref = wal_state.clone(); let _ = tokio::task::spawn_blocking(move || { let wal_large = crate::config::atomic_dir() .map(|d| d.join("atomic.db-wal")) @@ -264,7 +255,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { .map(|m| m.len() > 40 * 1024 * 1024) .unwrap_or(false); - match db_ref.db_pool.get() { + match state.db_pool.get() { Ok(conn) => { if wal_large { tracing::warn!("WAL exceeds 40MB, forcing TRUNCATE checkpoint"); @@ -283,20 +274,18 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { }); // Background task: clean expired magic links, old deposit nonces, stale rate limiter entries hourly - let cleanup_state = state.clone(); tokio::spawn(async move { loop { tokio::time::sleep(std::time::Duration::from_secs(3600)).await; - let db_ref = cleanup_state.clone(); let _ = tokio::task::spawn_blocking(move || { - let conn = match db_ref.db_pool.get() { + let conn = match state.db_pool.get() { Ok(c) => c, Err(e) => { tracing::warn!("DB cleanup: pool 
exhausted: {e}"); return; } }; - let now = chrono::Utc::now().timestamp(); + let now = epoch_secs() as i64; // Paginated deletes: batch 1000 rows at a time to avoid holding // the WAL write lock for extended periods under heavy load. loop { @@ -340,7 +329,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { let _ = conn.execute_batch("PRAGMA optimize;"); }).await; // Clean stale rate limiter entries (non-blocking, quick lock per shard) - cleanup_state.rate_limiter.clean_stale(); + state.rate_limiter.clean_stale(); } }); @@ -351,7 +340,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { .allow_methods(Any) .allow_headers(Any); - let shutdown_state = state.clone(); + let shutdown_state = state; let public_routes = Router::new() .route("/.well-known/agent.json", get(serve_agent_json)) @@ -365,13 +354,13 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { .route("/_/health", get(handle_health)) .fallback(handle_404) .layer(middleware::from_fn_with_state( - state.clone(), + state, track_in_flight, )) .layer(middleware::from_fn(request_timeout)) .layer(DefaultBodyLimit::max(MAX_BODY_SIZE)) .layer(middleware::from_fn_with_state( - state.clone(), + state, security_headers, )) .with_state(state); @@ -488,7 +477,7 @@ async fn root_redirect() -> Redirect { Redirect::temporary("/.well-known/agent.json") } -async fn serve_agent_json(State(state): State>) -> Response { +async fn serve_agent_json(State(state): State<&'static AppState>) -> Response { ( StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], @@ -499,7 +488,7 @@ async fn serve_agent_json(State(state): State>) -> Response { // Verify the signed token, store the POST body in the vault under the encoded label. 
async fn handle_deposit( - State(state): State>, + State(state): State<&'static AppState>, ConnectInfo(addr): ConnectInfo, headers: HeaderMap, Path(token): Path, @@ -556,16 +545,15 @@ async fn handle_deposit( .to_string(); // DB operations in spawn_blocking with timeout to prevent unbounded task accumulation. - // Access vault_key via the Arc reference inside the closure + // Access vault_key via the &'static AppState reference inside the closure // to avoid copying the key out of its Zeroizing wrapper. - let state_clone = state.clone(); let body_clone = body; let deposit_result = tokio::time::timeout( DB_TIMEOUT, tokio::task::spawn_blocking(move || { - let conn = state_clone.db_pool.get()?; + let conn = state.db_pool.get()?; crate::deposit::claim_nonce_with_conn(&payload, &conn)?; - crate::vault::vault_set_with_conn(&conn, &payload.label, &body_clone, state_clone.vault_key())?; + crate::vault::vault_set_with_conn(&conn, &payload.label, &body_clone, state.vault_key())?; crate::deposit::log_deposit(&conn, &payload.label, &source_ip, &user_agent)?; Ok::<_, anyhow::Error>(payload.label) }) @@ -605,7 +593,7 @@ async fn handle_deposit( } async fn handle_magic_link( - State(state): State>, + State(state): State<&'static AppState>, ConnectInfo(addr): ConnectInfo, Path(code): Path, ) -> Response { @@ -626,12 +614,11 @@ async fn handle_magic_link( return (StatusCode::SERVICE_UNAVAILABLE, [("retry-after", "30")]).into_response(); } - let state_clone = state.clone(); let code_clone = code; let result = tokio::time::timeout( DB_TIMEOUT, tokio::task::spawn_blocking(move || { - let conn = state_clone.db_pool.get()?; + let conn = state.db_pool.get()?; Ok::<_, anyhow::Error>(crate::magic_link::claim_with_conn(&code_clone, &conn)) }) ).await; @@ -644,11 +631,7 @@ async fn handle_magic_link( } Ok(Ok(Ok(Some(_)))) => { state.record_db_success(); - let resp = MagicLinkResponse { status: "verified" }; - match serde_json::to_string(&resp) { - Ok(json) => (StatusCode::OK, 
[(header::CONTENT_TYPE, "application/json")], json).into_response(), - Err(_) => StatusCode::NOT_FOUND.into_response(), - } + (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], r#"{"status":"verified"}"#).into_response() } Ok(Ok(Ok(None))) => { state.record_db_success(); @@ -667,13 +650,12 @@ async fn handle_magic_link( } } -async fn handle_health(State(state): State>) -> Response { +async fn handle_health(State(state): State<&'static AppState>) -> Response { // Check DB is responsive (with timeout) let db_ok = tokio::time::timeout( std::time::Duration::from_secs(3), tokio::task::spawn_blocking({ - let st = state.clone(); - move || match st.db_pool.get() { + move || match state.db_pool.get() { Ok(conn) => conn.execute_batch("SELECT 1").is_ok(), Err(_) => false, } @@ -719,7 +701,7 @@ async fn handle_404() -> StatusCode { /// Track in-flight requests for graceful shutdown drain. async fn track_in_flight( - State(state): State>, + State(state): State<&'static AppState>, req: axum::http::Request, next: Next, ) -> Response { @@ -741,7 +723,7 @@ async fn request_timeout( } async fn security_headers( - State(state): State>, + State(state): State<&'static AppState>, req: axum::http::Request, next: Next, ) -> Response { diff --git a/src/sign.rs b/src/sign.rs index 00157d2..fe447a9 100644 --- a/src/sign.rs +++ b/src/sign.rs @@ -17,7 +17,7 @@ pub fn run(command: &[String], dry_run: bool) -> Result<()> { // Extract the request body from the command. // Looks for -d/--data/--data-raw and grabs the next arg. 
let body = extract_body(command); - let timestamp = chrono::Utc::now().timestamp(); + let timestamp = crate::config::epoch_secs() as i64; // Sign: "{timestamp}.{body}" let message = format!("{timestamp}.{body}"); diff --git a/src/vault.rs b/src/vault.rs index af78ad2..e1c8a51 100644 --- a/src/vault.rs +++ b/src/vault.rs @@ -19,7 +19,7 @@ pub fn vault_set(label: &str, value: &str, vault_key: &[u8; 32]) -> Result<()> { vault_set_with_conn(&conn, label, value, vault_key) } -pub fn vault_get(label: &str, vault_key: &[u8; 32]) -> Result>> { +pub fn vault_get(label: &str, vault_key: &[u8; 32]) -> Result>>> { let conn = db::open()?; let mut stmt = conn .prepare("SELECT value FROM vault_secrets WHERE label = ?1") @@ -32,12 +32,13 @@ pub fn vault_get(label: &str, vault_key: &[u8; 32]) -> Result { let plaintext = crypto_vault::decrypt(vault_key, &encrypted)?; - // Convert to String and wrap in Zeroizing so the heap copy is wiped on drop, - // not left as cleartext in freed memory. + // Box is 2 words (ptr, len) vs String's 3 (ptr, len, cap). + // No slack capacity means zeroize wipes exactly the used bytes. let value = Zeroizing::new( std::str::from_utf8(&plaintext) .context("Vault value is not valid UTF-8")? - .to_string(), + .to_string() + .into_boxed_str(), ); Ok(Some(value)) } From bc54cf37053861231195f26de00c8f2ff5195913 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 11:55:06 +0530 Subject: [PATCH 37/49] Update README with changelog for e61756c --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f4c7c9d..6cb45f1 100644 --- a/README.md +++ b/README.md @@ -201,13 +201,20 @@ All responses get `nosniff`, `no-store`, and `no-referrer` headers. 
HSTS (2-year git clone https://github.com/ploton/atomic.git cd atomic cargo build --release # ~4MB binary -cargo test # 65 tests +cargo test # 68 tests ``` Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwin`, `aarch64-apple-darwin`. ## Changelog +**e61756c** — Box::leak AppState (kill Arc refcount), drop chrono (epoch_secs + Hinnant RFC3339), Box vault secrets, static magic-link JSON response +- `server.rs`: replace `Arc` with `Box::leak` for `&'static AppState` — eliminates atomic refcount increment/decrement on every request handler clone. AppState is process-lifetime singleton, leaking is zero-cost. +- `server.rs`: remove `MagicLinkResponse` struct — magic link JSON response is now a static string literal (`r#"{"status":"verified"}"#`), removing serde serialization from the hot path. +- Drop `chrono` crate entirely — replace all `chrono::Utc::now().timestamp()` with `config::epoch_secs()` (single syscall, no allocation). RFC 3339 formatting uses Hinnant civil_from_days algorithm in `config::format_rfc3339()` (cold path only, ~20 lines). +- `vault.rs`: `vault_get` returns `Zeroizing>` instead of `Zeroizing` — 2 words (ptr, len) vs 3 (ptr, len, cap), no slack capacity means zeroize wipes exactly the used bytes. +- Net: −91 lines, −1 dependency (chrono), 68 tests passing. 
+ **ae91eaf** — Remove notify crate, kill spawn_supervised, single-atomic circuit breaker, drop global magic link rate limit, branchless input validation, 256MB mmap - Remove `notify` crate: cert renewal watcher replaced with 6-hour polling + SIGHUP for immediate reload (zero extra threads for a cert that changes every 60 days) - Remove `spawn_supervised` restart machinery: background tasks use plain `tokio::spawn` — single-tenant agents should surface panics, not mask them with infinite retries From 88852d8a705c75ee272eb579c2ecdcf3d8202bdb Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 12:04:21 +0530 Subject: [PATCH 38/49] =?UTF-8?q?=CE=BB-RLM=20iter=2024:=20OnceLock=20AppS?= =?UTF-8?q?tate=20(kill=20Box::leak),=20flock-based=20stop=20(kill=20ps/ki?= =?UTF-8?q?ll=20-0=20TOCTOU),=20DbPool=20shutdown=20flag,=20fail-open=20ra?= =?UTF-8?q?te=20limiter=20(try=5Flock),=20drop=20base64=20for=20base64ct?= =?UTF-8?q?=20(constant-time),=20remove=20unsafe=20panic=20hook?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/crypto/signing.rs | 21 +++++----- src/db.rs | 16 ++++++++ src/deposit.rs | 21 +++++----- src/main.rs | 96 +++++++++++++++++-------------------------- src/server.rs | 26 ++++++++---- 7 files changed, 95 insertions(+), 89 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c85b30c..65a5a5e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -120,7 +120,7 @@ dependencies = [ "anyhow", "axum", "axum-server", - "base64", + "base64ct", "bytes", "clap", "dirs", diff --git a/Cargo.toml b/Cargo.toml index aaf1ed9..4084af6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,7 +42,7 @@ sha2 = "0.10" # Random + encoding rand = "0.8" -base64 = "0.22" +base64ct = { version = "1", features = ["alloc"] } hex = "0.4" # HTTP client (for verify command) diff --git a/src/crypto/signing.rs b/src/crypto/signing.rs index 17be4ae..16a8043 100644 --- a/src/crypto/signing.rs +++ 
b/src/crypto/signing.rs @@ -1,6 +1,5 @@ use anyhow::{Context, Result}; -use base64::engine::general_purpose::STANDARD as BASE64; -use base64::Engine; +use base64ct::{Base64, Encoding}; use ed25519_dalek::{Signature, Signer, SigningKey, Verifier, VerifyingKey}; use rand::rngs::OsRng; use zeroize::Zeroizing; @@ -23,14 +22,14 @@ pub fn verify(verifying_key: &VerifyingKey, message: &[u8], signature: &Signatur // Wire format: "ed25519:" pub fn encode_public_key(verifying_key: &VerifyingKey) -> String { - format!("ed25519:{}", BASE64.encode(verifying_key.as_bytes())) + format!("ed25519:{}", Base64::encode_string(verifying_key.as_bytes())) } pub fn decode_public_key(encoded: &str) -> Result { let b64 = encoded .strip_prefix("ed25519:") .context("Public key must start with 'ed25519:'")?; - let bytes = BASE64.decode(b64).context("Invalid base64 in public key")?; + let bytes = Base64::decode_vec(b64).map_err(|_| anyhow::anyhow!("Invalid base64 in public key"))?; let key_bytes: [u8; 32] = bytes .try_into() .map_err(|_| anyhow::anyhow!("Public key must be 32 bytes"))?; @@ -39,11 +38,11 @@ pub fn decode_public_key(encoded: &str) -> Result { pub fn encode_private_key(signing_key: &SigningKey) -> String { let key_bytes = Zeroizing::new(signing_key.to_bytes()); - BASE64.encode(*key_bytes) + Base64::encode_string(&*key_bytes) } pub fn decode_private_key(encoded: &str) -> Result { - let bytes = Zeroizing::new(BASE64.decode(encoded).context("Invalid base64 in private key")?); + let bytes = Zeroizing::new(Base64::decode_vec(encoded).map_err(|_| anyhow::anyhow!("Invalid base64 in private key"))?); if bytes.len() != 32 { anyhow::bail!("Private key must be 32 bytes"); } @@ -53,12 +52,12 @@ pub fn decode_private_key(encoded: &str) -> Result { } pub fn encode_signature(signature: &Signature) -> String { - BASE64.encode(signature.to_bytes()) + Base64::encode_string(&signature.to_bytes()) } #[allow(dead_code)] pub fn decode_signature(encoded: &str) -> Result { - let bytes = 
BASE64.decode(encoded).context("Invalid base64 in signature")?; + let bytes = Base64::decode_vec(encoded).map_err(|_| anyhow::anyhow!("Invalid base64 in signature"))?; let sig_bytes: [u8; 64] = bytes .try_into() .map_err(|_| anyhow::anyhow!("Signature must be 64 bytes"))?; @@ -131,7 +130,7 @@ mod tests { #[test] fn decode_public_key_wrong_length() { - let short = BASE64.encode([0u8; 16]); // 16 bytes, need 32 + let short = Base64::encode_string(&[0u8; 16]); // 16 bytes, need 32 assert!(decode_public_key(&format!("ed25519:{short}")).is_err()); } @@ -142,13 +141,13 @@ mod tests { #[test] fn decode_private_key_wrong_length() { - let short = BASE64.encode([0u8; 16]); + let short = Base64::encode_string(&[0u8; 16]); assert!(decode_private_key(&short).is_err()); } #[test] fn decode_signature_wrong_length() { - let short = BASE64.encode([0u8; 32]); // 32 bytes, need 64 + let short = Base64::encode_string(&[0u8; 32]); // 32 bytes, need 64 assert!(decode_signature(&short).is_err()); } diff --git a/src/db.rs b/src/db.rs index b5aa051..28443e5 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,6 +1,7 @@ use anyhow::{Context, Result}; use rusqlite::Connection; use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Condvar, Mutex}; use std::time::{Duration, Instant}; @@ -18,6 +19,7 @@ pub struct DbPool { conns: Mutex>, available: Condvar, db_path: PathBuf, + shutdown: AtomicBool, } /// RAII guard that returns the connection to the pool on drop. @@ -73,10 +75,20 @@ impl Drop for PooledConn<'_> { } impl DbPool { + /// Signal the pool to reject new acquisitions and wake all waiting threads. + /// Called during graceful shutdown to prevent threads from blocking on Condvar. + pub fn shutdown(&self) { + self.shutdown.store(true, Ordering::SeqCst); + self.available.notify_all(); + } + /// Get a connection from the pool, blocking up to 5 seconds. 
/// Connections older than 30 minutes are recycled to reset SQLite's /// internal allocator and prevent page cache fragmentation. pub fn get(&self) -> Result> { + if self.shutdown.load(Ordering::SeqCst) { + anyhow::bail!("DB pool shutting down"); + } let deadline = Instant::now() + Duration::from_secs(5); let mut conns = self.conns.lock().unwrap_or_else(|e| e.into_inner()); loop { @@ -99,6 +111,9 @@ impl DbPool { acquired_at: Instant::now(), }); } + if self.shutdown.load(Ordering::SeqCst) { + anyhow::bail!("DB pool shutting down"); + } let remaining = deadline.saturating_duration_since(Instant::now()); if remaining.is_zero() { anyhow::bail!("DB pool exhausted (5s timeout)"); @@ -172,6 +187,7 @@ pub fn open_pool(size: usize) -> Result { conns: Mutex::new(conns), available: Condvar::new(), db_path, + shutdown: AtomicBool::new(false), }) } diff --git a/src/deposit.rs b/src/deposit.rs index d47a4ed..9cf96b4 100644 --- a/src/deposit.rs +++ b/src/deposit.rs @@ -1,6 +1,5 @@ use anyhow::Result; -use base64::engine::general_purpose::URL_SAFE_NO_PAD as B64URL; -use base64::Engine; +use base64ct::{Base64UrlUnpadded, Encoding}; use serde::{Deserialize, Serialize}; use std::time::Duration; @@ -34,9 +33,9 @@ pub fn create_signed_token( }; let payload_json = serde_json::to_string(&payload)?; - let payload_b64 = B64URL.encode(payload_json.as_bytes()); + let payload_b64 = Base64UrlUnpadded::encode_string(payload_json.as_bytes()); let sig = signing::sign(signing_key, payload_b64.as_bytes()); - let sig_b64 = B64URL.encode(sig.to_bytes()); + let sig_b64 = Base64UrlUnpadded::encode_string(&sig.to_bytes()); Ok(format!("{payload_b64}.{sig_b64}")) } @@ -58,10 +57,11 @@ fn try_verify_signature( .split_once('.') .ok_or_else(|| anyhow::anyhow!("No '.' 
separator in token"))?; - // Decode signature into stack-allocated buffer (zero heap allocation) + // Decode signature into stack-allocated buffer (zero heap allocation, constant-time) let mut sig_buf = [0u8; 64]; - let sig_len = B64URL.decode_slice(sig_b64, &mut sig_buf) - .map_err(|e| anyhow::anyhow!("Bad signature: {e}"))?; + let sig_len = Base64UrlUnpadded::decode(sig_b64, &mut sig_buf) + .map_err(|_| anyhow::anyhow!("Bad signature encoding"))? + .len(); if sig_len != 64 { anyhow::bail!("Signature must be 64 bytes, got {sig_len}"); } @@ -72,7 +72,8 @@ fn try_verify_signature( .verify(payload_b64.as_bytes(), &sig) .map_err(|e| anyhow::anyhow!("Signature invalid: {e}"))?; - let payload_json = B64URL.decode(payload_b64)?; + let payload_json = Base64UrlUnpadded::decode_vec(payload_b64) + .map_err(|_| anyhow::anyhow!("Bad payload encoding"))?; let payload: DepositPayload = serde_json::from_slice(&payload_json)?; let now = crate::config::epoch_secs() as i64; @@ -178,7 +179,7 @@ mod tests { // Decode the payload part to verify contents let (payload_b64, _) = token.split_once('.').unwrap(); - let payload_json = B64URL.decode(payload_b64).unwrap(); + let payload_json = Base64UrlUnpadded::decode_vec(payload_b64).unwrap(); let payload: DepositPayload = serde_json::from_slice(&payload_json).unwrap(); assert_eq!(payload.label, "test_key"); assert!(!payload.nonce.is_empty()); @@ -237,7 +238,7 @@ mod tests { let (sk, _) = s::generate_keypair(); let token = create_signed_token("key", Duration::from_secs(48 * 3600), &sk).unwrap(); let (payload_b64, _) = token.split_once('.').unwrap(); - let payload_json = B64URL.decode(payload_b64).unwrap(); + let payload_json = Base64UrlUnpadded::decode_vec(payload_b64).unwrap(); let payload: DepositPayload = serde_json::from_slice(&payload_json).unwrap(); let now = crate::config::epoch_secs() as i64; // Should be capped at ~24h, not 48h diff --git a/src/main.rs b/src/main.rs index d253ad1..5586ba3 100644 --- a/src/main.rs +++ b/src/main.rs @@ 
-38,26 +38,10 @@ async fn main() -> Result<()> { } } - // Clean up PID file and temp files on panic (best-effort). - // With panic=abort in release, the hook still runs before the process terminates. - std::panic::set_hook(Box::new(|info| { - eprintln!("atomic: fatal panic: {info}"); - if let Ok(path) = config::pid_path() { - let _ = std::fs::remove_file(path); - } - // Clean temp files left by write_secure (atomic write pattern) - if let Ok(dir) = config::atomic_dir() { - if let Ok(entries) = std::fs::read_dir(&dir) { - for entry in entries.flatten() { - if let Some(name) = entry.file_name().to_str() { - if name.contains(".tmp.") { - let _ = std::fs::remove_file(entry.path()); - } - } - } - } - } - })); + // Panic hook removed: file I/O in panic handlers is async-signal-unsafe + // (deadlock risk if panic occurred during malloc or file operation). + // The kernel automatically releases flock on the PID file when the process exits. + // Temp files from write_secure are cleaned on next startup or OS reboot. let cli = Cli::parse(); @@ -81,54 +65,48 @@ async fn main() -> Result<()> { } Command::Stop => { + use fs2::FileExt; + let pid_path = config::pid_path()?; if !pid_path.exists() { anyhow::bail!("No running server found (no PID file)"); } - let pid_str = std::fs::read_to_string(&pid_path)?; - let pid: i32 = pid_str.trim().parse().context("Invalid PID file")?; - if pid <= 0 { - let _ = std::fs::remove_file(&pid_path); - anyhow::bail!("Invalid PID {pid} in PID file (removed)"); - } - // Verify the PID belongs to an atomic process before killing - let ps_output = std::process::Command::new("ps") - .args(["-p", &pid.to_string(), "-o", "comm="]) - .output(); - if let Ok(output) = ps_output { - let comm = String::from_utf8_lossy(&output.stdout); - let comm = comm.trim(); - if !comm.is_empty() && !comm.contains("atomic") { + // Use flock to atomically determine if the server process is alive. + // The running server holds an exclusive flock on the PID file. 
+ // This eliminates the TOCTOU race window from ps/kill -0 checks. + let pid_file = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open(&pid_path) + .context("Failed to open PID file")?; + + match pid_file.try_lock_exclusive() { + Ok(()) => { + // We got the lock — the server process is dead (kernel released its lock). + drop(pid_file); let _ = std::fs::remove_file(&pid_path); - anyhow::bail!( - "PID {pid} belongs to '{comm}', not atomic (stale PID file removed)" - ); + println!("Cleaned up stale PID file (server was not running)"); } - } - - // Verify process still alive immediately before kill to minimize PID reuse window - let probe = std::process::Command::new("kill") - .args(["-0", &pid.to_string()]) - .status(); - if !probe.map(|s| s.success()).unwrap_or(false) { - let _ = std::fs::remove_file(&pid_path); - anyhow::bail!("PID {pid} no longer exists (stale PID file removed)"); - } + Err(_) => { + // Lock held by live server process — read PID and send SIGTERM. + let pid_str = std::fs::read_to_string(&pid_path)?; + let pid: i32 = pid_str.trim().parse().context("Invalid PID file")?; + if pid <= 0 { + anyhow::bail!("Invalid PID {pid} in PID file"); + } - // Send SIGTERM - let status = std::process::Command::new("kill") - .arg(pid.to_string()) - .status() - .context("Failed to send stop signal")?; + let status = std::process::Command::new("kill") + .arg(pid.to_string()) + .status() + .context("Failed to send SIGTERM")?; - if status.success() { - let _ = std::fs::remove_file(&pid_path); - println!("Server stopped (PID {pid})"); - } else { - // Process might already be gone - let _ = std::fs::remove_file(&pid_path); - println!("Server process {pid} not found (cleaned up stale PID file)"); + if status.success() { + println!("Server stopped (PID {pid})"); + } else { + println!("Failed to stop server (PID {pid})"); + } + } } } diff --git a/src/server.rs b/src/server.rs index ba2dfbc..056a6bc 100644 --- a/src/server.rs +++ b/src/server.rs @@ -15,10 +15,16 
@@ use tower_http::cors::{Any, CorsLayer}; use tracing::info; use zeroize::Zeroizing; +use std::sync::OnceLock; + use crate::config; use crate::credentials::Credentials; use crate::tls::TlsMode; +/// Global AppState — initialized once via OnceLock instead of Box::leak. +/// Provides &'static access without leaking heap memory. +static APP_STATE: OnceLock = OnceLock::new(); + /// Timeout for DB operations in HTTP handlers. Must exceed SQLite busy_timeout (4s) /// so that SQLite returns BUSY cleanly before the task gets force-cancelled. const DB_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); @@ -86,9 +92,13 @@ impl RateLimiter { fn check(&self, ip: IpAddr) -> bool { let now = std::time::Instant::now(); let window = std::time::Duration::from_secs(RATE_LIMIT_WINDOW_SECS); - let mut shard = self.shards[Self::shard_index(&ip)] - .lock() - .unwrap_or_else(|e| e.into_inner()); + // Fail open: if shard lock is contested, allow the request rather than block. + // A locked shard indicates system stress; blocking increases backlog. + let mut shard = match self.shards[Self::shard_index(&ip)].try_lock() { + Ok(guard) => guard, + Err(std::sync::TryLockError::WouldBlock) => return true, + Err(std::sync::TryLockError::Poisoned(e)) => e.into_inner(), + }; // Hard cap per shard to prevent unbounded growth if shard.len() >= RATE_LIMIT_MAX_ENTRIES / RATE_SHARDS { let stale = shard.iter() @@ -228,9 +238,7 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { let behind_proxy = credentials.proxy; - // Box::leak: AppState lives for the entire process, so leaking avoids - // Arc's atomic refcount increment/decrement on every request handler clone. 
- let state: &'static AppState = Box::leak(Box::new(AppState { + APP_STATE.set(AppState { agent_json_cached, verifying_key, vault_key, @@ -240,7 +248,8 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { rate_limiter: RateLimiter::new(), in_flight: std::sync::atomic::AtomicUsize::new(0), last_db_failure: std::sync::atomic::AtomicU64::new(0), - })); + }).map_err(|_| anyhow::anyhow!("server already initialized"))?; + let state: &'static AppState = APP_STATE.get().unwrap(); // Background task: WAL checkpoint every 5 minutes (PASSIVE to avoid blocking writers; // the hourly cleanup task runs TRUNCATE to actually reclaim WAL disk space). @@ -444,6 +453,9 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { } } + // Signal pool to reject new acquisitions and wake any threads waiting on Condvar. + shutdown_state.db_pool.shutdown(); + // Final WAL checkpoint after all handlers have drained. // Timeout prevents indefinite hang if the DB is stuck. let checkpoint_result = tokio::time::timeout( From 56a5817e779359521fdfe4455a87f37757481df4 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 12:04:55 +0530 Subject: [PATCH 39/49] Update README with changelog for 88852d8 --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 6cb45f1..c1721de 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,15 @@ Cross-compiles to `x86_64-linux-musl`, `aarch64-linux-musl`, `x86_64-apple-darwi ## Changelog +**88852d8** — OnceLock AppState (kill Box::leak), flock-based stop (kill ps/kill -0 TOCTOU), DbPool shutdown flag, fail-open rate limiter, drop base64 for base64ct (constant-time), remove unsafe panic hook +- `server.rs`: replace `Box::leak(Box::new(AppState))` with `OnceLock` — eliminates intentional memory leak while preserving `&'static` access. State owned by static, not orphaned on heap. 
+- `server.rs`: rate limiter uses `try_lock()` instead of `lock()` — fails open on contention (allows request) instead of blocking, eliminating tail latency spikes under load.
+- `main.rs`: rewrite `atomic stop` to use `flock` for process liveness detection — replaces racy `ps`/`kill -0` TOCTOU checks with kernel-enforced exclusive lock test. Eliminates PID reuse attack window.
+- `main.rs`: remove panic hook that performed file I/O (PID cleanup, temp file deletion) — file I/O in panic handlers is async-signal-unsafe (deadlock risk). Kernel releases flock automatically on process exit.
+- `db.rs`: add `AtomicBool` shutdown flag to `DbPool` — `shutdown()` wakes all `Condvar` waiters and rejects new acquisitions, preventing indefinite hangs during SIGTERM.
+- `Cargo.toml`: swap `base64` for `base64ct` — constant-time base64 encoding/decoding prevents timing side-channels during deposit token verification. Removes direct `base64` dependency.
+- Net: +95 lines added/−89 removed (net +6 for shutdown safety), −1 direct dependency (base64 → base64ct), 68 tests passing.
+
 **e61756c** — Box::leak AppState (kill Arc refcount), drop chrono (epoch_secs + Hinnant RFC3339), Box vault secrets, static magic-link JSON response
 - `server.rs`: replace `Arc` with `Box::leak` for `&'static AppState` — eliminates atomic refcount increment/decrement on every request handler clone. AppState is process-lifetime singleton, leaking is zero-cost.
 - `server.rs`: remove `MagicLinkResponse` struct — magic link JSON response is now a static string literal (`r#"{"status":"verified"}"#`), removing serde serialization from the hot path. 
From 3d768cae826005dec03edd70a98fc908ebe414e2 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 12:15:16 +0530 Subject: [PATCH 40/49] =?UTF-8?q?=CE=BB-RLM=20iter=2025:=20kill=20magic=5F?= =?UTF-8?q?link=20module=20(~175=20lines),=20CryptoError=20replaces=20anyh?= =?UTF-8?q?ow=20in=20crypto=20paths=20(zero-alloc=20opaque=20errors),=20Ve?= =?UTF-8?q?cDeque=20pool=20(O(1)=20pop),=20drop=20connection=20recycling,?= =?UTF-8?q?=20drop=20SIGHUP=20TLS=20reload,=20drop=20WAL=20checkpoint=20ta?= =?UTF-8?q?sk=20(wal=5Fautocheckpoint=3D1000=20sufficient),=20drop=20RateL?= =?UTF-8?q?imiter=20(dead=20after=20magic=5Flink=20removal),=20stack-alloc?= =?UTF-8?q?ate=20deposit=20payload=20decode,=20uniform=20404=20on=20circui?= =?UTF-8?q?t=20breaker,=20remove=20redundant=20manual=20zeroize,=20remove?= =?UTF-8?q?=20subtle=20dep,=20remove=20dead=20tls=5Fdir()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 1 - Cargo.toml | 1 - src/cli.rs | 20 ---- src/config.rs | 4 - src/credentials.rs | 4 +- src/crypto/mod.rs | 16 +++ src/crypto/signing.rs | 25 ++--- src/crypto/vault.rs | 30 +++--- src/db.rs | 60 +++-------- src/deposit.rs | 7 +- src/magic_link.rs | 174 ------------------------------- src/main.rs | 12 +-- src/server.rs | 231 ++---------------------------------------- 13 files changed, 69 insertions(+), 516 deletions(-) delete mode 100644 src/magic_link.rs diff --git a/Cargo.lock b/Cargo.lock index 65a5a5e..428dd4b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -134,7 +134,6 @@ dependencies = [ "serde", "serde_json", "sha2", - "subtle", "thiserror", "tikv-jemalloc-ctl", "tikv-jemallocator", diff --git a/Cargo.toml b/Cargo.toml index 4084af6..5e5e404 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,7 +66,6 @@ zeroize = { version = "1.8.2", features = ["derive"] } fs2 = "0.4" tikv-jemallocator = { version = "0.6", optional = true } tikv-jemalloc-ctl = { version = "0.6", optional = true } -subtle = "2" 
[profile.release] opt-level = 3 diff --git a/src/cli.rs b/src/cli.rs index 50a6934..d7b64f4 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -82,12 +82,6 @@ pub enum Command { command: VaultCommand, }, - /// Host a verification code for domain proof - MagicLink { - #[command(subcommand)] - command: MagicLinkCommand, - }, - /// Sign an outgoing HTTP request Sign { /// Print modified command without executing @@ -135,20 +129,6 @@ pub enum VaultCommand { }, } -#[derive(Subcommand)] -pub enum MagicLinkCommand { - /// Host a code for a service to verify - Host { - /// The verification code to host - code: String, - /// How long to host it in seconds (max 3600) - #[arg(long, default_value = "300")] - expires: u64, - }, - /// List active magic links - List, -} - #[derive(Subcommand)] pub enum KeyCommand { /// Rotate the agent's keypair diff --git a/src/config.rs b/src/config.rs index 7700d2f..05d81e8 100644 --- a/src/config.rs +++ b/src/config.rs @@ -56,10 +56,6 @@ pub fn deposits_log_path() -> Result { Ok(atomic_dir()?.join("deposits.log")) } -pub fn tls_dir() -> Result { - Ok(atomic_dir()?.join("tls")) -} - pub fn pid_path() -> Result { Ok(atomic_dir()?.join("atomic.pid")) } diff --git a/src/credentials.rs b/src/credentials.rs index 0ec2de9..886584a 100644 --- a/src/credentials.rs +++ b/src/credentials.rs @@ -65,11 +65,11 @@ impl Credentials { } pub fn signing_key(&self) -> Result { - signing::decode_private_key(&self.private_key) + Ok(signing::decode_private_key(&self.private_key)?) } pub fn verifying_key(&self) -> Result { - signing::decode_public_key(&self.public_key) + Ok(signing::decode_public_key(&self.public_key)?) } pub fn save(&self, path: &Path) -> Result<()> { diff --git a/src/crypto/mod.rs b/src/crypto/mod.rs index 67883a9..dfcfb31 100644 --- a/src/crypto/mod.rs +++ b/src/crypto/mod.rs @@ -1,2 +1,18 @@ pub mod signing; pub mod vault; + +/// Zero-allocation error type for cryptographic operations. 
+/// Prevents information leakage — no file paths, key IDs, or internal details. +#[derive(Debug, Clone, Copy, PartialEq, Eq, thiserror::Error)] +pub enum CryptoError { + #[error("invalid key")] + InvalidKey, + #[error("invalid signature")] + InvalidSignature, + #[error("invalid ciphertext")] + InvalidCiphertext, + #[error("key derivation failed")] + KeyDerivationFailed, + #[error("encryption failed")] + EncryptionFailed, +} diff --git a/src/crypto/signing.rs b/src/crypto/signing.rs index 16a8043..bcbbd3d 100644 --- a/src/crypto/signing.rs +++ b/src/crypto/signing.rs @@ -1,9 +1,10 @@ -use anyhow::{Context, Result}; use base64ct::{Base64, Encoding}; use ed25519_dalek::{Signature, Signer, SigningKey, Verifier, VerifyingKey}; use rand::rngs::OsRng; use zeroize::Zeroizing; +use super::CryptoError; + pub fn generate_keypair() -> (SigningKey, VerifyingKey) { let signing_key = SigningKey::generate(&mut OsRng); let verifying_key = signing_key.verifying_key(); @@ -25,15 +26,15 @@ pub fn encode_public_key(verifying_key: &VerifyingKey) -> String { format!("ed25519:{}", Base64::encode_string(verifying_key.as_bytes())) } -pub fn decode_public_key(encoded: &str) -> Result { +pub fn decode_public_key(encoded: &str) -> Result { let b64 = encoded .strip_prefix("ed25519:") - .context("Public key must start with 'ed25519:'")?; - let bytes = Base64::decode_vec(b64).map_err(|_| anyhow::anyhow!("Invalid base64 in public key"))?; + .ok_or(CryptoError::InvalidKey)?; + let bytes = Base64::decode_vec(b64).map_err(|_| CryptoError::InvalidKey)?; let key_bytes: [u8; 32] = bytes .try_into() - .map_err(|_| anyhow::anyhow!("Public key must be 32 bytes"))?; - VerifyingKey::from_bytes(&key_bytes).context("Invalid Ed25519 public key") + .map_err(|_| CryptoError::InvalidKey)?; + VerifyingKey::from_bytes(&key_bytes).map_err(|_| CryptoError::InvalidKey) } pub fn encode_private_key(signing_key: &SigningKey) -> String { @@ -41,10 +42,10 @@ pub fn encode_private_key(signing_key: &SigningKey) -> String { 
Base64::encode_string(&*key_bytes) } -pub fn decode_private_key(encoded: &str) -> Result { - let bytes = Zeroizing::new(Base64::decode_vec(encoded).map_err(|_| anyhow::anyhow!("Invalid base64 in private key"))?); +pub fn decode_private_key(encoded: &str) -> Result { + let bytes = Zeroizing::new(Base64::decode_vec(encoded).map_err(|_| CryptoError::InvalidKey)?); if bytes.len() != 32 { - anyhow::bail!("Private key must be 32 bytes"); + return Err(CryptoError::InvalidKey); } let mut key_bytes = Zeroizing::new([0u8; 32]); key_bytes.copy_from_slice(&bytes); @@ -56,11 +57,11 @@ pub fn encode_signature(signature: &Signature) -> String { } #[allow(dead_code)] -pub fn decode_signature(encoded: &str) -> Result { - let bytes = Base64::decode_vec(encoded).map_err(|_| anyhow::anyhow!("Invalid base64 in signature"))?; +pub fn decode_signature(encoded: &str) -> Result { + let bytes = Base64::decode_vec(encoded).map_err(|_| CryptoError::InvalidSignature)?; let sig_bytes: [u8; 64] = bytes .try_into() - .map_err(|_| anyhow::anyhow!("Signature must be 64 bytes"))?; + .map_err(|_| CryptoError::InvalidSignature)?; Ok(Signature::from_bytes(&sig_bytes)) } diff --git a/src/crypto/vault.rs b/src/crypto/vault.rs index d39ca4a..958cfcf 100644 --- a/src/crypto/vault.rs +++ b/src/crypto/vault.rs @@ -2,29 +2,30 @@ use aes_gcm::{ aead::{Aead, KeyInit}, Aes256Gcm, Nonce, }; -use anyhow::{Context, Result}; use hkdf::Hkdf; use rand::RngCore; use sha2::Sha256; use zeroize::Zeroizing; +use super::CryptoError; + const NONCE_SIZE: usize = 12; const HKDF_SALT: &[u8] = b"atomic-v1"; const MAX_CIPHERTEXT_SIZE: usize = 16 * 1024 * 1024; // 16 MB // HKDF with salt and "atomic-vault" context, so the vault key differs from the signing key. 
-pub fn derive_vault_key(private_key_bytes: &[u8; 32]) -> Result> { +pub fn derive_vault_key(private_key_bytes: &[u8; 32]) -> Result, CryptoError> { let hk = Hkdf::::new(Some(HKDF_SALT), private_key_bytes); let mut key = Zeroizing::new([0u8; 32]); hk.expand(b"atomic-vault", key.as_mut()) - .map_err(|_| anyhow::anyhow!("HKDF expand failed"))?; + .map_err(|_| CryptoError::KeyDerivationFailed)?; Ok(key) } // Output: 12-byte nonce prepended to ciphertext. -pub fn encrypt(key: &[u8; 32], plaintext: &[u8]) -> Result> { +pub fn encrypt(key: &[u8; 32], plaintext: &[u8]) -> Result, CryptoError> { let cipher = Aes256Gcm::new_from_slice(key) - .map_err(|e| anyhow::anyhow!("Failed to create cipher: {e}"))?; + .map_err(|_| CryptoError::EncryptionFailed)?; let mut nonce_bytes = [0u8; NONCE_SIZE]; rand::rngs::OsRng.fill_bytes(&mut nonce_bytes); @@ -32,7 +33,7 @@ pub fn encrypt(key: &[u8; 32], plaintext: &[u8]) -> Result> { let ciphertext = cipher .encrypt(nonce, plaintext) - .map_err(|e| anyhow::anyhow!("Encryption failed: {e}"))?; + .map_err(|_| CryptoError::EncryptionFailed)?; let mut result = Vec::with_capacity(NONCE_SIZE + ciphertext.len()); result.extend_from_slice(&nonce_bytes); @@ -42,23 +43,19 @@ pub fn encrypt(key: &[u8; 32], plaintext: &[u8]) -> Result> { // Expects 12-byte nonce prepended to ciphertext (same format encrypt() produces). // Returns Zeroizing> so plaintext is wiped from memory on drop. 
-pub fn decrypt(key: &[u8; 32], data: &[u8]) -> Result>> { - if data.len() < NONCE_SIZE { - anyhow::bail!("Encrypted data too short"); - } - if data.len() > MAX_CIPHERTEXT_SIZE { - anyhow::bail!("Encrypted data too large"); +pub fn decrypt(key: &[u8; 32], data: &[u8]) -> Result>, CryptoError> { + if data.len() < NONCE_SIZE || data.len() > MAX_CIPHERTEXT_SIZE { + return Err(CryptoError::InvalidCiphertext); } let (nonce_bytes, ciphertext) = data.split_at(NONCE_SIZE); let cipher = Aes256Gcm::new_from_slice(key) - .map_err(|e| anyhow::anyhow!("Failed to create cipher: {e}"))?; + .map_err(|_| CryptoError::InvalidCiphertext)?; let nonce = Nonce::from_slice(nonce_bytes); let plaintext = cipher .decrypt(nonce, ciphertext) - .map_err(|_| anyhow::anyhow!("Decryption failed (wrong key or corrupted data)")) - .context("Vault decryption failed")?; + .map_err(|_| CryptoError::InvalidCiphertext)?; Ok(Zeroizing::new(plaintext)) } @@ -118,8 +115,7 @@ mod tests { fn decrypt_oversized_rejected() { let key = [42u8; 32]; let oversized = vec![0u8; MAX_CIPHERTEXT_SIZE + 1]; - let err = decrypt(&key, &oversized).unwrap_err(); - assert!(err.to_string().contains("too large")); + assert!(matches!(decrypt(&key, &oversized), Err(CryptoError::InvalidCiphertext))); } #[test] diff --git a/src/db.rs b/src/db.rs index 28443e5..0796948 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,24 +1,20 @@ use anyhow::{Context, Result}; use rusqlite::Connection; -use std::path::{Path, PathBuf}; +use std::collections::VecDeque; +use std::path::Path; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Condvar, Mutex}; use std::time::{Duration, Instant}; use crate::config; -/// Maximum lifetime for a pooled connection before it's recycled. -/// Prevents SQLite page cache fragmentation from accumulating over hours/days. -const CONN_MAX_LIFETIME: Duration = Duration::from_secs(1800); // 30 minutes - /// Zero-dependency connection pool for SQLite. 
-/// Uses Mutex + Condvar — minimal overhead for small pool sizes (2-8). +/// Uses Mutex + Condvar — O(1) push/pop for small pool sizes (2-8). /// WAL mode allows concurrent readers; the pool prevents serialization /// behind a single Mutex. pub struct DbPool { - conns: Mutex>, + conns: Mutex>, available: Condvar, - db_path: PathBuf, shutdown: AtomicBool, } @@ -27,7 +23,6 @@ pub struct DbPool { pub struct PooledConn<'a> { pool: &'a DbPool, conn: Option, - created_at: Instant, acquired_at: Instant, } @@ -64,7 +59,7 @@ impl Drop for PooledConn<'_> { // double-panic abort (which would skip remaining destructors and Zeroizing). let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { let mut conns = self.pool.conns.lock().unwrap_or_else(|e| e.into_inner()); - conns.push((c, self.created_at)); + conns.push_back(c); self.pool.available.notify_one(); })); if result.is_err() { @@ -83,8 +78,6 @@ impl DbPool { } /// Get a connection from the pool, blocking up to 5 seconds. - /// Connections older than 30 minutes are recycled to reset SQLite's - /// internal allocator and prevent page cache fragmentation. 
pub fn get(&self) -> Result> { if self.shutdown.load(Ordering::SeqCst) { anyhow::bail!("DB pool shutting down"); @@ -92,22 +85,10 @@ impl DbPool { let deadline = Instant::now() + Duration::from_secs(5); let mut conns = self.conns.lock().unwrap_or_else(|e| e.into_inner()); loop { - if let Some((conn, created_at)) = conns.pop() { - if created_at.elapsed() > CONN_MAX_LIFETIME { - drop(conn); - drop(conns); // release lock during open_connection - let fresh = open_connection(&self.db_path)?; - return Ok(PooledConn { - pool: self, - conn: Some(fresh), - created_at: Instant::now(), - acquired_at: Instant::now(), - }); - } + if let Some(conn) = conns.pop_front() { return Ok(PooledConn { pool: self, conn: Some(conn), - created_at, acquired_at: Instant::now(), }); } @@ -175,18 +156,16 @@ pub fn open_pool(size: usize) -> Result { let first = open_connection(&db_path)?; migrate(&first)?; - let now = Instant::now(); - let mut conns = Vec::with_capacity(size); - conns.push((first, now)); + let mut conns = VecDeque::with_capacity(size); + conns.push_back(first); for _ in 1..size { - conns.push((open_connection(&db_path)?, now)); + conns.push_back(open_connection(&db_path)?); } Ok(DbPool { conns: Mutex::new(conns), available: Condvar::new(), - db_path, shutdown: AtomicBool::new(false), }) } @@ -201,22 +180,11 @@ pub fn open() -> Result { } fn migrate(conn: &Connection) -> Result<()> { - // Migrate magic_links from old schemas (plaintext `code` or `hint` column). - // Magic links are short-lived, so dropping the table is safe. - let needs_recreate = conn.prepare("SELECT code FROM magic_links LIMIT 0").is_ok() - || conn.prepare("SELECT hint FROM magic_links LIMIT 0").is_ok(); - if needs_recreate { - conn.execute_batch("DROP TABLE magic_links;") - .context("Failed to migrate magic_links table")?; - } + // Drop legacy magic_links table if it exists (feature removed). 
+ let _ = conn.execute_batch("DROP TABLE IF EXISTS magic_links;"); conn.execute_batch( - "CREATE TABLE IF NOT EXISTS magic_links ( - code_hash TEXT PRIMARY KEY, - expires_at INTEGER NOT NULL - ); - - CREATE TABLE IF NOT EXISTS used_deposits ( + "CREATE TABLE IF NOT EXISTS used_deposits ( nonce TEXT PRIMARY KEY, label TEXT NOT NULL, used_at INTEGER NOT NULL @@ -235,10 +203,6 @@ fn migrate(conn: &Connection) -> Result<()> { deposited_at INTEGER NOT NULL ); - -- Indexes on time columns used by the hourly cleanup task. - -- Without these, DELETE ... WHERE expires_at/used_at/deposited_at < ? - -- does a full table scan, holding a write lock longer than necessary. - CREATE INDEX IF NOT EXISTS idx_magic_links_expires ON magic_links(expires_at); CREATE INDEX IF NOT EXISTS idx_used_deposits_used_at ON used_deposits(used_at); CREATE INDEX IF NOT EXISTS idx_deposit_log_deposited_at ON deposit_log(deposited_at);", ) diff --git a/src/deposit.rs b/src/deposit.rs index 9cf96b4..67c0df0 100644 --- a/src/deposit.rs +++ b/src/deposit.rs @@ -72,9 +72,12 @@ fn try_verify_signature( .verify(payload_b64.as_bytes(), &sig) .map_err(|e| anyhow::anyhow!("Signature invalid: {e}"))?; - let payload_json = Base64UrlUnpadded::decode_vec(payload_b64) + // Stack-allocated buffer for payload decoding (zero heap allocation). + // DepositPayload JSON is well under 512 bytes (label≤256 + nonce=32 + overhead). 
+ let mut payload_buf = [0u8; 768]; + let payload_bytes = Base64UrlUnpadded::decode(payload_b64, &mut payload_buf) .map_err(|_| anyhow::anyhow!("Bad payload encoding"))?; - let payload: DepositPayload = serde_json::from_slice(&payload_json)?; + let payload: DepositPayload = serde_json::from_slice(payload_bytes)?; let now = crate::config::epoch_secs() as i64; if payload.expires_at <= now { diff --git a/src/magic_link.rs b/src/magic_link.rs deleted file mode 100644 index cef18ce..0000000 --- a/src/magic_link.rs +++ /dev/null @@ -1,174 +0,0 @@ -use anyhow::Result; -use sha2::{Digest, Sha256}; - -use crate::config; -use crate::db; - -fn hash_code(code: &str) -> String { - let hash = Sha256::digest(code.as_bytes()); - hex::encode(hash) -} - -pub fn host(code: &str, expires_secs: u64) -> Result<()> { - if code.len() < 20 { - anyhow::bail!("Code must be at least 20 characters (got {}). Use a longer code to prevent brute-force.", code.len()); - } - let max_secs: u64 = 3600; // 1 hour max for magic links - let capped_secs = expires_secs.min(max_secs); - let expires_at = crate::config::epoch_secs() as i64 - + i64::try_from(capped_secs).map_err(|_| anyhow::anyhow!("Duration too large"))?; - - let code_hash = hash_code(code); - - let conn = db::open()?; - conn.execute( - "INSERT OR REPLACE INTO magic_links (code_hash, expires_at) VALUES (?1, ?2)", - rusqlite::params![code_hash, expires_at], - )?; - - let creds = crate::credentials::Credentials::load(&config::credentials_path()?)?; - let url = format!("{}/m/{}", creds.base_url(), code); - - println!("{url}"); - println!("Expires in {capped_secs}s. 
Service can verify at that URL."); - Ok(()) -} - -pub fn list() -> Result<()> { - let conn = db::open()?; - let now = crate::config::epoch_secs() as i64; - - // clean expired - conn.execute("DELETE FROM magic_links WHERE expires_at <= ?1", [now])?; - - let mut stmt = conn.prepare("SELECT expires_at FROM magic_links ORDER BY expires_at")?; - let rows = stmt.query_map([], |row| row.get::<_, i64>(0))?; - - let mut idx = 0u32; - for row in rows { - let expires_at = row?; - let remaining = expires_at - now; - let mins = remaining / 60; - let secs = remaining % 60; - idx += 1; - println!(" #{idx} expires in {mins}m {secs}s"); - } - - if idx == 0 { - println!("No active magic links."); - } - Ok(()) -} - -/// Called by the server to check and consume a code. One-time use. -/// Uses constant-time comparison to prevent timing side-channels that -/// could leak whether a code exists via SQL execution time differences. -pub fn claim_with_conn(code: &str, conn: &rusqlite::Connection) -> Option { - use rusqlite::OptionalExtension; - use subtle::ConstantTimeEq; - - let now = crate::config::epoch_secs() as i64; - let code_hash = hash_code(code); - - // Fetch the stored hash first (SELECT), then compare in constant time. - // SQL timing differs between index hit and miss; the ct_eq comparison - // ensures the overall code path is uniform regardless of existence. 
- let stored_hash: Option = conn - .query_row( - "SELECT code_hash FROM magic_links WHERE code_hash = ?1 AND expires_at > ?2", - rusqlite::params![code_hash, now], - |row| row.get(0), - ) - .optional() - .ok()?; - - let matched = match &stored_hash { - Some(stored) => bool::from(stored.as_bytes().ct_eq(code_hash.as_bytes())), - None => { - // Dummy comparison to keep timing uniform on miss - let dummy = [0u8; 64]; // SHA-256 hex = 64 bytes - let _: subtle::Choice = dummy.ct_eq(code_hash.as_bytes()); - false - } - }; - - if matched { - // Atomically delete the row (one-time use) - conn.prepare_cached("DELETE FROM magic_links WHERE code_hash = ?1") - .and_then(|mut stmt| stmt.execute(rusqlite::params![code_hash])) - .ok()?; - Some(code.to_string()) - } else { - None - } -} - -#[cfg(test)] -mod tests { - use super::*; - - fn test_db() -> rusqlite::Connection { - let conn = rusqlite::Connection::open_in_memory().unwrap(); - conn.execute_batch( - "CREATE TABLE magic_links (code_hash TEXT PRIMARY KEY, expires_at INTEGER NOT NULL);" - ).unwrap(); - conn - } - - fn insert_code(conn: &rusqlite::Connection, code: &str, expires_at: i64) { - let code_hash = hash_code(code); - conn.execute( - "INSERT INTO magic_links (code_hash, expires_at) VALUES (?1, ?2)", - rusqlite::params![code_hash, expires_at], - ).unwrap(); - } - - #[test] - fn claim_valid_code() { - let conn = test_db(); - let future = crate::config::epoch_secs() as i64 + 300; - insert_code(&conn, "abc12345", future); - assert_eq!(claim_with_conn("abc12345", &conn), Some("abc12345".to_string())); - } - - #[test] - fn claim_expired_code() { - let conn = test_db(); - let past = crate::config::epoch_secs() as i64 - 10; - insert_code(&conn, "expired!", past); - assert!(claim_with_conn("expired!", &conn).is_none()); - } - - #[test] - fn claim_nonexistent_code() { - let conn = test_db(); - assert!(claim_with_conn("nope1234", &conn).is_none()); - } - - #[test] - fn claim_one_time_use() { - let conn = test_db(); - let future = 
crate::config::epoch_secs() as i64 + 300; - insert_code(&conn, "onceonly", future); - assert!(claim_with_conn("onceonly", &conn).is_some()); - assert!(claim_with_conn("onceonly", &conn).is_none()); - } - - #[test] - fn hash_code_is_deterministic() { - assert_eq!(hash_code("test1234"), hash_code("test1234")); - } - - #[test] - fn hash_code_differs_for_different_inputs() { - assert_ne!(hash_code("test1234"), hash_code("test5678")); - } - - #[test] - fn wrong_code_does_not_match() { - let conn = test_db(); - let future = crate::config::epoch_secs() as i64 + 300; - insert_code(&conn, "correct!", future); - assert!(claim_with_conn("wrongone", &conn).is_none()); - } -} diff --git a/src/main.rs b/src/main.rs index 5586ba3..3c3fff5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,7 +6,6 @@ mod crypto; mod db; mod deposit; mod init; -mod magic_link; mod server; mod sign; mod tls; @@ -20,7 +19,7 @@ use anyhow::{Context, Result}; use clap::Parser; use tracing_subscriber::EnvFilter; -use cli::{Cli, Command, KeyCommand, MagicLinkCommand, ServiceCommand, VaultCommand}; +use cli::{Cli, Command, KeyCommand, ServiceCommand, VaultCommand}; #[tokio::main] async fn main() -> Result<()> { @@ -194,15 +193,6 @@ async fn main() -> Result<()> { } } - Command::MagicLink { command } => match command { - MagicLinkCommand::Host { code, expires } => { - magic_link::host(&code, expires)?; - } - MagicLinkCommand::List => { - magic_link::list()?; - } - }, - Command::Sign { dry_run, command } => { sign::run(&command, dry_run)?; } diff --git a/src/server.rs b/src/server.rs index 056a6bc..0944ed6 100644 --- a/src/server.rs +++ b/src/server.rs @@ -8,7 +8,6 @@ use axum::{ Router, }; use serde::Serialize; -use std::collections::HashMap; use std::net::{IpAddr, SocketAddr}; use std::sync::atomic::Ordering; use tower_http::cors::{Any, CorsLayer}; @@ -29,16 +28,11 @@ static APP_STATE: OnceLock = OnceLock::new(); /// so that SQLite returns BUSY cleanly before the task gets force-cancelled. 
const DB_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5); -const RATE_LIMIT_WINDOW_SECS: u64 = 60; -const RATE_LIMIT_MAX_REQUESTS: u32 = 10; -const RATE_LIMIT_MAX_ENTRIES: usize = 10_000; const MAX_INPUT_LEN: usize = 256; /// Circuit breaker cool-down: DB operations rejected for this many seconds after last failure. const DB_CIRCUIT_COOLDOWN_SECS: u64 = 60; -const RATE_SHARDS: usize = 8; - fn epoch_secs() -> u64 { config::epoch_secs() } @@ -62,76 +56,6 @@ fn is_valid_input(s: &str) -> bool { && s.bytes().all(|b| ASCII_OK[b as usize]) } -/// Sharded mutex rate limiter — replaces DashMap for minimal overhead at <10k entries. -/// 8 shards eliminate contention without the DashMap dependency tree. -struct RateLimiter { - shards: [std::sync::Mutex>; RATE_SHARDS], -} - -impl RateLimiter { - fn new() -> Self { - Self { - shards: std::array::from_fn(|_| std::sync::Mutex::new(HashMap::new())), - } - } - - fn shard_index(ip: &IpAddr) -> usize { - let h = match ip { - IpAddr::V4(v4) => { - let o = v4.octets(); - (o[0] as usize).wrapping_mul(31) ^ (o[1] as usize).wrapping_mul(17) - ^ (o[2] as usize).wrapping_mul(7) ^ (o[3] as usize) - } - IpAddr::V6(v6) => { - v6.octets().iter().fold(0usize, |acc, &b| acc.wrapping_mul(31) ^ b as usize) - } - }; - h & (RATE_SHARDS - 1) - } - - fn check(&self, ip: IpAddr) -> bool { - let now = std::time::Instant::now(); - let window = std::time::Duration::from_secs(RATE_LIMIT_WINDOW_SECS); - // Fail open: if shard lock is contested, allow the request rather than block. - // A locked shard indicates system stress; blocking increases backlog. 
- let mut shard = match self.shards[Self::shard_index(&ip)].try_lock() { - Ok(guard) => guard, - Err(std::sync::TryLockError::WouldBlock) => return true, - Err(std::sync::TryLockError::Poisoned(e)) => e.into_inner(), - }; - // Hard cap per shard to prevent unbounded growth - if shard.len() >= RATE_LIMIT_MAX_ENTRIES / RATE_SHARDS { - let stale = shard.iter() - .find(|(_, (_, ts))| now.duration_since(*ts) > window) - .map(|(k, _)| *k); - match stale { - Some(key) => { shard.remove(&key); } - None => return false, - } - } - let entry = shard.entry(ip).or_insert((0, now)); - if now.duration_since(entry.1) > window { - *entry = (1, now); - true - } else if entry.0 >= RATE_LIMIT_MAX_REQUESTS { - false - } else { - entry.0 += 1; - true - } - } - - /// Evict stale entries from all shards. Called by hourly cleanup. - fn clean_stale(&self) { - let now = std::time::Instant::now(); - let window = std::time::Duration::from_secs(RATE_LIMIT_WINDOW_SECS); - for shard in &self.shards { - let mut map = shard.lock().unwrap_or_else(|e| e.into_inner()); - map.retain(|_, (_, ts)| now.duration_since(*ts) <= window); - } - } -} - pub struct AppState { pub agent_json_cached: bytes::Bytes, pub verifying_key: ed25519_dalek::VerifyingKey, @@ -141,8 +65,6 @@ pub struct AppState { pub db_pool: crate::db::DbPool, pub tls_active: bool, pub behind_proxy: bool, - /// Sharded mutex rate limiter — monotonic Instant prevents clock-skew attacks. - rate_limiter: RateLimiter, /// In-flight request counter for graceful shutdown drain. in_flight: std::sync::atomic::AtomicUsize, /// Circuit breaker: epoch second of last DB failure. Circuit open if within cooldown. @@ -154,11 +76,6 @@ impl AppState { &self.vault_key } - /// Returns true if the request is within rate limits. - pub fn check_rate_limit(&self, ip: IpAddr) -> bool { - self.rate_limiter.check(ip) - } - /// Record a DB failure timestamp. The first request after the cooldown naturally tests the DB. 
fn record_db_failure(&self) { self.last_db_failure.store(epoch_secs(), Ordering::Relaxed); @@ -214,9 +131,8 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { let verifying_key = credentials.verifying_key()?; let signing_key = credentials.signing_key()?; - let mut sk_bytes = Zeroizing::new(signing_key.to_bytes()); + let sk_bytes = Zeroizing::new(signing_key.to_bytes()); let vault_key = crate::crypto::vault::derive_vault_key(&sk_bytes)?; - sk_bytes.iter_mut().for_each(|b| *b = 0); // belt-and-suspenders drop(sk_bytes); // Pool size: env override or auto-detect from available parallelism (capped 2..8) @@ -245,44 +161,13 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { db_pool, tls_active, behind_proxy, - rate_limiter: RateLimiter::new(), in_flight: std::sync::atomic::AtomicUsize::new(0), last_db_failure: std::sync::atomic::AtomicU64::new(0), }).map_err(|_| anyhow::anyhow!("server already initialized"))?; let state: &'static AppState = APP_STATE.get().unwrap(); - // Background task: WAL checkpoint every 5 minutes (PASSIVE to avoid blocking writers; - // the hourly cleanup task runs TRUNCATE to actually reclaim WAL disk space). 
- tokio::spawn(async move { - loop { - tokio::time::sleep(std::time::Duration::from_secs(300)).await; - let _ = tokio::task::spawn_blocking(move || { - let wal_large = crate::config::atomic_dir() - .map(|d| d.join("atomic.db-wal")) - .ok() - .and_then(|p| std::fs::metadata(&p).ok()) - .map(|m| m.len() > 40 * 1024 * 1024) - .unwrap_or(false); - - match state.db_pool.get() { - Ok(conn) => { - if wal_large { - tracing::warn!("WAL exceeds 40MB, forcing TRUNCATE checkpoint"); - if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { - tracing::warn!("WAL TRUNCATE checkpoint failed: {e}, falling back to RESTART"); - let _ = conn.execute_batch("PRAGMA wal_checkpoint(RESTART);"); - } - } else if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(PASSIVE);") { - tracing::warn!("WAL checkpoint failed: {e}"); - } - } - Err(e) => tracing::warn!("WAL checkpoint: pool exhausted: {e}"), - } - }).await; - } - }); - - // Background task: clean expired magic links, old deposit nonces, stale rate limiter entries hourly + // Background task: hourly cleanup of expired deposit data + PRAGMA optimize. + // WAL checkpointing is handled by wal_autocheckpoint=1000 (set in open_connection). tokio::spawn(async move { loop { tokio::time::sleep(std::time::Duration::from_secs(3600)).await; @@ -295,19 +180,6 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { } }; let now = epoch_secs() as i64; - // Paginated deletes: batch 1000 rows at a time to avoid holding - // the WAL write lock for extended periods under heavy load. 
- loop { - match conn.execute( - "DELETE FROM magic_links WHERE rowid IN \ - (SELECT rowid FROM magic_links WHERE expires_at <= ?1 LIMIT 1000)", - [now], - ) { - Ok(0) => break, - Ok(_) => continue, - Err(e) => { tracing::warn!("Failed to clean expired magic links: {e}"); break; } - } - } let cutoff = now - 7 * 86400; loop { match conn.execute( @@ -332,18 +204,13 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { Err(e) => { tracing::warn!("Failed to clean old deposit log entries: {e}"); break; } } } - if let Err(e) = conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") { - tracing::warn!("Hourly WAL TRUNCATE checkpoint failed: {e}"); - } let _ = conn.execute_batch("PRAGMA optimize;"); }).await; - // Clean stale rate limiter entries (non-blocking, quick lock per shard) - state.rate_limiter.clean_stale(); } }); - // CORS only on the public agent.json endpoint; deposit and magic link - // endpoints are called by servers, not browsers, and don't need CORS. + // CORS only on the public agent.json endpoint; deposit endpoints + // are called by servers, not browsers, and don't need CORS. let cors = CorsLayer::new() .allow_origin(Any) .allow_methods(Any) @@ -359,7 +226,6 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { .route("/", get(root_redirect)) .merge(public_routes) .route("/d/{token}", post(handle_deposit)) - .route("/m/{code}", get(handle_magic_link)) .route("/_/health", get(handle_health)) .fallback(handle_404) .layer(middleware::from_fn_with_state( @@ -394,32 +260,6 @@ pub async fn run_server(credentials: Credentials) -> Result<()> { } TlsMode::Custom { .. 
} => { let rustls_config = crate::tls::resolve_rustls_config(&tls_mode).await?; - // Reload TLS cert on SIGHUP - #[cfg(unix)] - { - let sighup_config = rustls_config.clone(); - tokio::spawn(async move { - use tokio::signal::unix::{signal, SignalKind}; - let mut sighup = signal(SignalKind::hangup()) - .expect("Failed to install SIGHUP handler"); - loop { - sighup.recv().await; - let tls_dir = match crate::config::tls_dir() { - Ok(d) => d, - Err(e) => { - tracing::warn!("SIGHUP cert reload failed: {e}"); - continue; - } - }; - let cert_path = tls_dir.join("fullchain.pem"); - let key_path = tls_dir.join("key.pem"); - match sighup_config.reload_from_pem_file(&cert_path, &key_path).await { - Ok(()) => info!("TLS cert reloaded (SIGHUP)"), - Err(e) => tracing::warn!("TLS cert reload failed (SIGHUP): {e}"), - } - } - }); - } let handle = axum_server::Handle::new(); let shutdown_handle = handle.clone(); tokio::spawn(async move { @@ -520,8 +360,9 @@ async fn handle_deposit( } // Circuit breaker: reject immediately if DB is known-broken (disk full, corrupt, etc.) + // Returns uniform 404 to prevent information leakage about internal state. if state.is_db_circuit_open() { - return (StatusCode::SERVICE_UNAVAILABLE, [("retry-after", "30")]).into_response(); + return StatusCode::NOT_FOUND.into_response(); } // Only trust X-Forwarded-For when running behind a known reverse proxy. @@ -604,64 +445,6 @@ async fn handle_deposit( } } -async fn handle_magic_link( - State(state): State<&'static AppState>, - ConnectInfo(addr): ConnectInfo, - Path(code): Path, -) -> Response { - // Per-IP rate limiting to prevent brute-force of magic link codes. - // For a single-tenant agent, per-IP is sufficient — distributed brute-force - // across thousands of IPs is not the threat model. 
- if !state.check_rate_limit(addr.ip()) { - return StatusCode::TOO_MANY_REQUESTS.into_response(); - } - - // Reject obviously short codes or codes with non-printable chars before touching the DB - if code.len() < 20 || !is_valid_input(&code) { - return StatusCode::NOT_FOUND.into_response(); - } - - // Circuit breaker: reject immediately if DB is known-broken - if state.is_db_circuit_open() { - return (StatusCode::SERVICE_UNAVAILABLE, [("retry-after", "30")]).into_response(); - } - - let code_clone = code; - let result = tokio::time::timeout( - DB_TIMEOUT, - tokio::task::spawn_blocking(move || { - let conn = state.db_pool.get()?; - Ok::<_, anyhow::Error>(crate::magic_link::claim_with_conn(&code_clone, &conn)) - }) - ).await; - - match result { - Err(_elapsed) => { - tracing::error!("Magic link handler timed out"); - state.record_db_failure(); - StatusCode::NOT_FOUND.into_response() - } - Ok(Ok(Ok(Some(_)))) => { - state.record_db_success(); - (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], r#"{"status":"verified"}"#).into_response() - } - Ok(Ok(Ok(None))) => { - state.record_db_success(); - StatusCode::NOT_FOUND.into_response() - } - Ok(Ok(Err(e))) => { - tracing::error!("Magic link DB error: {e}"); - state.record_db_failure(); - StatusCode::NOT_FOUND.into_response() - } - Ok(Err(e)) => { - tracing::error!("Magic link task panicked: {e}"); - state.record_db_failure(); - StatusCode::NOT_FOUND.into_response() - } - } -} - async fn handle_health(State(state): State<&'static AppState>) -> Response { // Check DB is responsive (with timeout) let db_ok = tokio::time::timeout( From 5ea945eb4b433386de7cb6c199a3dc879f69fec5 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi Date: Tue, 31 Mar 2026 12:16:35 +0530 Subject: [PATCH 41/49] Update README with changelog for 3d768ca --- README.md | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index c1721de..f174f7c 100644 --- a/README.md 
+++ b/README.md @@ -78,20 +78,6 @@ $ atomic deposits $ atomic deposits --label stripe_key ``` -## Magic links - -Domain verification, like DNS TXT records but over HTTP. A service gives the agent a code, the agent hosts it, the service checks. - -```bash -$ atomic magic-link host VERIFY_ABC123 --expires 5m -https://fin.acme.com/m/VERIFY_ABC123 - -$ curl https://fin.acme.com/m/VERIFY_ABC123 -{"status":"verified","code":"VERIFY_ABC123"} -``` - -One-time use, gone after the first GET, expires in minutes. - ## Request signing ```bash @@ -118,7 +104,7 @@ Agents don't need that. An agent with a keypair can prove itself on every reques | | Human (JWT) | Agent (Atomic) | |------------------|--------------------------------------|---------------------------------------------| | **Identity** | email + password | domain (`fin.acme.com`) | -| **Signup** | create account, get credentials | sign request + magic link for domain proof | +| **Signup** | create account, get credentials | sign request, service verifies agent.json | | **Proof** | service issues a JWT | agent signs every request with private key | | **Each request** | send JWT, service checks it | send signature, service checks agent.json | | **Expiry** | token expires, agent re-auths | no token -- signatures are stateless | @@ -128,7 +114,7 @@ Agents don't need that. An agent with a keypair can prove itself on every reques The practical difference: the agent has nothing to manage. No token storage, no refresh logic. The private key stays on the box and never gets sent over the wire. -A service that wants extra assurance can layer a magic link challenge on top of the signature check at signup -- verify the sig, then confirm domain control, then create an internal account for `fin.acme.com`. After that, subsequent requests are just signature checks against a cached public key. +A service that wants extra assurance can verify domain control via DNS TXT records on top of the signature check at signup. 
After that, subsequent requests are just signature checks against a cached public key. Performance-wise, JWT verification is a single HMAC check while Ed25519 verify costs more. But "more" here means microseconds, and the public key only changes on rotation so it caches well. It's not where your latency lives. @@ -146,9 +132,6 @@ atomic verify Check another agent atomic deposit-url --label --expires Create deposit URL atomic deposits [--label ] Deposit audit log -atomic magic-link host --expires Host a verification code -atomic magic-link list Show active codes - atomic vault set