diff --git a/cmd/simrun/main.go b/cmd/simrun/main.go index c444bfd..ad65a1e 100644 --- a/cmd/simrun/main.go +++ b/cmd/simrun/main.go @@ -115,6 +115,35 @@ func main() { } }() + // Retention sweepers: expire run logs and whole assessments by age. Both run + // once at startup and then hourly, re-reading AppConfig each tick so admin + // changes apply without a restart. + go func() { + ticker := time.NewTicker(1 * time.Hour) + defer ticker.Stop() + sweep := func() { + // Use the main ctx so a shutdown signal cancels in-flight DB work + // (config load + the per-run deletes the assessment sweep loops over) + // instead of blocking shutdown on a slow Postgres. + cfg, err := configStore.GetAppConfig(ctx) + if err != nil { + log.Warnf("Retention sweep: failed to load config: %v", err) + return + } + web.SweepRunLogs(bootstrap.DataDir, cfg.AssessmentLogRetentionEnabled, cfg.AssessmentLogRetentionDays) + web.SweepAssessments(ctx, runStore, bootstrap.DataDir, cfg.AssessmentRetentionEnabled, cfg.AssessmentRetentionDays) + } + sweep() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + sweep() + } + } + }() + go func() { if err := server.ListenAndServe(); err != nil { log.Fatalf("Server error: %v", err) diff --git a/internal/config/appconfig.go b/internal/config/appconfig.go index b1d2eac..2eef584 100644 --- a/internal/config/appconfig.go +++ b/internal/config/appconfig.go @@ -7,19 +7,27 @@ package config // belongs in connectors, anything secret belongs in secret_groups, anything // set at deploy belongs in Bootstrap. type AppConfig struct { - Parallelism int `json:"parallelism"` - TerraformVersion string `json:"terraform_version"` - PackLogsEnabled bool `json:"pack_logs_enabled"` - SSHLoggingEnabled bool `json:"ssh_logging_enabled"` + Parallelism int `json:"parallelism"` + TerraformVersion string `json:"terraform_version"` + PackLogsEnabled bool `json:"pack_logs_enabled"` + SSHLoggingEnabled bool `json:"ssh_logging_enabled"` + AssessmentLogRetentionEnabled bool `json:"assessment_log_retention_enabled"` + AssessmentLogRetentionDays int `json:"assessment_log_retention_days"` + AssessmentRetentionEnabled bool `json:"assessment_retention_enabled"` + AssessmentRetentionDays int `json:"assessment_retention_days"` } // DefaultAppConfig returns the default values used when no row exists for // a key. Keep these aligned with the migration that backfills app_config. func DefaultAppConfig() AppConfig { return AppConfig{ - Parallelism: 5, - TerraformVersion: "", - PackLogsEnabled: true, - SSHLoggingEnabled: false, + Parallelism: 5, + TerraformVersion: "", + PackLogsEnabled: true, + SSHLoggingEnabled: false, + AssessmentLogRetentionEnabled: true, + AssessmentLogRetentionDays: 7, + AssessmentRetentionEnabled: false, + AssessmentRetentionDays: 30, } } diff --git a/internal/config/appconfig_test.go b/internal/config/appconfig_test.go index 6d03b92..a177161 100644 --- a/internal/config/appconfig_test.go +++ b/internal/config/appconfig_test.go @@ -8,9 +8,13 @@ import ( func TestDefaultAppConfig(t *testing.T) { assert.Equal(t, AppConfig{ - Parallelism: 5, - TerraformVersion: "", - PackLogsEnabled: true, - SSHLoggingEnabled: false, + Parallelism: 5, + TerraformVersion: "", + PackLogsEnabled: true, + SSHLoggingEnabled: false, + AssessmentLogRetentionEnabled: true, + AssessmentLogRetentionDays: 7, + AssessmentRetentionEnabled: false, + AssessmentRetentionDays: 30, }, DefaultAppConfig()) } diff --git a/internal/db/config.go b/internal/db/config.go index f1dd78f..912a993 100644 --- a/internal/db/config.go +++ b/internal/db/config.go @@ -96,6 +96,30 @@ func parseAppConfig(all map[string]json.RawMessage) config.AppConfig { cfg.SSHLoggingEnabled = b } } + if v, ok := all["assessment_log_retention_enabled"]; ok { + var b bool + if err := json.Unmarshal(v, &b); err == nil { + cfg.AssessmentLogRetentionEnabled = b + } + } + if v, ok := all["assessment_log_retention_days"]; ok { + var n int + if err := json.Unmarshal(v, &n); err == nil && n > 0 { + cfg.AssessmentLogRetentionDays = n + } + } + if v, ok := all["assessment_retention_enabled"]; ok { + var b bool + if err := json.Unmarshal(v, &b); err == nil { + cfg.AssessmentRetentionEnabled = b + } + } + if v, ok := all["assessment_retention_days"]; ok { + var n int + if err := json.Unmarshal(v, &n); err == nil && n > 0 { + cfg.AssessmentRetentionDays = n + } + } return cfg } @@ -114,6 +138,10 @@ func appConfigKVs(c config.AppConfig) []appConfigKV { {"terraform_version", c.TerraformVersion}, {"pack_logs_enabled", c.PackLogsEnabled}, {"ssh_logging_enabled", c.SSHLoggingEnabled}, + {"assessment_log_retention_enabled", c.AssessmentLogRetentionEnabled}, + {"assessment_log_retention_days", c.AssessmentLogRetentionDays}, + {"assessment_retention_enabled", c.AssessmentRetentionEnabled}, + {"assessment_retention_days", c.AssessmentRetentionDays}, } } diff --git a/internal/db/config_test.go b/internal/db/config_test.go index 6fe974b..749ef9e 100644 --- a/internal/db/config_test.go +++ b/internal/db/config_test.go @@ -47,32 +47,64 @@ func TestParseAppConfig_NonPositiveParallelismKeepsDefault(t *testing.T) { func TestParseAppConfig_AllSet(t *testing.T) { got := parseAppConfig(map[string]json.RawMessage{ - "parallelism": json.RawMessage(`12`), - "terraform_version": json.RawMessage(`"1.6.0"`), - "pack_logs_enabled": json.RawMessage(`false`), - "ssh_logging_enabled": json.RawMessage(`true`), + "parallelism": json.RawMessage(`12`), + "terraform_version": json.RawMessage(`"1.6.0"`), + "pack_logs_enabled": json.RawMessage(`false`), + "ssh_logging_enabled": json.RawMessage(`true`), + "assessment_log_retention_enabled": json.RawMessage(`false`), + "assessment_log_retention_days": json.RawMessage(`14`), + "assessment_retention_enabled": json.RawMessage(`true`), + "assessment_retention_days": json.RawMessage(`90`), }) assert.Equal(t, config.AppConfig{ - Parallelism: 12, - TerraformVersion: "1.6.0", - PackLogsEnabled: false, - SSHLoggingEnabled: true, + Parallelism: 12, + TerraformVersion: "1.6.0", + PackLogsEnabled: false, + SSHLoggingEnabled: true, + AssessmentLogRetentionEnabled: false, + AssessmentLogRetentionDays: 14, + AssessmentRetentionEnabled: true, + AssessmentRetentionDays: 90, }, got) } +// The retention day fields guard against non-positive values like parallelism, +// so a stored 0/-1 cannot silently configure immediate deletion. +func TestParseAppConfig_NonPositiveRetentionDaysKeepsDefault(t *testing.T) { + def := config.DefaultAppConfig() + for _, v := range []string{`0`, `-1`} { + t.Run(v, func(t *testing.T) { + got := parseAppConfig(map[string]json.RawMessage{ + "assessment_log_retention_days": json.RawMessage(v), + "assessment_retention_days": json.RawMessage(v), + }) + assert.Equal(t, def.AssessmentLogRetentionDays, got.AssessmentLogRetentionDays) + assert.Equal(t, def.AssessmentRetentionDays, got.AssessmentRetentionDays) + }) + } +} + func TestAppConfigKVs_MarshalsToExpectedJSON(t *testing.T) { c := config.AppConfig{ - Parallelism: 7, - TerraformVersion: "1.5.7", - PackLogsEnabled: true, - SSHLoggingEnabled: false, + Parallelism: 7, + TerraformVersion: "1.5.7", + PackLogsEnabled: true, + SSHLoggingEnabled: false, + AssessmentLogRetentionEnabled: true, + AssessmentLogRetentionDays: 7, + AssessmentRetentionEnabled: false, + AssessmentRetentionDays: 30, } want := map[string]string{ - "parallelism": `7`, - "terraform_version": `"1.5.7"`, - "pack_logs_enabled": `true`, - "ssh_logging_enabled": `false`, + "parallelism": `7`, + "terraform_version": `"1.5.7"`, + "pack_logs_enabled": `true`, + "ssh_logging_enabled": `false`, + "assessment_log_retention_enabled": `true`, + "assessment_log_retention_days": `7`, + "assessment_retention_enabled": `false`, + "assessment_retention_days": `30`, } kvs := appConfigKVs(c) @@ -89,10 +121,14 @@ func TestFakeConfigStore_UpdateGetAppConfigRoundtrip(t *testing.T) { ctx := context.Background() want := config.AppConfig{ - Parallelism: 12, - TerraformVersion: "1.6.0", - PackLogsEnabled: false, - SSHLoggingEnabled: true, + Parallelism: 12, + TerraformVersion: "1.6.0", + PackLogsEnabled: false, + SSHLoggingEnabled: true, + AssessmentLogRetentionEnabled: false, + AssessmentLogRetentionDays: 14, + AssessmentRetentionEnabled: true, + AssessmentRetentionDays: 90, } require.NoError(t, f.UpdateAppConfig(ctx, want)) @@ -102,6 +138,8 @@ func TestFakeConfigStore_UpdateGetAppConfigRoundtrip(t *testing.T) { assert.Equal(t, []string{ "parallelism", "terraform_version", "pack_logs_enabled", "ssh_logging_enabled", + "assessment_log_retention_enabled", "assessment_log_retention_days", + "assessment_retention_enabled", "assessment_retention_days", }, f.sets) } diff --git a/internal/db/migrations/011_assessment_retention_appconfig.down.sql b/internal/db/migrations/011_assessment_retention_appconfig.down.sql new file mode 100644 index 0000000..411cc77 --- /dev/null +++ b/internal/db/migrations/011_assessment_retention_appconfig.down.sql @@ -0,0 +1,6 @@ +DELETE FROM app_config WHERE key IN ( + 'assessment_log_retention_enabled', + 'assessment_log_retention_days', + 'assessment_retention_enabled', + 'assessment_retention_days' +); diff --git a/internal/db/migrations/011_assessment_retention_appconfig.up.sql b/internal/db/migrations/011_assessment_retention_appconfig.up.sql new file mode 100644 index 0000000..086346a --- /dev/null +++ b/internal/db/migrations/011_assessment_retention_appconfig.up.sql @@ -0,0 +1,8 @@ +-- Backfill app_config with default values for retention settings. +-- Idempotent — only inserts if the key is missing. Aligned with DefaultAppConfig(). +INSERT INTO app_config (key, value) VALUES + ('assessment_log_retention_enabled', 'true'::jsonb), + ('assessment_log_retention_days', '7'::jsonb), + ('assessment_retention_enabled', 'false'::jsonb), + ('assessment_retention_days', '30'::jsonb) +ON CONFLICT (key) DO NOTHING; diff --git a/internal/db/runs.go b/internal/db/runs.go index 526304c..95df9be 100644 --- a/internal/db/runs.go +++ b/internal/db/runs.go @@ -16,6 +16,9 @@ type RunStore interface { Create(ctx context.Context, run *Run) error Get(ctx context.Context, id uuid.UUID) (*Run, error) List(ctx context.Context, filters ListRunsFilters, limit, offset int) (RunPage, error) + // ListExpired returns the IDs of runs created before cutoff whose status is + // not "running" — the assessment-retention sweeper's deletion candidates. + ListExpired(ctx context.Context, cutoff time.Time) ([]uuid.UUID, error) Update(ctx context.Context, id uuid.UUID, status string, total, succeeded, failed int, endTime *time.Time) error Delete(ctx context.Context, id uuid.UUID) error AddScenarioResult(ctx context.Context, runID uuid.UUID, result *ScenarioResult) error @@ -222,6 +225,26 @@ func buildRunsWhere(f ListRunsFilters) (string, []any) { return "WHERE " + strings.Join(clauses, " AND "), args } +func (s *runStore) ListExpired(ctx context.Context, cutoff time.Time) ([]uuid.UUID, error) { + rows, err := s.pool.Query(ctx, + `SELECT id FROM runs WHERE created_at < $1 AND status <> 'running'`, cutoff, + ) + if err != nil { + return nil, err + } + defer rows.Close() + + var ids []uuid.UUID + for rows.Next() { + var id uuid.UUID + if err := rows.Scan(&id); err != nil { + return nil, err + } + ids = append(ids, id) + } + return ids, rows.Err() +} + func (s *runStore) Update(ctx context.Context, id uuid.UUID, status string, total, succeeded, failed int, endTime *time.Time) error { _, err := s.pool.Exec(ctx, `UPDATE runs SET status = $2, total = $3, succeeded = $4, failed = $5, end_time = $6 diff --git a/internal/testutil/fakes/fakes.go b/internal/testutil/fakes/fakes.go index 2267cf0..f374d7e 100644 --- a/internal/testutil/fakes/fakes.go +++ b/internal/testutil/fakes/fakes.go @@ -116,6 +116,18 @@ func matchesRunFilters(r *db.Run, f db.ListRunsFilters) bool { return true } +func (s *RunStore) ListExpired(_ context.Context, cutoff time.Time) ([]uuid.UUID, error) { + s.mu.Lock() + defer s.mu.Unlock() + var ids []uuid.UUID + for id, r := range s.runs { + if r.CreatedAt.Before(cutoff) && r.Status != "running" { + ids = append(ids, id) + } + } + return ids, nil +} + func (s *RunStore) Update(_ context.Context, id uuid.UUID, status string, total, succeeded, failed int, endTime *time.Time) error { s.mu.Lock() defer s.mu.Unlock() @@ -703,6 +715,30 @@ func (s *ConfigStore) GetAppConfig(_ context.Context) (config.AppConfig, error) out.SSHLoggingEnabled = v } } + if raw, ok := s.data["assessment_log_retention_enabled"]; ok { + var v bool + if err := json.Unmarshal(raw, &v); err == nil { + out.AssessmentLogRetentionEnabled = v + } + } + if raw, ok := s.data["assessment_log_retention_days"]; ok { + var v int + if err := json.Unmarshal(raw, &v); err == nil && v > 0 { + out.AssessmentLogRetentionDays = v + } + } + if raw, ok := s.data["assessment_retention_enabled"]; ok { + var v bool + if err := json.Unmarshal(raw, &v); err == nil { + out.AssessmentRetentionEnabled = v + } + } + if raw, ok := s.data["assessment_retention_days"]; ok { + var v int + if err := json.Unmarshal(raw, &v); err == nil && v > 0 { + out.AssessmentRetentionDays = v + } + } return out, nil } @@ -710,10 +746,14 @@ func (s *ConfigStore) UpdateAppConfig(_ context.Context, c config.AppConfig) err s.mu.Lock() defer s.mu.Unlock() for k, v := range map[string]any{ - "parallelism": c.Parallelism, - "terraform_version": c.TerraformVersion, - "pack_logs_enabled": c.PackLogsEnabled, - "ssh_logging_enabled": c.SSHLoggingEnabled, + "parallelism": c.Parallelism, + "terraform_version": c.TerraformVersion, + "pack_logs_enabled": c.PackLogsEnabled, + "ssh_logging_enabled": c.SSHLoggingEnabled, + "assessment_log_retention_enabled": c.AssessmentLogRetentionEnabled, + "assessment_log_retention_days": c.AssessmentLogRetentionDays, + "assessment_retention_enabled": c.AssessmentRetentionEnabled, + "assessment_retention_days": c.AssessmentRetentionDays, } { raw, err := json.Marshal(v) if err != nil { diff --git a/internal/testutil/fakes/fakes_test.go b/internal/testutil/fakes/fakes_test.go new file mode 100644 index 0000000..76a8b9c --- /dev/null +++ b/internal/testutil/fakes/fakes_test.go @@ -0,0 +1,45 @@ +package fakes + +import ( + "context" + "testing" + "time" + + "github.com/IBM/simrun/internal/db" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// ListExpired is the assessment-retention sweeper's deletion query: it must +// return only runs created before the cutoff and must never return a run that +// is still running, even if old. (The production SQL is Postgres-bound and +// verified manually per the change's task 7.3; this pins the shared contract.) +func TestRunStore_ListExpired(t *testing.T) { + s := New().Run + ctx := context.Background() + cutoff := time.Now().Add(-7 * 24 * time.Hour) + + old := mustCreateRun(t, ctx, s, "completed", cutoff.Add(-time.Hour)) + recent := mustCreateRun(t, ctx, s, "completed", cutoff.Add(time.Hour)) + oldRunning := mustCreateRun(t, ctx, s, "running", cutoff.Add(-time.Hour)) + + ids, err := s.ListExpired(ctx, cutoff) + require.NoError(t, err) + + assert.Contains(t, ids, old, "old completed run should be expired") + assert.NotContains(t, ids, recent, "run newer than cutoff must be kept") + assert.NotContains(t, ids, oldRunning, "running run must be skipped even when old") +} + +func mustCreateRun(t *testing.T, ctx context.Context, s *RunStore, status string, createdAt time.Time) uuid.UUID { + t.Helper() + id := uuid.New() + require.NoError(t, s.Create(ctx, &db.Run{ + ID: id, + Status: status, + StartTime: createdAt, + CreatedAt: createdAt, + })) + return id +} diff --git a/internal/web/api_config_test.go b/internal/web/api_config_test.go new file mode 100644 index 0000000..4104af9 --- /dev/null +++ b/internal/web/api_config_test.go @@ -0,0 +1,51 @@ +package web_test + +import ( + "net/http" + "testing" + + "github.com/IBM/simrun/internal/testutil/testserver" + "github.com/IBM/simrun/internal/web" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// Retention day fields are floored at 1 so a config write cannot configure +// immediate deletion of run logs or assessments. +func TestHandleUpdateConfig_RetentionDaysRejectsZero(t *testing.T) { + for _, key := range []string{"assessment_log_retention_days", "assessment_retention_days"} { + t.Run(key, func(t *testing.T) { + ts := testserver.New(t) + + before, err := ts.Stores.Config.GetAppConfig(t.Context()) + require.NoError(t, err) + + resp := ts.Put(t, "/api/config", web.UpdateConfigRequest{ + Key: key, + Value: []byte(`0`), + }) + defer resp.Body.Close() + require.Equal(t, http.StatusBadRequest, resp.StatusCode) + + // The rejected write must leave the stored config untouched. + after, err := ts.Stores.Config.GetAppConfig(t.Context()) + require.NoError(t, err) + assert.Equal(t, before, after) + }) + } +} + +func TestHandleUpdateConfig_RetentionDaysPersistsValid(t *testing.T) { + ts := testserver.New(t) + + resp := ts.Put(t, "/api/config", web.UpdateConfigRequest{ + Key: "assessment_retention_days", + Value: []byte(`14`), + }) + defer resp.Body.Close() + require.Equal(t, http.StatusNoContent, resp.StatusCode) + + cfg, err := ts.Stores.Config.GetAppConfig(t.Context()) + require.NoError(t, err) + assert.Equal(t, 14, cfg.AssessmentRetentionDays) +} diff --git a/internal/web/api_runs_test.go b/internal/web/api_runs_test.go index 5a9cdf9..1f9b07c 100644 --- a/internal/web/api_runs_test.go +++ b/internal/web/api_runs_test.go @@ -2,6 +2,8 @@ package web_test import ( "net/http" + "os" + "path/filepath" "testing" "time" @@ -369,6 +371,78 @@ func TestHandleDeleteRun(t *testing.T) { assert.Nil(t, got) } +// Deleting a run must reclaim every on-disk artifact — the JSONL log and the +// large collected .ndjson files — not just the database row, so disk is +// actually freed. +func TestHandleDeleteRun_RemovesAllArtifacts(t *testing.T) { + ts := testserver.New(t) + ctx := t.Context() + + id := uuid.New() + require.NoError(t, ts.Stores.Run.Create(ctx, &db.Run{ + ID: id, + Status: "completed", + StartTime: time.Now(), + CreatedAt: time.Now(), + })) + + // JSONL log file. + w, err := web.NewRunLogWriter(ts.DataDir, id.String()) + require.NoError(t, err) + w.Write(web.RunLogEntry{Level: "info", Message: "hi"}) + require.NoError(t, w.Close()) + jsonlPath := filepath.Join(ts.DataDir, "run-logs", id.String()+".jsonl") + require.FileExists(t, jsonlPath) + + // Collected .ndjson artifact referenced by a scenario result. + ndjsonPath := filepath.Join(ts.DataDir, "collected-"+id.String()+".ndjson") + require.NoError(t, os.WriteFile(ndjsonPath, []byte(`{"a":1}`+"\n"), 0644)) + require.NoError(t, ts.Stores.Run.AddScenarioResult(ctx, id, &db.ScenarioResult{ + Name: "s1", + CollectedLogPath: &ndjsonPath, + })) + + resp := ts.Delete(t, "/api/runs/"+id.String()) + defer resp.Body.Close() + require.Equal(t, http.StatusNoContent, resp.StatusCode) + + _, err = ts.Stores.Run.Get(ctx, id) + assert.Error(t, err, "run row should be gone") + results, err := ts.Stores.Run.GetScenarioResults(ctx, id) + require.NoError(t, err) + assert.Empty(t, results, "scenario_results should be cascade-deleted") + assert.NoFileExists(t, jsonlPath, "JSONL log should be removed") + assert.NoFileExists(t, ndjsonPath, "collected .ndjson should be removed") +} + +// A collected file already gone from disk must not fail the delete. +func TestHandleDeleteRun_MissingCollectedFileStillSucceeds(t *testing.T) { + ts := testserver.New(t) + ctx := t.Context() + + id := uuid.New() + require.NoError(t, ts.Stores.Run.Create(ctx, &db.Run{ + ID: id, + Status: "completed", + StartTime: time.Now(), + CreatedAt: time.Now(), + })) + + // Points at a .ndjson that was never created on disk. + missing := filepath.Join(ts.DataDir, "gone-"+id.String()+".ndjson") + require.NoError(t, ts.Stores.Run.AddScenarioResult(ctx, id, &db.ScenarioResult{ + Name: "s1", + CollectedLogPath: &missing, + })) + + resp := ts.Delete(t, "/api/runs/"+id.String()) + defer resp.Body.Close() + require.Equal(t, http.StatusNoContent, resp.StatusCode) + + _, err := ts.Stores.Run.Get(ctx, id) + assert.Error(t, err, "run row should be gone") +} + func TestHandleGetRunLogs_NoLogs(t *testing.T) { ts := testserver.New(t) diff --git a/internal/web/frontend/.gitkeep b/internal/web/frontend/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/internal/web/handlers.go b/internal/web/handlers.go index 03d44ac..919d635 100644 --- a/internal/web/handlers.go +++ b/internal/web/handlers.go @@ -334,14 +334,11 @@ func (h *Handlers) HandleDeleteRun(w http.ResponseWriter, r *http.Request) { return } - if err := h.runStore.Delete(r.Context(), id); err != nil { + if err := deleteRunWithArtifacts(r.Context(), h.runStore, h.dataDir, id); err != nil { writeError(w, http.StatusInternalServerError, err.Error()) return } - // Clean up log file (best-effort) - DeleteRunLog(h.dataDir, id.String()) - w.WriteHeader(http.StatusNoContent) } @@ -430,6 +427,16 @@ func (h *Handlers) HandleUpdateConfig(w http.ResponseWriter, r *http.Request) { return } + // Retention day fields must be >= 1 so they cannot be set to a value that + // deletes data immediately. Other keys keep the permissive key/value behavior. + if req.Key == "assessment_log_retention_days" || req.Key == "assessment_retention_days" { + var days int + if err := json.Unmarshal(req.Value, &days); err != nil || days < 1 { + writeError(w, http.StatusBadRequest, req.Key+" must be at least 1") + return + } + } + if err := h.configStore.Set(r.Context(), req.Key, req.Value); err != nil { writeError(w, http.StatusInternalServerError, err.Error()) return diff --git a/internal/web/retention_test.go b/internal/web/retention_test.go new file mode 100644 index 0000000..de5d183 --- /dev/null +++ b/internal/web/retention_test.go @@ -0,0 +1,124 @@ +package web_test + +import ( + "context" + "os" + "path/filepath" + "testing" + "time" + + "github.com/IBM/simrun/internal/db" + "github.com/IBM/simrun/internal/testutil/fakes" + "github.com/IBM/simrun/internal/web" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// writeLog writes a run-log JSONL file and back-dates its mtime by ageDays. +func writeLog(t *testing.T, dataDir, runID string, ageDays int) string { + t.Helper() + w, err := web.NewRunLogWriter(dataDir, runID) + require.NoError(t, err) + w.Write(web.RunLogEntry{Level: "info", Message: "entry"}) + require.NoError(t, w.Close()) + path := filepath.Join(dataDir, "run-logs", runID+".jsonl") + when := time.Now().AddDate(0, 0, -ageDays) + require.NoError(t, os.Chtimes(path, when, when)) + return path +} + +// The log sweeper expires verbose logs by mtime but must never touch the runs +// row — the assessment summary outlives its logs. +func TestSweepRunLogs_DeletesAgedKeepsRecent(t *testing.T) { + dataDir := t.TempDir() + aged := writeLog(t, dataDir, uuid.NewString(), 10) + recent := writeLog(t, dataDir, uuid.NewString(), 1) + + web.SweepRunLogs(dataDir, true, 7) + + assert.NoFileExists(t, aged, "log older than 7 days should be swept") + assert.FileExists(t, recent, "log newer than 7 days should be kept") +} + +func TestSweepRunLogs_DisabledIsNoOp(t *testing.T) { + dataDir := t.TempDir() + aged := writeLog(t, dataDir, uuid.NewString(), 30) + + web.SweepRunLogs(dataDir, false, 7) + + assert.FileExists(t, aged, "disabled sweeper must delete nothing") +} + +// makeRun creates a run plus a JSONL log and a collected .ndjson artifact, +// returning the run ID and both file paths. +func makeRun(t *testing.T, ctx context.Context, store *fakes.RunStore, dataDir, status string, ageDays int) (uuid.UUID, string, string) { + t.Helper() + id := uuid.New() + created := time.Now().AddDate(0, 0, -ageDays) + require.NoError(t, store.Create(ctx, &db.Run{ + ID: id, + Status: status, + StartTime: created, + CreatedAt: created, + })) + + jsonl := writeLog(t, dataDir, id.String(), ageDays) + + ndjson := filepath.Join(dataDir, "collected-"+id.String()+".ndjson") + require.NoError(t, os.WriteFile(ndjson, []byte(`{"x":1}`+"\n"), 0644)) + require.NoError(t, store.AddScenarioResult(ctx, id, &db.ScenarioResult{ + Name: "s1", + CollectedLogPath: &ndjson, + })) + return id, jsonl, ndjson +} + +// The assessment sweeper purges aged completed runs entirely (row + results + +// JSONL + collected .ndjson), keeps recent runs, and never deletes a run still +// running even if it is old. +func TestSweepAssessments_PurgesAgedSkipsRunningAndRecent(t *testing.T) { + dataDir := t.TempDir() + store := fakes.New().Run + ctx := context.Background() + + oldID, oldJSONL, oldNDJSON := makeRun(t, ctx, store, dataDir, "completed", 40) + recentID, recentJSONL, _ := makeRun(t, ctx, store, dataDir, "completed", 1) + runningID, runningJSONL, _ := makeRun(t, ctx, store, dataDir, "running", 40) + + web.SweepAssessments(ctx, store, dataDir, true, 30) + + // Aged completed run: everything gone. + _, err := store.Get(ctx, oldID) + assert.Error(t, err, "aged run row should be deleted") + results, err := store.GetScenarioResults(ctx, oldID) + require.NoError(t, err) + assert.Empty(t, results, "aged run results should be cascade-deleted") + assert.NoFileExists(t, oldJSONL, "aged run JSONL should be removed") + assert.NoFileExists(t, oldNDJSON, "aged run collected .ndjson should be removed") + + // Recent run: untouched. + _, err = store.Get(ctx, recentID) + assert.NoError(t, err, "recent run should be kept") + assert.FileExists(t, recentJSONL, "recent run JSONL should be kept") + + // Old running run: untouched. + _, err = store.Get(ctx, runningID) + assert.NoError(t, err, "running run should be kept even when old") + assert.FileExists(t, runningJSONL, "running run JSONL should be kept") +} + +func TestSweepAssessments_DisabledIsNoOp(t *testing.T) { + dataDir := t.TempDir() + store := fakes.New().Run + ctx := context.Background() + + id, jsonl, ndjson := makeRun(t, ctx, store, dataDir, "completed", 40) + + web.SweepAssessments(ctx, store, dataDir, false, 30) + + _, err := store.Get(ctx, id) + assert.NoError(t, err, "disabled sweeper must keep the run") + assert.FileExists(t, jsonl) + assert.FileExists(t, ndjson) +} diff --git a/internal/web/run_deletion.go b/internal/web/run_deletion.go new file mode 100644 index 0000000..9ad0bdb --- /dev/null +++ b/internal/web/run_deletion.go @@ -0,0 +1,90 @@ +package web + +import ( + "context" + "os" + "path/filepath" + "time" + + "github.com/IBM/simrun/internal/db" + "github.com/google/uuid" + "github.com/sirupsen/logrus" +) + +// deleteRunWithArtifacts removes a run's database row (cascading to its +// scenario_results via FK) and best-effort removes all of its on-disk +// artifacts: the per-run JSONL log file and every collected .ndjson file +// referenced by scenario_results.collected_log_path. +// +// Collected paths are read before the row is deleted, since the cascade +// removes the scenario_results that hold them. On-disk removal is best-effort: +// a missing or unremovable file logs a warning but does not fail the delete, +// so a leftover artifact never blocks reclaiming the database row. +// +// Shared by manual delete (HandleDeleteRun) and the assessment-retention +// sweeper so both reclaim the large collected .ndjson artifacts identically. +func deleteRunWithArtifacts(ctx context.Context, runStore db.RunStore, dataDir string, runID uuid.UUID) error { + // Collect the .ndjson paths before deleting, while the rows still exist. + var collected []string + results, err := runStore.GetScenarioResults(ctx, runID) + if err != nil { + // A failed lookup must not strand the row: log and continue to delete. + logrus.WithError(err).WithField("run_id", runID). + Warn("failed to load scenario results for artifact cleanup; deleting run anyway") + } else { + for _, res := range results { + if res.CollectedLogPath != nil && *res.CollectedLogPath != "" { + collected = append(collected, *res.CollectedLogPath) + } + } + } + + if err := runStore.Delete(ctx, runID); err != nil { + return err + } + + // Best-effort: remove the JSONL log file. + DeleteRunLog(dataDir, runID.String()) + + // Best-effort: remove each collected .ndjson artifact. + for _, p := range collected { + clean := filepath.Clean(p) + if filepath.Ext(clean) != ".ndjson" { + // Guard against removing anything that isn't a collected log file, + // mirroring the download handler's path check. + logrus.WithField("path", p).WithField("run_id", runID). + Warn("skipping removal of non-.ndjson collected log path") + continue + } + if err := os.Remove(clean); err != nil && !os.IsNotExist(err) { + logrus.WithError(err).WithField("path", clean).WithField("run_id", runID). + Warn("failed to remove collected log file") + } + } + + return nil +} + +// SweepAssessments deletes whole runs (row + scenario_results + JSONL log + +// collected .ndjson artifacts) whose created_at is older than days. It is a +// no-op when enabled is false. Runs still in the "running" status are excluded +// by ListExpired, so an actively-writing run is never purged. A per-run delete +// failure is logged and the sweep continues with the remaining runs. +func SweepAssessments(ctx context.Context, runStore db.RunStore, dataDir string, enabled bool, days int) { + if !enabled { + return + } + + cutoff := time.Now().AddDate(0, 0, -days) + ids, err := runStore.ListExpired(ctx, cutoff) + if err != nil { + logrus.WithError(err).Warn("assessment sweep: failed to list expired runs") + return + } + + for _, id := range ids { + if err := deleteRunWithArtifacts(ctx, runStore, dataDir, id); err != nil { + logrus.WithError(err).WithField("run_id", id).Warn("assessment sweep: failed to delete expired run") + } + } +} diff --git a/internal/web/runlog.go b/internal/web/runlog.go index 8e8eba2..2d559b0 100644 --- a/internal/web/runlog.go +++ b/internal/web/runlog.go @@ -174,6 +174,42 @@ func DeleteRunLog(dataDir, runID string) { _ = os.Remove(filepath.Join(dataDir, "run-logs", runID+".jsonl")) } +// SweepRunLogs deletes run-log JSONL files in /run-logs whose last +// modification time is older than days. It is a no-op when enabled is false. +// Deleting a log file does not touch the corresponding runs row — only the +// verbose log expires. Best-effort: per-file failures are logged, not fatal. +func SweepRunLogs(dataDir string, enabled bool, days int) { + if !enabled { + return + } + + dir := filepath.Join(dataDir, "run-logs") + entries, err := os.ReadDir(dir) + if err != nil { + if !os.IsNotExist(err) { + logrus.WithError(err).WithField("dir", dir).Warn("run-log sweep: failed to read run-logs directory") + } + return + } + + cutoff := time.Now().AddDate(0, 0, -days) + for _, e := range entries { + if e.IsDir() || filepath.Ext(e.Name()) != ".jsonl" { + continue + } + info, err := e.Info() + if err != nil { + continue + } + if info.ModTime().Before(cutoff) { + path := filepath.Join(dir, e.Name()) + if err := os.Remove(path); err != nil && !os.IsNotExist(err) { + logrus.WithError(err).WithField("path", path).Warn("run-log sweep: failed to remove aged log file") + } + } + } +} + // ReadRunLog reads a run's JSONL log file and returns the entries. func ReadRunLog(dataDir, runID string) ([]RunLogEntry, error) { path := filepath.Join(dataDir, "run-logs", runID+".jsonl") diff --git a/openspec/changes/assessment-retention/.openspec.yaml b/openspec/changes/assessment-retention/.openspec.yaml new file mode 100644 index 0000000..3ac681e --- /dev/null +++ b/openspec/changes/assessment-retention/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-06-17 diff --git a/openspec/changes/assessment-retention/design.md b/openspec/changes/assessment-retention/design.md new file mode 100644 index 0000000..b7e1fe7 --- /dev/null +++ b/openspec/changes/assessment-retention/design.md @@ -0,0 +1,129 @@ +## Context + +Run logs are one JSONL file per run at `/run-logs/.jsonl` +(`internal/web/runlog.go`), removed only by `DeleteRunLog` when a run is +deleted. Whole assessments — the `runs` row, its `scenario_results`, and the +collected `.ndjson` files at `scenario_results.collected_log_path` — are never +cleaned up automatically. Both accumulate and filled the host's shared volume. + +Deployment is single-instance on a shared volume. Bounding growth — not +relocating bytes to Postgres/object storage — is what fixes the out-of-space +condition, so retention stays on disk + Postgres as-is. + +Two distinct lifetimes are wanted: verbose logs are disposable after a few days, +while assessment summaries may be kept longer before the whole run is purged. +Hence two independent retention windows. + +Existing patterns reused: +- `AppConfig` is a key/value model in `app_config`, parsed by `parseAppConfig` + and written by `appConfigKVs` (`internal/db/config.go`), served by + `GET/PUT /api/config`. New settings follow `pack_logs_enabled` exactly. +- A periodic background cleanup goroutine already exists for auth sessions + (`cmd/simrun/main.go:103`): ticker + `ctx.Done()`. Both sweepers copy it. +- `GET /api/runs/{runId}/logs` already returns `200 []` for a missing file, so + log-swept runs need no endpoint change. +- `RunStore.Delete` already cascades `scenario_results` via FK; the manual + delete handler (`HandleDeleteRun`) calls `DeleteRunLog` after. + +## Goals / Non-Goals + +**Goals:** +- Bound run-log disk usage by age (log sweeper, keeps the run record). +- Bound whole-assessment growth by age (assessment sweeper, removes row + + results + JSONL + collected `.ndjson`). +- Two independent, admin-tunable windows from the assessments page; sane defaults. +- Fix the existing leak where deleting a run leaves collected `.ndjson` behind. +- Minimal, surgical change; no new storage backend or infra. + +**Non-Goals:** +- A per-run log size cap. Measured: ~45 sims running for an hour produce only a + few MB of JSONL, so a single run cannot realistically exhaust the disk. +- Moving logs to Postgres or object storage (rejected in brainstorm: no benefit + on a shared volume, adds write amplification / infra). +- Retention for terraform work dirs or pack cache. +- Horizontal-scale / cross-instance access. + +## Decisions + +### Two independent windows, both enforced by background sweepers +- **Log retention** deletes `run-logs/*.jsonl` by file `mtime` older than + `assessment_log_retention_days`; keeps the `runs` row. Default **on, 7 days**. +- **Assessment retention** deletes whole runs by `created_at` older than + `assessment_retention_days`; removes the row (cascading results), the JSONL, + and the collected `.ndjson`. Default **off, 30 days** — opt-in because it + destroys results history. + +Both run as `go func()`s in `main.go` mirroring the session cleaner: once at +startup, then on a 1h ticker, reading `AppConfig` each tick so changes apply +without restart, exiting on `ctx.Done()`. The 1h interval is fixed (matches the +session cleaner), not a config knob. + +- *Why mtime for logs but created_at for assessments:* the log file's mtime is + the cheapest truth for "last touched" and also reclaims orphaned files; an + assessment's age is defined by when the run started, which is `created_at`. +- *Why skip `status = running`:* avoids deleting a run that is still actively + writing results/logs even if it has been running absurdly long. + +### Shared run-deletion helper (fixes the collected-`.ndjson` leak) +`HandleDeleteRun` today removes the row + JSONL but **not** the collected +`.ndjson` files, so those large SIEM-log artifacts leak on manual delete. The +assessment sweeper needs to remove them too. Extract a single helper that, given +a run ID: reads its `scenario_results` collected paths, calls `RunStore.Delete`, +then best-effort removes the JSONL and each `.ndjson` (rejecting paths not +ending in `.ndjson`, matching the existing collected-logs handler guard). Both +`HandleDeleteRun` and the assessment sweeper call it, so behavior is identical +and the leak is fixed once. + +- *Alternative considered — sweeper deletes rows then globs orphaned files:* + fragile (can't reliably tie an arbitrary `.ndjson` to a run); using the stored + `collected_log_path` is precise. + +### RunStore: query expired runs +Add a `RunStore` method to list runs with `created_at < cutoff` and +`status <> 'running'` (returning IDs; the helper then fetches each run's +collected paths via the existing `GetScenarioResults`). The existing `List` +filters support `Since` (>=); this adds the complementary cutoff for the +sweeper. Deletion reuses `RunStore.Delete`. + +### Validation in the PUT handler +`PUT /api/config` is a generic key/value setter today. The handler SHALL reject +`assessment_log_retention_days < 1` and `assessment_retention_days < 1` with HTTP 400. +Validation is keyed on the config `key` being set; other keys keep today's +permissive behavior. + +### Config surface: one dialog, both windows +An "Assessment retention" button on `web/frontend/src/routes/assessments` opens +a dialog (shadcn `Dialog` + `Label`/`Input`/`Switch`) with two enable toggles +and two day-counts, saving via `PUT /api/config` (one call per changed key). +Config is loaded lazily when the dialog opens. 400s surface in the dialog +without discarding input. + +## Risks / Trade-offs + +- **Assessment retention is destructive (deletes results history)** → shipped + off by default; admins opt in deliberately; validation floors at 1 day. +- **Mtime drift / clock skew on the host** → mtime/created_at are local to the + writing host, so relative age is consistent for a multi-day window. +- **Log retention deletes logs an operator still wanted** → default 7 days + matches stated "disposable after a few days"; the run summary persists. +- **Best-effort file removal can leave a stray file if a path is malformed** → + the `.ndjson` guard plus logging a warning on failure keeps it visible + without failing the sweep. +- **Two sweepers read AppConfig hourly (2 DB hits/hour)** → negligible, same + order as the session cleaner. + +## Migration Plan + +1. Add migration `011_assessment_retention_appconfig` (up: backfill the four + keys with defaults; down: delete them). Aligns with `DefaultAppConfig()`. +2. Ship code: new `AppConfig` fields, `parseAppConfig`/`appConfigKVs` entries, + shared run-deletion helper (reused by `HandleDeleteRun`), `RunStore` + expired-runs query, both sweeper goroutines, PUT validation, config dialog. +3. Rollback: revert the binary and run the migration `down`; on-disk files are + untouched by rollback. Accumulated logs are reclaimed on the first log sweep; + accumulated old assessments are reclaimed only after an admin opts in. + +## Open Questions + +- None outstanding. Sweep interval fixed at 1h; size cap dropped; assessment + retention default-off confirmed. diff --git a/openspec/changes/assessment-retention/proposal.md b/openspec/changes/assessment-retention/proposal.md new file mode 100644 index 0000000..f8f3393 --- /dev/null +++ b/openspec/changes/assessment-retention/proposal.md @@ -0,0 +1,61 @@ +## Why + +Run-log JSONL files (`/run-logs/.jsonl`) are deleted only when +their parent run is deleted, and whole assessments (the `runs` row, its +`scenario_results`, and the collected `.ndjson` log files) are never cleaned up +automatically. Both pile up without bound and recently filled the host's shared +volume. Operators treat run logs as disposable a few days after a run finishes, +and want a way to bound how long whole assessments are kept too. + +## What Changes + +- Add a background **log-retention sweeper** that deletes run-log JSONL files + older than a configurable age, keeping the assessment record. Run summaries + persist; their verbose logs expire. +- Add a background **assessment-retention sweeper** that deletes whole runs + older than a configurable age — the `runs` row (cascading to + `scenario_results`), the JSONL log file, **and** the collected `.ndjson` + files referenced by `collected_log_path` (the large SIEM-log artifacts), so + the disk is actually reclaimed. Disabled by default (opt-in), since it + destroys results history. +- Add a shared run-deletion helper that removes a run's row plus all of its + on-disk artifacts (JSONL + collected `.ndjson`). The manual + `DELETE /api/runs/{id}` is updated to use it so it no longer leaks collected + `.ndjson` files. +- Extend `AppConfig` with admin-tunable retention settings (log-retention + enable + days; assessment-retention enable + days), stored in `app_config` + and served by the existing `GET/PUT /api/config`. +- Add an **"Assessment retention" button + dialog** on the assessments page so + admins set both cleanup cadences from the UI. + +Relies on the existing `runs` behavior that a missing log file yields `200 []` +from `GET /api/runs/{runId}/logs`, so swept logs surface gracefully as "no +logs" rather than an error. + +## Capabilities + +### New Capabilities +- `assessment-retention`: Time-bounded lifecycle for run logs and whole + assessments — the two background sweepers, the shared run-deletion helper, + the admin-configurable retention settings, and the UI to tune them. + +### Modified Capabilities +- `runs`: The "Delete Run Cascades to Results and Log File" requirement is + extended so deleting a run also removes the collected `.ndjson` files, not + just the JSONL log. + +## Impact + +- **Code:** `internal/web/runlog.go` (log sweep helper), + `internal/web/handlers.go` (shared run-deletion helper; reused by + `HandleDeleteRun`), `internal/db/runs.go` (method to list/delete runs older + than a cutoff and return their artifact paths), `internal/config/appconfig.go` + (new fields + defaults), server startup (`cmd/simrun/main.go`) to start both + sweeper goroutines, a new `app_config` migration to backfill defaults. +- **Frontend:** `web/frontend/src/routes/assessments` — new button + dialog + (`web/frontend/src/lib/components/RetentionDialog.svelte`) wired to + `PUT /api/config`. +- **APIs:** `GET/PUT /api/config` payload gains four retention fields. No new + endpoints. +- **Dependencies / infra:** none. Single-host, shared-volume deployment; no new + storage backend. diff --git a/openspec/changes/assessment-retention/specs/assessment-retention/spec.md b/openspec/changes/assessment-retention/specs/assessment-retention/spec.md new file mode 100644 index 0000000..8ccf0de --- /dev/null +++ b/openspec/changes/assessment-retention/specs/assessment-retention/spec.md @@ -0,0 +1,102 @@ +## ADDED Requirements + +### Requirement: Retention Settings In AppConfig +The system SHALL extend `AppConfig` with retention settings persisted in the +`app_config` table and served by `GET /api/config` / `PUT /api/config`: +`assessment_log_retention_enabled` (bool), `assessment_log_retention_days` (int), +`assessment_retention_enabled` (bool), and `assessment_retention_days` (int). +Defaults SHALL be `assessment_log_retention_enabled = true`, +`assessment_log_retention_days = 7`, `assessment_retention_enabled = false`, +`assessment_retention_days = 30`, backfilled by a migration aligned with +`DefaultAppConfig()`. + +#### Scenario: Defaults when unset +- **WHEN** no `app_config` row exists for the retention keys +- **THEN** `GET /api/config` returns `assessment_log_retention_enabled = true`, `assessment_log_retention_days = 7`, `assessment_retention_enabled = false`, and `assessment_retention_days = 30` + +#### Scenario: Admin updates retention +- **WHEN** a client sends `PUT /api/config` with `assessment_retention_enabled = true` and `assessment_retention_days = 14` +- **THEN** both values are persisted and returned by a subsequent `GET /api/config` + +### Requirement: Retention Settings Validation +The system SHALL reject `PUT /api/config` with HTTP 400 when +`assessment_log_retention_days` or `assessment_retention_days` is less than 1, so +retention cannot be set to a value that deletes data immediately. + +#### Scenario: Zero log retention rejected +- **WHEN** a client sends `PUT /api/config` with `assessment_log_retention_days = 0` +- **THEN** the response is HTTP 400 and the stored value is unchanged + +#### Scenario: Zero assessment retention rejected +- **WHEN** a client sends `PUT /api/config` with `assessment_retention_days = 0` +- **THEN** the response is HTTP 400 and the stored value is unchanged + +### Requirement: Log-Retention Sweeper +The system SHALL run a background sweeper that periodically scans +`/run-logs/` and deletes any `.jsonl` file whose last +modification time is older than `assessment_log_retention_days`. The sweeper SHALL run +once at startup and then on a fixed 1-hour interval, SHALL re-read `AppConfig` +each tick, and SHALL be a no-op when `assessment_log_retention_enabled = false`. +Deleting a log file SHALL NOT delete or modify the corresponding `runs` row. + +#### Scenario: Old log swept +- **WHEN** a run's JSONL file was last modified longer ago than `assessment_log_retention_days` and log retention is enabled +- **THEN** the sweeper deletes the file and leaves the `runs` row intact + +#### Scenario: Recent log retained +- **WHEN** a run's JSONL file is newer than `assessment_log_retention_days` +- **THEN** the sweeper leaves the file in place + +#### Scenario: Log retention disabled +- **WHEN** `assessment_log_retention_enabled = false` +- **THEN** the log sweeper deletes no files regardless of age + +### Requirement: Assessment-Retention Sweeper +The system SHALL run a background sweeper that periodically deletes whole runs +whose `created_at` is older than `assessment_retention_days`. For each expired +run the system SHALL delete the `runs` row (cascading to `scenario_results`), +the run's JSONL log file, and every collected `.ndjson` file referenced by that +run's `scenario_results.collected_log_path`. The sweeper SHALL run once at +startup and then on a fixed 1-hour interval, SHALL re-read `AppConfig` each +tick, SHALL be a no-op when `assessment_retention_enabled = false`, and SHALL +skip runs whose `status` is still `running`. + +#### Scenario: Old assessment purged +- **WHEN** a completed run's `created_at` is older than `assessment_retention_days` and assessment retention is enabled +- **THEN** the `runs` row, its `scenario_results`, its JSONL log file, and all of its collected `.ndjson` files are deleted + +#### Scenario: Recent assessment retained +- **WHEN** a run's `created_at` is newer than `assessment_retention_days` +- **THEN** the sweeper leaves the run and all its artifacts in place + +#### Scenario: Assessment retention disabled +- **WHEN** `assessment_retention_enabled = false` +- **THEN** the assessment sweeper deletes no runs regardless of age + +#### Scenario: Running assessment skipped +- **WHEN** a run older than `assessment_retention_days` still has `status = "running"` +- **THEN** the sweeper does not delete it + +### Requirement: Swept Logs Surface As Empty +The system SHALL serve `GET /api/runs/{runId}/logs` with HTTP 200 and body `[]` +when the run's JSONL file has been swept by log retention, reusing the existing +missing-file behavior so an expired-log run is indistinguishable from one that +never logged. + +#### Scenario: Logs requested after sweep +- **WHEN** a client GETs logs for a run whose JSONL file was deleted by the log sweeper +- **THEN** the response is HTTP 200 with body `[]` + +### Requirement: Configure Retention From Assessments Page +The system SHALL present an "Assessment retention" control on the assessments +page that opens a dialog for editing `assessment_log_retention_enabled`, +`assessment_log_retention_days`, `assessment_retention_enabled`, and +`assessment_retention_days`, and SHALL persist changes via `PUT /api/config`. + +#### Scenario: Open and save +- **WHEN** an admin opens the dialog, enables assessment retention with a 14-day window, and saves +- **THEN** the new values are sent to `PUT /api/config` and reflected on the page after save + +#### Scenario: Invalid input surfaced +- **WHEN** an admin enters a retention period below 1 and saves +- **THEN** the API returns HTTP 400 and the dialog surfaces the error without losing the entered values diff --git a/openspec/changes/assessment-retention/specs/runs/spec.md b/openspec/changes/assessment-retention/specs/runs/spec.md new file mode 100644 index 0000000..f347cc7 --- /dev/null +++ b/openspec/changes/assessment-retention/specs/runs/spec.md @@ -0,0 +1,16 @@ +## MODIFIED Requirements + +### Requirement: Delete Run Cascades to Results and Log File +The system SHALL delete the `runs` row on `DELETE /api/runs/{runId}`, +cascade-delete all `scenario_results` rows via the FK, best-effort remove the +run's JSONL log file, and best-effort remove every collected `.ndjson` file +referenced by that run's `scenario_results.collected_log_path`. Failure to +delete any on-disk file SHALL NOT fail the request. + +#### Scenario: Successful delete +- **WHEN** a client deletes a run with 3 results +- **THEN** the `runs` row, all 3 `scenario_results` rows, the JSONL log file, and any collected `.ndjson` files for those results are removed + +#### Scenario: Collected log file missing +- **WHEN** a run is deleted and one of its `collected_log_path` files no longer exists on disk +- **THEN** the deletion still succeeds and the request returns HTTP 204 diff --git a/openspec/changes/assessment-retention/tasks.md b/openspec/changes/assessment-retention/tasks.md new file mode 100644 index 0000000..dda7a11 --- /dev/null +++ b/openspec/changes/assessment-retention/tasks.md @@ -0,0 +1,41 @@ +## 1. Config model + persistence + +- [x] 1.1 Add `AssessmentLogRetentionEnabled bool`, `AssessmentLogRetentionDays int`, `AssessmentRetentionEnabled bool`, `AssessmentRetentionDays int` to `config.AppConfig` and set defaults (`true`, `7`, `false`, `30`) in `DefaultAppConfig()` (`internal/config/appconfig.go`) +- [x] 1.2 Add the four keys to `parseAppConfig` (`> 0` guard for the day fields, like `parallelism`) and `appConfigKVs` (`internal/db/config.go`) +- [x] 1.3 Add migration `011_assessment_retention_appconfig` up/down backfilling the four keys with defaults, aligned with `DefaultAppConfig()` (`internal/db/migrations/`) +- [x] 1.4 Unit-test `parseAppConfig`/`appConfigKVs` round-trip and defaults for the new keys + +## 2. PUT /api/config validation + +- [x] 2.1 In `HandleUpdateConfig`, reject `assessment_log_retention_days < 1` and `assessment_retention_days < 1` with HTTP 400, leaving the stored value unchanged (`internal/web/handlers.go`) +- [x] 2.2 Test: PUT with either day field `= 0` → 400; valid values → persisted + +## 3. Shared run-deletion helper (fix collected-.ndjson leak) + +- [x] 3.1 Add a helper that, given a run ID, fetches its `scenario_results` collected paths, calls `RunStore.Delete`, then best-effort removes the JSONL log and each `collected_log_path` `.ndjson` (reject paths not ending in `.ndjson`, log warnings on failure) (`internal/web/handlers.go` or a small helper file) +- [x] 3.2 Refactor `HandleDeleteRun` to call the helper so manual delete also removes collected `.ndjson` +- [x] 3.3 Test: deleting a run with collected logs removes row, results, JSONL, and `.ndjson`; a missing `.ndjson` still returns 204 + +## 4. RunStore expired-runs query + +- [x] 4.1 Add a `RunStore` method returning IDs of runs with `created_at < cutoff` and `status <> 'running'` (`internal/db/runs.go`) +- [x] 4.2 Test: returns only runs older than the cutoff and excludes `running` runs + +## 5. Background sweepers + +- [x] 5.1 Add a log-retention sweep helper in `internal/web/runlog.go` deleting `/run-logs/*.jsonl` older than N days by `ModTime`; no-op when disabled +- [x] 5.2 Add an assessment-retention sweep that queries expired run IDs and deletes each via the shared helper from task 3; no-op when disabled +- [x] 5.3 Start both sweepers as `go func()`s in `cmd/simrun/main.go` mirroring the session-cleanup goroutine: run once at startup, then on a 1h ticker, reading `AppConfig` each tick, exiting on `ctx.Done()` +- [x] 5.4 Test: log sweeper deletes aged JSONL and leaves the `runs` row; assessment sweeper deletes aged completed runs + all artifacts, skips `running` runs, and both are no-ops when disabled + +## 6. Config page UI + +- [x] 6.1 Add an "Assessment retention" button + shadcn `Dialog` on `web/frontend/src/routes/assessments` with two toggles + two day-counts (`Label`/`Input`/`Switch`) +- [x] 6.2 Wire save to `PUT /api/config` (one call per changed key, matching existing config writes); reflect saved values on the page +- [x] 6.3 Surface a 400 from the API in the dialog without discarding entered values + +## 7. Verification + +- [x] 7.1 `go test ./...`, `mise run lint`, `mise run fmt` pass +- [ ] 7.2 Manual: set log retention to 1 day, confirm an aged JSONL is swept on next tick and `GET /api/runs/{id}/logs` returns `200 []` while the run record remains +- [ ] 7.3 Manual: enable assessment retention at 1 day, confirm an aged completed run is fully removed (row, results, JSONL, collected `.ndjson`) and a `running` run is left untouched diff --git a/web/frontend/src/lib/components/RetentionDialog.svelte b/web/frontend/src/lib/components/RetentionDialog.svelte new file mode 100644 index 0000000..33ab520 --- /dev/null +++ b/web/frontend/src/lib/components/RetentionDialog.svelte @@ -0,0 +1,158 @@ + + + + + + Assessment retention + + Control how long run logs and whole assessments are kept before they are deleted + automatically. + + + +
+
+
+ + +
+
+ + +

+ Verbose per-run logs older than this are removed; the assessment record is kept. +

+
+
+ +
+
+ + +
+
+ + +

+ Whole assessments older than this — results and collected logs included — are + permanently deleted. +

+
+
+ + {#if error} + + {error} + + {/if} +
+ +
+ + +
+
+
diff --git a/web/frontend/src/routes/assessments/+page.svelte b/web/frontend/src/routes/assessments/+page.svelte index aba53e4..71194b7 100644 --- a/web/frontend/src/routes/assessments/+page.svelte +++ b/web/frontend/src/routes/assessments/+page.svelte @@ -12,14 +12,21 @@ import { Input } from '$lib/components/ui/input/index.js'; import { Skeleton } from '$lib/components/ui/skeleton/index.js'; import { runs } from '$lib/stores/runs'; - import { listRuns, deleteRun, type RunFilters } from '$lib/api/client'; + import { listRuns, deleteRun, getConfig, type RunFilters } from '$lib/api/client'; import { goto } from '$app/navigation'; - import { statusVariant, formatDuration, formatUserEmail, scenarioTypeVariant } from '$lib/utils/format'; - import type { Run, ScenarioType } from '$lib/types'; + import { + statusVariant, + formatDuration, + formatUserEmail, + scenarioTypeVariant + } from '$lib/utils/format'; + import type { Run, ScenarioType, AppConfig } from '$lib/types'; import * as Tooltip from '$lib/components/ui/tooltip/index.js'; import NewAssessmentDialog from '$lib/components/NewAssessmentDialog.svelte'; + import RetentionDialog from '$lib/components/RetentionDialog.svelte'; import PenLineIcon from '@lucide/svelte/icons/pen-line'; import PlusIcon from '@lucide/svelte/icons/plus'; + import TimerIcon from '@lucide/svelte/icons/timer'; import TrashIcon from '@lucide/svelte/icons/trash-2'; import XIcon from '@lucide/svelte/icons/x'; import ChevronLeftIcon from '@lucide/svelte/icons/chevron-left'; @@ -41,6 +48,24 @@ let error = $state(''); let newAssessmentOpen = $state(false); + let retentionOpen = $state(false); + let retentionConfig = $state({}); + + // Load config lazily when opening retention settings so the common case + // (browsing assessments) doesn't pay for an extra request. + async function openRetention() { + try { + retentionConfig = await getConfig(); + retentionOpen = true; + } catch (e) { + error = e instanceof Error ? e.message : 'Failed to load retention settings'; + } + } + + function handleRetentionSaved(changes: Record) { + retentionConfig = { ...retentionConfig, ...changes }; + } + let deleteDialogOpen = $state(false); let deleteTarget = $state(null); let deleting = $state(false); @@ -172,7 +197,7 @@ async function changePerPage(value: string) { const next = Number(value); - if (!PAGE_SIZES.includes(next as typeof PAGE_SIZES[number])) return; + if (!PAGE_SIZES.includes(next as (typeof PAGE_SIZES)[number])) return; stopPolling(); perPage = next; page = 1; @@ -269,16 +294,21 @@ deleting = false; } } -

Assessment History

- +
+ + +
@@ -419,9 +449,7 @@ {run.total} {run.succeeded} - 0 ? 'text-destructive' : ''} - >{run.failed} + 0 ? 'text-destructive' : ''}>{run.failed} @@ -513,6 +541,12 @@ + +