Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 41 additions & 4 deletions internal/detonators/simrun_detonator.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,17 @@ func (d *SimrunDetonator) Detonate() (map[string]string, error) {
return nil, fmt.Errorf("failed to generate execution ID: %w", err)
}

// Carry execution_id out on every failure path past this point so a
// partially-completed detonation (e.g. terraform applied, then the
// detonation timed out) stays correlatable in the persisted result.
failure := map[string]string{"execution_id": executionID}

d.setupLoggingAndTerraform(executionID)

// Get manifest and find simulation
simulation, err := d.resolveSimulation(ctx)
if err != nil {
return nil, err
return failure, err
}

d.logSimulationInfo(simulation)
Expand All @@ -115,7 +120,7 @@ func (d *SimrunDetonator) Detonate() (map[string]string, error) {
}
terraformOutputs, tfWorkDir, err := d.setupTerraform(ctx, executionID, simulation)
if err != nil {
return nil, err
return failure, err
}
if tfWorkDir != "" {
defer d.cleanupTerraform(ctx, executionID, tfWorkDir)
Expand All @@ -125,7 +130,7 @@ func (d *SimrunDetonator) Detonate() (map[string]string, error) {
d.reportPhase("detonating")
detonateOutput, err := d.executeSimulation(ctx, executionID, terraformOutputs)
if err != nil {
return nil, err
return failure, err
}

// Run custom cleanup if needed
Expand Down Expand Up @@ -217,6 +222,38 @@ func (d *SimrunDetonator) terraformEnvVars(executionID string) map[string]string
return vars
}

// detonationEnvVars returns the environment for the pack runner. It augments
// the run env with AWS_REGION/AWS_DEFAULT_REGION derived from the resolved
// aws_region parameter. Terraform pins the AWS provider to var.aws_region, but
// the detonation resolves region via the AWS SDK default chain (ambient
// AWS_REGION). Without aligning them, terraform creates resources in
// var.aws_region (default us-east-1) while the detonation acts on the
// container's ambient region — producing NotFound errors. envutil merges this
// map over the process env, so the value here wins over any ambient AWS_REGION.
func (d *SimrunDetonator) detonationEnvVars() map[string]string {
env := make(map[string]string, len(d.envVars)+2)
for k, v := range d.envVars {
env[k] = v
}
region := d.resolveAWSRegion()
env["AWS_REGION"] = region
env["AWS_DEFAULT_REGION"] = region
return env
}

// resolveAWSRegion resolves the AWS region the detonation should use, mirroring
// the precedence terraformEnvVars applies to TF_VAR_aws_region: per-sim
// scenario param > pack-level param > built-in default.
func (d *SimrunDetonator) resolveAWSRegion() string {
if r, ok := d.params["aws_region"].(string); ok && r != "" {
return r
}
if r, ok := d.packConfig.Parameters["aws_region"].(string); ok && r != "" {
return r
}
return pack.DefaultAWSRegion
}

// formatTFVar renders a parameter value into the string form Terraform
// accepts for TF_VAR_*. Map and slice values are JSON-encoded so map/list
// typed Terraform variables receive a parseable HCL/JSON literal.
Expand Down Expand Up @@ -398,7 +435,7 @@ func (d *SimrunDetonator) logSimulationInfo(simulation *pack.SimulationManifest)

// createExecutor creates a pack runner and executor for simulation execution
func (d *SimrunDetonator) createExecutor(ctx context.Context, executionID string) (*executor.Executor, runner.PackRunner, error) {
packRunner, err := d.runnerFactory.CreateRunner(ctx, d.packConfig, d.envVars)
packRunner, err := d.runnerFactory.CreateRunner(ctx, d.packConfig, d.detonationEnvVars())
if err != nil {
return nil, nil, fmt.Errorf("failed to create pack runner: %w", err)
}
Expand Down
56 changes: 56 additions & 0 deletions internal/detonators/simrun_detonator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,64 @@ import (
"testing"

"github.com/IBM/simrun/internal/config"
"github.com/IBM/simrun/pack"
)

// TestDetonationEnvVars_AWSRegion verifies the detonation env pins
// AWS_REGION/AWS_DEFAULT_REGION to the same aws_region value terraform's
// provider uses. Without this, terraform creates resources in var.aws_region
// (default us-east-1) while the detonation SDK resolves region from the
// container's ambient AWS_REGION, so deletes/lookups hit the wrong region and
// fail with NotFound. Precedence mirrors terraformEnvVars: per-sim > pack-level
// > built-in default.
func TestDetonationEnvVars_AWSRegion(t *testing.T) {
tests := []struct {
name string
envVars map[string]string
packParams map[string]any
simParams map[string]any
want string
}{
{
name: "unset falls back to built-in default",
want: pack.DefaultAWSRegion,
},
{
name: "pack-level value used",
packParams: map[string]any{"aws_region": "eu-west-1"},
want: "eu-west-1",
},
{
name: "per-sim overrides pack-level",
packParams: map[string]any{"aws_region": "eu-west-1"},
simParams: map[string]any{"aws_region": "ap-south-1"},
want: "ap-south-1",
},
{
name: "param overrides ambient run-env region",
envVars: map[string]string{"AWS_REGION": "us-west-2"},
packParams: map[string]any{"aws_region": "eu-west-1"},
want: "eu-west-1",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
d := &SimrunDetonator{
packConfig: config.PackConfig{Parameters: tt.packParams},
params: tt.simParams,
envVars: tt.envVars,
}
env := d.detonationEnvVars()
if env["AWS_REGION"] != tt.want {
t.Errorf("AWS_REGION = %q, want %q", env["AWS_REGION"], tt.want)
}
if env["AWS_DEFAULT_REGION"] != tt.want {
t.Errorf("AWS_DEFAULT_REGION = %q, want %q", env["AWS_DEFAULT_REGION"], tt.want)
}
})
}
}

func TestTerraformEnvVars_PerSimOverridesPackLevel(t *testing.T) {
d := &SimrunDetonator{
packConfig: config.PackConfig{
Expand Down
5 changes: 4 additions & 1 deletion internal/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,10 @@ func (m *Runner) runScenario(scenario *Scenario) (string, float64, error) {

executionOutput, logger, err := m.executeScenario(scenario)
if err != nil {
return "", 0, err
// Preserve the execution_id even on failure so a partially-completed
// detonation (e.g. terraform applied, detonation timed out) stays
// correlatable. Indexing a nil map yields "".
return executionOutput["execution_id"], 0, err
}

indicators := m.buildIndicatorsList(scenario, executionOutput)
Expand Down
11 changes: 10 additions & 1 deletion internal/runner/runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func TestRunnerErrorHandling(t *testing.T) {
mockDetonator.On("SetStatusCallback", mock.AnythingOfType("func(string)")).Return()

mockFailingDetonator := &detonatorMocks.MockDetonator{}
mockFailingDetonator.On("Detonate").Return(map[string]string{}, errors.New("foo"))
mockFailingDetonator.On("Detonate").Return(map[string]string{"execution_id": "failed-uid"}, errors.New("foo"))
mockFailingDetonator.On("String").Return("mock-failing-detonator")
mockFailingDetonator.On("SimulationId").Return("failing-simulation")
mockFailingDetonator.On("PackName").Return("")
Expand Down Expand Up @@ -155,6 +155,15 @@ func TestRunnerErrorHandling(t *testing.T) {
assert.Equal(t, 2, successCount)
assert.Equal(t, 1, failedCount)

// A failed scenario must still carry its execution_id so the result can be
// correlated with the partial detonation (e.g. terraform that did apply).
for _, result := range results {
if !result.Success {
assert.Equal(t, "failed-uid", result.ExecutionId,
"failed scenario should retain the execution_id from the detonation output")
}
}

// All scenarios should have been detonated, even if one returned an error
mockDetonator.AssertNumberOfCalls(t, "Detonate", 2)
mockFailingDetonator.AssertNumberOfCalls(t, "Detonate", 1)
Expand Down
93 changes: 93 additions & 0 deletions internal/web/retention_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,96 @@ func TestSweepAssessments_DisabledIsNoOp(t *testing.T) {
assert.FileExists(t, jsonl)
assert.FileExists(t, ndjson)
}

// seedTerraformDir creates <dataDir>/terraform/<execID>/ with nested state and
// plugin contents, mirroring a real Terraform working directory so removal must
// recurse (os.RemoveAll), not a flat os.Remove.
func seedTerraformDir(t *testing.T, dataDir, execID string) string {
t.Helper()
dir := filepath.Join(dataDir, "terraform", execID)
require.NoError(t, os.MkdirAll(filepath.Join(dir, ".terraform"), 0o755))
require.NoError(t, os.WriteFile(filepath.Join(dir, "terraform.tfstate"), []byte("{}"), 0o644))
return dir
}

// makeAgedRunWithExecutions creates an aged completed run whose scenario results
// carry the given execution IDs (one result per ID), returning the run ID.
func makeAgedRunWithExecutions(t *testing.T, ctx context.Context, store *fakes.RunStore, dataDir string, execIDs ...string) uuid.UUID {
t.Helper()
id := uuid.New()
created := time.Now().AddDate(0, 0, -40)
require.NoError(t, store.Create(ctx, &db.Run{
ID: id,
Status: "completed",
StartTime: created,
CreatedAt: created,
}))
writeLog(t, dataDir, id.String(), 40)
for i, execID := range execIDs {
require.NoError(t, store.AddScenarioResult(ctx, id, &db.ScenarioResult{
Name: "s" + uuid.NewString()[:4] + "-" + string(rune('a'+i)),
ExecutionID: execID,
}))
}
return id
}

// Deleting a run reclaims the per-execution Terraform working directory for each
// of its scenario results — the disk space these dirs hold is the whole point.
func TestSweepAssessments_RemovesTerraformDirs(t *testing.T) {
dataDir := t.TempDir()
store := fakes.New().Run
ctx := context.Background()

e1, e2 := uuid.NewString(), uuid.NewString()
id := makeAgedRunWithExecutions(t, ctx, store, dataDir, e1, e2)
dir1 := seedTerraformDir(t, dataDir, e1)
dir2 := seedTerraformDir(t, dataDir, e2)

web.SweepAssessments(ctx, store, dataDir, true, 30)

_, err := store.Get(ctx, id)
assert.Error(t, err, "aged run row should be deleted")
assert.NoDirExists(t, dir1, "Terraform dir for E1 should be removed")
assert.NoDirExists(t, dir2, "Terraform dir for E2 should be removed")
}

// An unsafe execution_id must never cause cleanup to escape or wipe the
// terraform/ base directory — a blank id would otherwise RemoveAll the base.
func TestSweepAssessments_SkipsUnsafeExecutionID(t *testing.T) {
for _, execID := range []string{"", " ", "../escape", "nested/id", ".", ".."} {
t.Run("id="+execID, func(t *testing.T) {
dataDir := t.TempDir()
store := fakes.New().Run
ctx := context.Background()

// A sibling dir keyed on a safe id proves only the unsafe id is skipped.
safe := uuid.NewString()
id := makeAgedRunWithExecutions(t, ctx, store, dataDir, execID, safe)
safeDir := seedTerraformDir(t, dataDir, safe)
base := filepath.Join(dataDir, "terraform")

web.SweepAssessments(ctx, store, dataDir, true, 30)

_, err := store.Get(ctx, id)
assert.Error(t, err, "aged run row should still be deleted")
assert.DirExists(t, base, "terraform base dir must survive an unsafe id")
assert.NoDirExists(t, safeDir, "the safe sibling id should still be cleaned up")
})
}
}

// A missing Terraform dir must not fail the delete — removal is best-effort.
func TestSweepAssessments_MissingTerraformDirIsBestEffort(t *testing.T) {
dataDir := t.TempDir()
store := fakes.New().Run
ctx := context.Background()

// No terraform dir is seeded for this execution id.
id := makeAgedRunWithExecutions(t, ctx, store, dataDir, uuid.NewString())

web.SweepAssessments(ctx, store, dataDir, true, 30)

_, err := store.Get(ctx, id)
assert.Error(t, err, "delete should succeed even though the Terraform dir was missing")
}
36 changes: 29 additions & 7 deletions internal/web/run_deletion.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"os"
"path/filepath"
"strings"
"time"

"github.com/IBM/simrun/internal/db"
Expand All @@ -13,19 +14,22 @@ import (

// deleteRunWithArtifacts removes a run's database row (cascading to its
// scenario_results via FK) and best-effort removes all of its on-disk
// artifacts: the per-run JSONL log file and every collected .ndjson file
// referenced by scenario_results.collected_log_path.
// artifacts: the per-run JSONL log file, every collected .ndjson file
// referenced by scenario_results.collected_log_path, and each scenario
// result's Terraform working directory at <dataDir>/terraform/<execution_id>/.
//
// Collected paths are read before the row is deleted, since the cascade
// removes the scenario_results that hold them. On-disk removal is best-effort:
// a missing or unremovable file logs a warning but does not fail the delete,
// so a leftover artifact never blocks reclaiming the database row.
// Collected paths and execution IDs are read before the row is deleted, since
// the cascade removes the scenario_results that hold them. On-disk removal is
// best-effort: a missing or unremovable file logs a warning but does not fail
// the delete, so a leftover artifact never blocks reclaiming the database row.
//
// Shared by manual delete (HandleDeleteRun) and the assessment-retention
// sweeper so both reclaim the large collected .ndjson artifacts identically.
func deleteRunWithArtifacts(ctx context.Context, runStore db.RunStore, dataDir string, runID uuid.UUID) error {
// Collect the .ndjson paths before deleting, while the rows still exist.
// Collect the .ndjson paths and execution IDs before deleting, while the
// rows still exist.
var collected []string
var executionIDs []string
results, err := runStore.GetScenarioResults(ctx, runID)
if err != nil {
// A failed lookup must not strand the row: log and continue to delete.
Expand All @@ -36,6 +40,7 @@ func deleteRunWithArtifacts(ctx context.Context, runStore db.RunStore, dataDir s
if res.CollectedLogPath != nil && *res.CollectedLogPath != "" {
collected = append(collected, *res.CollectedLogPath)
}
executionIDs = append(executionIDs, res.ExecutionID)
}
}

Expand All @@ -62,6 +67,23 @@ func deleteRunWithArtifacts(ctx context.Context, runStore db.RunStore, dataDir s
}
}

// Best-effort: remove each scenario result's Terraform working directory at
// <dataDir>/terraform/<execution_id>/.
base := filepath.Join(dataDir, "terraform")
for _, id := range executionIDs {
dir := filepath.Join(base, id)
// Skip blank/whitespace ids and any id that doesn't resolve to a direct
// child of terraform/. filepath.Join cleans the result, so separators,
// "." and ".." would otherwise let cleanup escape or wipe the base dir.
if strings.TrimSpace(id) == "" || filepath.Dir(dir) != base {
continue
}
if err := os.RemoveAll(dir); err != nil && !os.IsNotExist(err) {
logrus.WithError(err).WithField("path", dir).WithField("run_id", runID).
Warn("failed to remove Terraform working directory")
}
}

return nil
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
schema: spec-driven
created: 2026-06-22
Loading