Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ var serveCmd = cli.Command{
engine.TaskConfigValidate: tasks.NewConfigValidator(homeDir).Handler(),
engine.TaskConfigReload: tasks.NewConfigReloader(homeDir).Handler(),
engine.TaskMarkReady: tasks.MarkReadyHandler(),
engine.TaskRestartSeid: tasks.NewRestartSeider().Handler(),
engine.TaskConfigureGenesis: tasks.NewGenesisFetcher(homeDir, chainID, genesisBucket, genesisRegion, nil).Handler(),
engine.TaskConfigureStateSync: tasks.NewStateSyncConfigurer(homeDir, nil).Handler(),
engine.TaskSnapshotUpload: snapshotUploader.Handler(),
Expand Down
7 changes: 7 additions & 0 deletions sidecar/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,13 @@ func (c *SidecarClient) SubmitMarkReadyTask(ctx context.Context, task MarkReadyT
return c.SubmitTask(ctx, task.ToTaskRequest())
}

func (c *SidecarClient) SubmitRestartSeidTask(ctx context.Context, task RestartSeidTask) (uuid.UUID, error) {
if err := task.Validate(); err != nil {
return uuid.Nil, fmt.Errorf("task validation failed: %w", err)
}
return c.SubmitTask(ctx, task.ToTaskRequest())
}

func (c *SidecarClient) SubmitConfigureStateSyncTask(ctx context.Context, task ConfigureStateSyncTask) (uuid.UUID, error) {
if err := task.Validate(); err != nil {
return uuid.Nil, fmt.Errorf("task validation failed: %w", err)
Expand Down
14 changes: 14 additions & 0 deletions sidecar/client/tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ const (
TaskTypeConfigValidate = string(engine.TaskConfigValidate)
TaskTypeConfigReload = string(engine.TaskConfigReload)
TaskTypeMarkReady = string(engine.TaskMarkReady)
TaskTypeRestartSeid = string(engine.TaskRestartSeid)
TaskTypeConfigureGenesis = string(engine.TaskConfigureGenesis)
TaskTypeConfigureStateSync = string(engine.TaskConfigureStateSync)
TaskTypeSnapshotUpload = string(engine.TaskSnapshotUpload)
Expand Down Expand Up @@ -358,6 +359,19 @@ func (t MarkReadyTask) ToTaskRequest() TaskRequest {
return req
}

// RestartSeidTask restarts the co-located seid process in place so it
// re-reads config.toml without bouncing the sidecar. The task completes
// once seid's local RPC is serving again.
type RestartSeidTask struct{}

func (t RestartSeidTask) TaskType() string { return TaskTypeRestartSeid }
func (t RestartSeidTask) Validate() error { return nil }

func (t RestartSeidTask) ToTaskRequest() TaskRequest {
req := TaskRequest{Type: t.TaskType()}
return req
}

// SetGenesisPeersTask requests the sidecar to publish this node's peer
// entry to the shared genesis peers list (S3 coordinates derived from
// the sidecar environment).
Expand Down
1 change: 1 addition & 0 deletions sidecar/engine/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ const (
TaskConfigValidate TaskType = "config-validate"
TaskConfigReload TaskType = "config-reload"
TaskMarkReady TaskType = "mark-ready"
TaskRestartSeid TaskType = "restart-seid"
TaskConfigureGenesis TaskType = "configure-genesis"
TaskConfigureStateSync TaskType = "configure-state-sync"
TaskSnapshotUpload TaskType = "snapshot-upload"
Expand Down
233 changes: 233 additions & 0 deletions sidecar/tasks/restart_seid.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
package tasks

import (
"context"
"fmt"
"os"
"strconv"
"strings"
"syscall"
"time"

"github.com/sei-protocol/seictl/sidecar/actions"
"github.com/sei-protocol/seictl/sidecar/engine"
"github.com/sei-protocol/seictl/sidecar/rpc"
"github.com/sei-protocol/seilog"
)

var restartSeidLog = seilog.NewLogger("seictl", "task", "restart-seid")

const (
// restartSeidProcess is the comm/argv[0] of the validator process.
restartSeidProcess = "seid"

// restartSeidGracePeriod bounds the SIGTERM→exit window. A loaded validator's
// graceful shutdown (consensus WAL flush + PebbleDB/IAVL close, possibly
// mid-compaction) can run well past the idle-only ~3s figure, so the window
// is sized for the loaded-shutdown tail. If seid is still alive at the
// deadline the task fails loud and leaves the process running for an
// operator — it is never force-killed (a stuck-but-alive validator is safer
// than a SIGKILL mid-commit).
restartSeidGracePeriod = 90 * time.Second

// restartSeidUpTimeout bounds the wait for seid's local RPC to serve
// /status again after the restart. Cold-start replay can be slow on a
// loaded node; the engine has no retry, so this is the full budget. With
// graceful-only shutdown there is no SIGKILL-induced replay blowup, so 5m
// holds; revisit for very large archive nodes if replay outgrows it.
restartSeidUpTimeout = 5 * time.Minute

restartSeidUpPollInterval = 1 * time.Second

// restartSeidExitPollInterval is how often gracefulStop checks whether seid
// has exited after SIGTERM.
restartSeidExitPollInterval = 100 * time.Millisecond
)

// seidStartFinder scans /proc for the running `seid start` process. It
// corroborates argv[0]==seid with the "start" subcommand so it never matches
// seid-init or the bash wait-loop wrapper that share the PID namespace.
//
// It implements actions.ProcessSignaler so stopSeid can SIGTERM + poll it;
// FindPID ignores the name argument (the corroboration is baked in) and Signal
// / Alive delegate to the real syscall-backed package functions.
type seidStartFinder struct{}

func (seidStartFinder) FindPID(string) (int, error) {
entries, err := os.ReadDir("/proc")
if err != nil {
return 0, fmt.Errorf("reading /proc: %w", err)
}
for _, e := range entries {
if !e.IsDir() {
continue
}
pid, err := strconv.Atoi(e.Name())
if err != nil {
continue
}
comm, err := os.ReadFile(fmt.Sprintf("/proc/%d/comm", pid))
if err != nil || strings.TrimSpace(string(comm)) != restartSeidProcess {
continue
}
cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
if err != nil {
continue
}
if isSeidStart(cmdline) {
return pid, nil
}
}
return 0, fmt.Errorf("process %q not found in /proc", restartSeidProcess)
}

func (seidStartFinder) Signal(pid int, sig syscall.Signal) error { return actions.SignalPID(pid, sig) }

func (seidStartFinder) Alive(pid int) bool { return actions.PIDAlive(pid) }

// isSeidStart reports whether a null-delimited /proc cmdline is `seid start ...`,
// matching both a bare "seid" and an absolute path ending in "/seid".
func isSeidStart(cmdline []byte) bool {
args := strings.Split(strings.TrimRight(string(cmdline), "\x00"), "\x00")
if len(args) < 2 {
return false
}
exe := args[0]
if exe != restartSeidProcess && !strings.HasSuffix(exe, "/"+restartSeidProcess) {
return false
}
return args[1] == "start"
}

// RestartSeider restarts the co-located seid process in place: it SIGTERMs seid
// and waits for it to exit gracefully (the kubelet restarts the container), then
// waits for seid's local RPC to serve again. seid re-reads config.toml on this
// restart without bouncing the sidecar. The handler never starts seid and never
// flips the engine ready flag — it is not a readiness operation.
//
// Shutdown is graceful-only and fail-loud: if seid does not exit within the
// grace window it is left running and the task fails (never SIGKILLed). A
// force-kill opt-in is intentionally omitted until a non-validator forced
// restart needs it.
//
// Completion means "seid's RPC is serving /status again," NOT "caught up /
// voting." Callers that need in-service-and-voting must gate height / caught-up
// separately (downstream AwaitNodesAtHeight).
//
// The three OS interactions are injectable for testing:
// - signaler: process discovery + SIGTERM (defaults to a /proc + syscall
// implementation that corroborates `seid start`).
// - probeUp: returns true once seid's local RPC answers /status (defaults to
// a local CometBFT /status probe).
type RestartSeider struct {
signaler actions.ProcessSignaler
probeUp func(ctx context.Context) bool
gracePeriod time.Duration
upTimeout time.Duration
upInterval time.Duration
}

// NewRestartSeider builds a RestartSeider with the real /proc + syscall +
// local-RPC implementations.
func NewRestartSeider() *RestartSeider {
statusClient := rpc.NewStatusClient("", nil)
return &RestartSeider{
signaler: seidStartFinder{},
probeUp: func(ctx context.Context) bool { return seidRPCUp(ctx, statusClient) },
gracePeriod: restartSeidGracePeriod,
upTimeout: restartSeidUpTimeout,
upInterval: restartSeidUpPollInterval,
}
}

// seidRPCUp reports whether seid's local RPC answers /status (latest_block_height
// parses); any successful parse counts as RPC up. A transport or parse error
// (RPC not yet listening) returns false.
func seidRPCUp(ctx context.Context, c *rpc.StatusClient) bool {
if _, err := c.Status(ctx); err != nil {
return false
}
return true
}

// Handler returns an engine.TaskHandler for the restart-seid task type.
// Params are empty: restart-seid is a fire-and-confirm operation.
func (r *RestartSeider) Handler() engine.TaskHandler {
return engine.TypedHandler(func(ctx context.Context, _ struct{}) error {
if err := r.stopSeid(ctx); err != nil {
return err
}
return r.waitForUp(ctx)
})
}

// stopSeid SIGTERMs seid and waits for it to exit gracefully. When the process
// is not found in /proc we disambiguate: if seid's RPC is serving, the process
// exists but is invisible to us (e.g. a non-shared-PID-ns profile) and we refuse
// to report a restart that did not happen; if its RPC is down, seid is genuinely
// stopped or mid-restart (including the entrypoint's bash wait-loop window before
// it execs seid) and we proceed straight to waitForUp.
func (r *RestartSeider) stopSeid(ctx context.Context) error {
pid, err := r.signaler.FindPID(restartSeidProcess)
if err != nil {
if r.probeUp(ctx) {
return fmt.Errorf("seid RPC is serving but its process was not found in /proc: %w — refusing to report a restart that did not happen", err)
}
restartSeidLog.Info("seid process not running and RPC down; proceeding to wait for it to come up",
"reason", err.Error())
return nil
}
restartSeidLog.Info("restarting seid in place", "pid", pid, "grace", r.gracePeriod)
return r.gracefulStop(ctx, pid)
}

// gracefulStop SIGTERMs pid and polls until it exits or the grace window
// elapses. It never escalates to SIGKILL: if seid is still alive at the
// deadline the task fails and the process is left running for an operator.
func (r *RestartSeider) gracefulStop(ctx context.Context, pid int) error {
if err := r.signaler.Signal(pid, syscall.SIGTERM); err != nil {
return fmt.Errorf("sending SIGTERM to seid pid %d: %w", pid, err)
}

ticker := time.NewTicker(restartSeidExitPollInterval)
defer ticker.Stop()
deadline := time.After(r.gracePeriod)

for {
select {
case <-ctx.Done():
return ctx.Err()
case <-deadline:
return fmt.Errorf("seid pid %d still alive %s after SIGTERM; leaving it running (not force-killing a validator mid-commit)", pid, r.gracePeriod)
case <-ticker.C:
if !r.signaler.Alive(pid) {
restartSeidLog.Info("seid exited after SIGTERM", "pid", pid)
return nil
}
}
}
}

// waitForUp polls seid's local RPC until it serves /status or the timeout
// elapses. Success here is the completion signal for the in-place restart.
func (r *RestartSeider) waitForUp(ctx context.Context) error {
deadline := time.Now().Add(r.upTimeout)
ticker := time.NewTicker(r.upInterval)
defer ticker.Stop()

restartSeidLog.Info("waiting for seid RPC to come back up", "timeout", r.upTimeout)
for {
if r.probeUp(ctx) {
restartSeidLog.Info("seid RPC is up; restart complete")
return nil
Comment thread
cursor[bot] marked this conversation as resolved.
Comment thread
cursor[bot] marked this conversation as resolved.
}
select {
case <-ctx.Done():
return ctx.Err()
case <-ticker.C:
if time.Now().After(deadline) {
return fmt.Errorf("seid RPC did not come up within %s after restart", r.upTimeout)
}
}
}
}
Loading
Loading