-
Notifications
You must be signed in to change notification settings - Fork 2
feat(sidecar): add restart-seid task for in-place seid restart #199
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,233 @@ | ||
| package tasks | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "os" | ||
| "strconv" | ||
| "strings" | ||
| "syscall" | ||
| "time" | ||
|
|
||
| "github.com/sei-protocol/seictl/sidecar/actions" | ||
| "github.com/sei-protocol/seictl/sidecar/engine" | ||
| "github.com/sei-protocol/seictl/sidecar/rpc" | ||
| "github.com/sei-protocol/seilog" | ||
| ) | ||
|
|
||
| var restartSeidLog = seilog.NewLogger("seictl", "task", "restart-seid") | ||
|
|
||
| const ( | ||
| // restartSeidProcess is the comm/argv[0] of the validator process. | ||
| restartSeidProcess = "seid" | ||
|
|
||
| // restartSeidGracePeriod bounds the SIGTERM→exit window. A loaded validator's | ||
| // graceful shutdown (consensus WAL flush + PebbleDB/IAVL close, possibly | ||
| // mid-compaction) can run well past the idle-only ~3s figure, so the window | ||
| // is sized for the loaded-shutdown tail. If seid is still alive at the | ||
| // deadline the task fails loud and leaves the process running for an | ||
| // operator — it is never force-killed (a stuck-but-alive validator is safer | ||
| // than a SIGKILL mid-commit). | ||
| restartSeidGracePeriod = 90 * time.Second | ||
|
|
||
| // restartSeidUpTimeout bounds the wait for seid's local RPC to serve | ||
| // /status again after the restart. Cold-start replay can be slow on a | ||
| // loaded node; the engine has no retry, so this is the full budget. With | ||
| // graceful-only shutdown there is no SIGKILL-induced replay blowup, so 5m | ||
| // holds; revisit for very large archive nodes if replay outgrows it. | ||
| restartSeidUpTimeout = 5 * time.Minute | ||
|
|
||
| restartSeidUpPollInterval = 1 * time.Second | ||
|
|
||
| // restartSeidExitPollInterval is how often gracefulStop checks whether seid | ||
| // has exited after SIGTERM. | ||
| restartSeidExitPollInterval = 100 * time.Millisecond | ||
| ) | ||
|
|
||
| // seidStartFinder scans /proc for the running `seid start` process. It | ||
| // corroborates argv[0]==seid with the "start" subcommand so it never matches | ||
| // seid-init or the bash wait-loop wrapper that share the PID namespace. | ||
| // | ||
| // It implements actions.ProcessSignaler so stopSeid can SIGTERM + poll it; | ||
| // FindPID ignores the name argument (the corroboration is baked in) and Signal | ||
| // / Alive delegate to the real syscall-backed package functions. | ||
| type seidStartFinder struct{} | ||
|
|
||
| func (seidStartFinder) FindPID(string) (int, error) { | ||
| entries, err := os.ReadDir("/proc") | ||
| if err != nil { | ||
| return 0, fmt.Errorf("reading /proc: %w", err) | ||
| } | ||
| for _, e := range entries { | ||
| if !e.IsDir() { | ||
| continue | ||
| } | ||
| pid, err := strconv.Atoi(e.Name()) | ||
| if err != nil { | ||
| continue | ||
| } | ||
| comm, err := os.ReadFile(fmt.Sprintf("/proc/%d/comm", pid)) | ||
| if err != nil || strings.TrimSpace(string(comm)) != restartSeidProcess { | ||
| continue | ||
| } | ||
| cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid)) | ||
| if err != nil { | ||
| continue | ||
| } | ||
| if isSeidStart(cmdline) { | ||
| return pid, nil | ||
| } | ||
| } | ||
| return 0, fmt.Errorf("process %q not found in /proc", restartSeidProcess) | ||
| } | ||
|
|
||
| func (seidStartFinder) Signal(pid int, sig syscall.Signal) error { return actions.SignalPID(pid, sig) } | ||
|
|
||
| func (seidStartFinder) Alive(pid int) bool { return actions.PIDAlive(pid) } | ||
|
|
||
| // isSeidStart reports whether a null-delimited /proc cmdline is `seid start ...`, | ||
| // matching both a bare "seid" and an absolute path ending in "/seid". | ||
| func isSeidStart(cmdline []byte) bool { | ||
| args := strings.Split(strings.TrimRight(string(cmdline), "\x00"), "\x00") | ||
| if len(args) < 2 { | ||
| return false | ||
| } | ||
| exe := args[0] | ||
| if exe != restartSeidProcess && !strings.HasSuffix(exe, "/"+restartSeidProcess) { | ||
| return false | ||
| } | ||
| return args[1] == "start" | ||
| } | ||
|
|
||
| // RestartSeider restarts the co-located seid process in place: it SIGTERMs seid | ||
| // and waits for it to exit gracefully (the kubelet restarts the container), then | ||
| // waits for seid's local RPC to serve again. seid re-reads config.toml on this | ||
| // restart without bouncing the sidecar. The handler never starts seid and never | ||
| // flips the engine ready flag — it is not a readiness operation. | ||
| // | ||
| // Shutdown is graceful-only and fail-loud: if seid does not exit within the | ||
| // grace window it is left running and the task fails (never SIGKILLed). A | ||
| // force-kill opt-in is intentionally omitted until a non-validator forced | ||
| // restart needs it. | ||
| // | ||
| // Completion means "seid's RPC is serving /status again," NOT "caught up / | ||
| // voting." Callers that need in-service-and-voting must gate height / caught-up | ||
| // separately (downstream AwaitNodesAtHeight). | ||
| // | ||
| // The three OS interactions are injectable for testing: | ||
| // - signaler: process discovery + SIGTERM (defaults to a /proc + syscall | ||
| // implementation that corroborates `seid start`). | ||
| // - probeUp: returns true once seid's local RPC answers /status (defaults to | ||
| // a local CometBFT /status probe). | ||
| type RestartSeider struct { | ||
| signaler actions.ProcessSignaler | ||
| probeUp func(ctx context.Context) bool | ||
| gracePeriod time.Duration | ||
| upTimeout time.Duration | ||
| upInterval time.Duration | ||
| } | ||
|
|
||
| // NewRestartSeider builds a RestartSeider with the real /proc + syscall + | ||
| // local-RPC implementations. | ||
| func NewRestartSeider() *RestartSeider { | ||
| statusClient := rpc.NewStatusClient("", nil) | ||
| return &RestartSeider{ | ||
| signaler: seidStartFinder{}, | ||
| probeUp: func(ctx context.Context) bool { return seidRPCUp(ctx, statusClient) }, | ||
| gracePeriod: restartSeidGracePeriod, | ||
| upTimeout: restartSeidUpTimeout, | ||
| upInterval: restartSeidUpPollInterval, | ||
| } | ||
| } | ||
|
|
||
| // seidRPCUp reports whether seid's local RPC answers /status (latest_block_height | ||
| // parses); any successful parse counts as RPC up. A transport or parse error | ||
| // (RPC not yet listening) returns false. | ||
| func seidRPCUp(ctx context.Context, c *rpc.StatusClient) bool { | ||
| if _, err := c.Status(ctx); err != nil { | ||
| return false | ||
| } | ||
| return true | ||
| } | ||
|
|
||
| // Handler returns an engine.TaskHandler for the restart-seid task type. | ||
| // Params are empty: restart-seid is a fire-and-confirm operation. | ||
| func (r *RestartSeider) Handler() engine.TaskHandler { | ||
| return engine.TypedHandler(func(ctx context.Context, _ struct{}) error { | ||
| if err := r.stopSeid(ctx); err != nil { | ||
| return err | ||
| } | ||
| return r.waitForUp(ctx) | ||
| }) | ||
| } | ||
|
|
||
| // stopSeid SIGTERMs seid and waits for it to exit gracefully. When the process | ||
| // is not found in /proc we disambiguate: if seid's RPC is serving, the process | ||
| // exists but is invisible to us (e.g. a non-shared-PID-ns profile) and we refuse | ||
| // to report a restart that did not happen; if its RPC is down, seid is genuinely | ||
| // stopped or mid-restart (including the entrypoint's bash wait-loop window before | ||
| // it execs seid) and we proceed straight to waitForUp. | ||
| func (r *RestartSeider) stopSeid(ctx context.Context) error { | ||
| pid, err := r.signaler.FindPID(restartSeidProcess) | ||
| if err != nil { | ||
| if r.probeUp(ctx) { | ||
| return fmt.Errorf("seid RPC is serving but its process was not found in /proc: %w — refusing to report a restart that did not happen", err) | ||
| } | ||
| restartSeidLog.Info("seid process not running and RPC down; proceeding to wait for it to come up", | ||
| "reason", err.Error()) | ||
| return nil | ||
| } | ||
| restartSeidLog.Info("restarting seid in place", "pid", pid, "grace", r.gracePeriod) | ||
| return r.gracefulStop(ctx, pid) | ||
| } | ||
|
|
||
| // gracefulStop SIGTERMs pid and polls until it exits or the grace window | ||
| // elapses. It never escalates to SIGKILL: if seid is still alive at the | ||
| // deadline the task fails and the process is left running for an operator. | ||
| func (r *RestartSeider) gracefulStop(ctx context.Context, pid int) error { | ||
| if err := r.signaler.Signal(pid, syscall.SIGTERM); err != nil { | ||
| return fmt.Errorf("sending SIGTERM to seid pid %d: %w", pid, err) | ||
| } | ||
|
|
||
| ticker := time.NewTicker(restartSeidExitPollInterval) | ||
| defer ticker.Stop() | ||
| deadline := time.After(r.gracePeriod) | ||
|
|
||
| for { | ||
| select { | ||
| case <-ctx.Done(): | ||
| return ctx.Err() | ||
| case <-deadline: | ||
| return fmt.Errorf("seid pid %d still alive %s after SIGTERM; leaving it running (not force-killing a validator mid-commit)", pid, r.gracePeriod) | ||
| case <-ticker.C: | ||
| if !r.signaler.Alive(pid) { | ||
| restartSeidLog.Info("seid exited after SIGTERM", "pid", pid) | ||
| return nil | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // waitForUp polls seid's local RPC until it serves /status or the timeout | ||
| // elapses. Success here is the completion signal for the in-place restart. | ||
| func (r *RestartSeider) waitForUp(ctx context.Context) error { | ||
| deadline := time.Now().Add(r.upTimeout) | ||
| ticker := time.NewTicker(r.upInterval) | ||
| defer ticker.Stop() | ||
|
|
||
| restartSeidLog.Info("waiting for seid RPC to come back up", "timeout", r.upTimeout) | ||
| for { | ||
| if r.probeUp(ctx) { | ||
| restartSeidLog.Info("seid RPC is up; restart complete") | ||
| return nil | ||
|
cursor[bot] marked this conversation as resolved.
|
||
| } | ||
| select { | ||
| case <-ctx.Done(): | ||
| return ctx.Err() | ||
| case <-ticker.C: | ||
| if time.Now().After(deadline) { | ||
| return fmt.Errorf("seid RPC did not come up within %s after restart", r.upTimeout) | ||
| } | ||
| } | ||
| } | ||
| } | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.