From e524461f8879f682ce229204b28f0ecc43aa5a53 Mon Sep 17 00:00:00 2001 From: Edoardo Spadoni Date: Tue, 12 May 2026 11:15:46 +0200 Subject: [PATCH 01/10] feat(alerts): paginated alerts list with history, activity, stats Builds the operational alerts surface on top of Mimir Alertmanager: a single paginated list endpoint plus per-system silence management, resolved-alert history, and aggregations the UI uses to render the overview page. Endpoints: - GET /alerts (cross-hierarchy / single-tenant / sub-tree scoping, multi-value label filters, sorting on starts_at/severity/alertname, pagination with stable fingerprint tiebreaker) - GET /alerts/history (paginated alert_history rows with date range) - GET /alerts/totals / /trend / /stats (severity buckets, time-series deltas, top-N alertname/system_key, MTTR/MTBF) - GET /alerts/{fingerprint}/activity (silence/unsilence audit timeline, populated transparently by the silence endpoints) - GET /systems/{id}/alerts and friends scoped to a single system Each alert in the list is enriched with a local-DB system object (id/name/type) so the frontend doesn't need a per-row round-trip. Per-tenant fan-out failures are surfaced as warnings rather than failing the whole request. Gated on the existing read:systems / manage:systems permissions: read for the list endpoints, manage for silence create/update/delete. 
--- .../migrations/023_add_alert_activity.sql | 41 ++ .../023_add_alert_activity_rollback.sql | 1 + backend/entities/local_alert_activity.go | 138 ++++++ backend/main.go | 3 + backend/methods/alerting.go | 399 +++++++++++++++--- backend/openapi.yaml | 297 +++++++++++-- backend/services/alerting/client.go | 8 +- 7 files changed, 803 insertions(+), 84 deletions(-) create mode 100644 backend/database/migrations/023_add_alert_activity.sql create mode 100644 backend/database/migrations/023_add_alert_activity_rollback.sql create mode 100644 backend/entities/local_alert_activity.go diff --git a/backend/database/migrations/023_add_alert_activity.sql b/backend/database/migrations/023_add_alert_activity.sql new file mode 100644 index 00000000..c0c46731 --- /dev/null +++ b/backend/database/migrations/023_add_alert_activity.sql @@ -0,0 +1,41 @@ +-- Migration 023: Add alert_activity table +-- Append-only timeline of operator actions performed on a single alert +-- (silence created/updated/deleted). The UI renders this in the alert-detail +-- drawer ("Activity" section). Per-alert scoped via (organization_id, +-- fingerprint). Operator "notes" are not a separate concept: they are stored +-- as the comment of the silence, so a note edit is recorded here as a +-- silence_updated event whose details payload includes the comment change. + +CREATE TABLE IF NOT EXISTS alert_activity ( + id BIGSERIAL PRIMARY KEY, + + organization_id VARCHAR(255) NOT NULL, + fingerprint VARCHAR(255) NOT NULL, + + -- Action identifier. Open-ended so new event types don't require a schema + -- change; current values: 'silenced', 'silence_updated', 'unsilenced'. + action VARCHAR(50) NOT NULL, + + -- Actor identity (denormalized for cheap render). user_id may be NULL for + -- system-driven events (none today, kept for future). 
+ actor_user_id VARCHAR(255), + actor_name VARCHAR(255), + + -- Optional silence reference, set on silence-related actions so the + -- DELETE handler can resolve the originating fingerprint without a + -- separate mapping table. + silence_id VARCHAR(255), + + -- Free-form structured payload (e.g. comment, end_at, note excerpt). + details JSONB NOT NULL DEFAULT '{}', + + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() +); + +COMMENT ON TABLE alert_activity IS 'Append-only audit timeline of operator actions on individual alerts'; +COMMENT ON COLUMN alert_activity.fingerprint IS 'Alertmanager fingerprint (hex hash of labels) of the alert the action targets'; +COMMENT ON COLUMN alert_activity.action IS 'Event kind: silenced | silence_updated | unsilenced. Note changes are silence_updated events.'; +COMMENT ON COLUMN alert_activity.silence_id IS 'Silence ID associated with the event. Lets DELETE silence resolve the fingerprint.'; + +CREATE INDEX IF NOT EXISTS idx_alert_activity_org_fp_created_at ON alert_activity(organization_id, fingerprint, created_at DESC); +CREATE INDEX IF NOT EXISTS idx_alert_activity_silence_lookup ON alert_activity(organization_id, silence_id) WHERE silence_id IS NOT NULL; diff --git a/backend/database/migrations/023_add_alert_activity_rollback.sql b/backend/database/migrations/023_add_alert_activity_rollback.sql new file mode 100644 index 00000000..24979caf --- /dev/null +++ b/backend/database/migrations/023_add_alert_activity_rollback.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS alert_activity; diff --git a/backend/entities/local_alert_activity.go b/backend/entities/local_alert_activity.go new file mode 100644 index 00000000..d77ebeee --- /dev/null +++ b/backend/entities/local_alert_activity.go @@ -0,0 +1,138 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. 
+SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package entities + +import ( + "database/sql" + "encoding/json" + "fmt" + "time" + + "github.com/nethesis/my/backend/database" +) + +// AlertActivityAction enumerates the event kinds written to alert_activity. +// New values can be added without a schema change. Note edits are not +// represented as their own action: the operator note IS the silence comment, +// so a comment change shows up as silence_updated. +const ( + AlertActivitySilenced = "silenced" + AlertActivitySilenceUpdated = "silence_updated" + AlertActivityUnsilenced = "unsilenced" +) + +// AlertActivityEntry is one row of the per-alert audit timeline. +type AlertActivityEntry struct { + ID int64 `json:"id"` + OrganizationID string `json:"organization_id"` + Fingerprint string `json:"fingerprint"` + Action string `json:"action"` + ActorUserID *string `json:"actor_user_id,omitempty"` + ActorName *string `json:"actor_name,omitempty"` + SilenceID *string `json:"silence_id,omitempty"` + Details map[string]interface{} `json:"details"` + CreatedAt time.Time `json:"created_at"` +} + +// LocalAlertActivityRepository writes / reads the alert_activity timeline. +type LocalAlertActivityRepository struct { + db *sql.DB +} + +func NewLocalAlertActivityRepository() *LocalAlertActivityRepository { + return &LocalAlertActivityRepository{db: database.DB} +} + +// Log appends a single event to the activity timeline. Best-effort: callers +// that only want to record audit info should not fail their primary action +// when this returns an error — wrap the call site with a warn-level log. 
+func (r *LocalAlertActivityRepository) Log(orgID, fingerprint, action, actorUserID, actorName, silenceID string, details map[string]interface{}) error { + if details == nil { + details = map[string]interface{}{} + } + detailsJSON, err := json.Marshal(details) + if err != nil { + return fmt.Errorf("encode details: %w", err) + } + _, err = r.db.Exec( + `INSERT INTO alert_activity (organization_id, fingerprint, action, actor_user_id, actor_name, silence_id, details) + VALUES ($1, $2, $3, NULLIF($4,''), NULLIF($5,''), NULLIF($6,''), $7::jsonb)`, + orgID, fingerprint, action, actorUserID, actorName, silenceID, string(detailsJSON), + ) + if err != nil { + return fmt.Errorf("insert alert_activity: %w", err) + } + return nil +} + +// ListByFingerprint returns the timeline for one alert, most recent first. +// limit caps the number of rows; values <=0 fall back to 100. A row-iteration error is returned instead of silently truncating the timeline. +func (r *LocalAlertActivityRepository) ListByFingerprint(orgID, fingerprint string, limit int) ([]AlertActivityEntry, error) { + if limit <= 0 { + limit = 100 + } + rows, err := r.db.Query( + `SELECT id, organization_id, fingerprint, action, actor_user_id, actor_name, silence_id, details, created_at + FROM alert_activity + WHERE organization_id = $1 AND fingerprint = $2 + ORDER BY created_at DESC, id DESC + LIMIT $3`, + orgID, fingerprint, limit, + ) + if err != nil { + return nil, fmt.Errorf("query alert_activity: %w", err) + } + defer func() { _ = rows.Close() }() + + out := make([]AlertActivityEntry, 0) + for rows.Next() { + var e AlertActivityEntry + var actorUserID, actorName, silenceID sql.NullString + var detailsRaw []byte + if err := rows.Scan(&e.ID, &e.OrganizationID, &e.Fingerprint, &e.Action, &actorUserID, &actorName, &silenceID, &detailsRaw, &e.CreatedAt); err != nil { + return nil, fmt.Errorf("scan alert_activity: %w", err) + } + if actorUserID.Valid { + e.ActorUserID = &actorUserID.String + } + if actorName.Valid { + e.ActorName = &actorName.String + } + if silenceID.Valid { + e.SilenceID =
&silenceID.String + } + if len(detailsRaw) > 0 { + if err := json.Unmarshal(detailsRaw, &e.Details); err != nil { + e.Details = map[string]interface{}{} + } + } else { + e.Details = map[string]interface{}{} + } + out = append(out, e) + } + return out, rows.Err() +} + +// FindFingerprintBySilenceID returns the fingerprint of the alert that the +// given silence is tied to, or empty string if no record exists. Both the +// silenced and silence_updated rows are matched: an update is logged under the +// silence ID returned by the update call, which may differ from the original. +func (r *LocalAlertActivityRepository) FindFingerprintBySilenceID(orgID, silenceID string) (string, error) { + var fp string + err := r.db.QueryRow( + `SELECT fingerprint FROM alert_activity + WHERE organization_id = $1 AND silence_id = $2 AND action IN ($3, $4) + ORDER BY created_at DESC LIMIT 1`, + orgID, silenceID, AlertActivitySilenced, AlertActivitySilenceUpdated, + ).Scan(&fp) + if err != nil { + if err == sql.ErrNoRows { + return "", nil + } + return "", fmt.Errorf("lookup fingerprint by silence_id: %w", err) + } + return fp, nil +} diff --git a/backend/main.go b/backend/main.go index 587ee9c4..edd39982 100644 --- a/backend/main.go +++ b/backend/main.go @@ -284,6 +284,9 @@ func main() { alertsGroup.GET("/trend", methods.GetAlertsTrend) // Alert history trend with daily data points alertsGroup.GET("/stats", methods.GetAlertsStats) // Aggregate stats: severity buckets, top-N alertname/system_key, MTTR/MTBF + // Per-alert audit timeline (silence created/updated/removed events for the alert detail drawer) + alertsGroup.GET("/:fingerprint/activity", methods.GetAlertActivity) + // Configuration management alertsGroup.GET("/config", methods.GetAlertingConfig) // Get current alerting configuration alertsGroup.POST("/config", methods.ConfigureAlerts) // Configure alert routing (manage:systems required) diff --git a/backend/methods/alerting.go b/backend/methods/alerting.go index 79270bb0..51ac26e3 100644 --- a/backend/methods/alerting.go +++
b/backend/methods/alerting.go @@ -7,6 +7,7 @@ package methods import ( "context" + "database/sql" "encoding/json" "errors" "fmt" @@ -22,8 +23,10 @@ import ( "time" "github.com/gin-gonic/gin" + "github.com/lib/pq" "github.com/nethesis/my/backend/configuration" + "github.com/nethesis/my/backend/database" "github.com/nethesis/my/backend/entities" "github.com/nethesis/my/backend/helpers" "github.com/nethesis/my/backend/logger" @@ -104,29 +107,21 @@ func requireOrgID(c *gin.Context, orgID string) bool { // Used by aggregate endpoints (e.g., /totals) where omitting organization_id means // "aggregate across the caller's full hierarchy" rather than "specific tenant". // -// Three modes (selected by query params): +// Modes: // -// 1. organization_id omitted → caller's full hierarchy (incl. self). +// 1. organization_id omitted → caller's full hierarchy (incl. self). +// 2. organization_id=X → single tenant X. +// 3. organization_id=X&organization_id=Y (multi) +// → union of {X, Y, ...}. Each must be in the +// caller's hierarchy (Owner exempt). +// 4. + include=descendants → expand each org_id to itself + its sub-tree +// (deduplicated). Useful to mix-and-match drill-downs across siblings. // -// 2. organization_id=X → single tenant X (Mimir is per-tenant, -// so this returns only alerts attributed to X itself, not its descendants). -// -// 3. organization_id=X & include=descendants -// → X plus everything under X (drill-down on a sub-tree). Required because -// resellers/distributors hold no alerts on their own tenant — those live -// in their customer tenants. -// -// - Customer: always pinned to their own organization (single element). -// organization_id and include params are ignored (Mimir tenant is fixed -// to the user's own org). -// - Owner/Distributor/Reseller without organization_id: caller's hierarchy. 
-// - Owner/Distributor/Reseller with organization_id: validates hierarchy -// access (Owner is exempt), then returns either [X] (single tenant) or -// X + descendants of X (when include=descendants). +// Customer is always pinned to their own organization regardless of params. // // Returns false on auth/validation failure (response already written). func resolveOrgScope(c *gin.Context, user *models.User) ([]string, bool) { - orgID := c.Query("organization_id") + orgIDsParam := c.QueryArray("organization_id") includeDescendants := c.Query("include") == "descendants" orgRole := strings.ToLower(user.OrgRole) @@ -136,32 +131,49 @@ func resolveOrgScope(c *gin.Context, user *models.User) ([]string, bool) { userService := local.NewUserService() - if orgID != "" { - if orgRole != "owner" && !userService.IsOrganizationInHierarchy(orgRole, user.OrganizationID, orgID) { - c.JSON(http.StatusForbidden, response.Forbidden("access denied: organization not in your hierarchy", nil)) - return nil, false - } - if !includeDescendants { - return []string{orgID}, true - } - // Drill-down: derive descendants from orgID's own type, not the caller's - // role. A Distributor drilling into a Reseller wants the Reseller's - // customers; passing the caller's role would query the wrong relation. - targetType := userService.GetOrganizationType(orgID) - orgIDs, err := userService.GetHierarchicalOrganizationIDs(targetType, orgID) + // No org_id passed → caller's full hierarchy. 
+ if strings.Join(orgIDsParam, "") == "" { // no non-empty organization_id (absent, or sent empty as "?organization_id=") → full hierarchy, as before multi-value support + orgIDs, err := userService.GetHierarchicalOrganizationIDs(orgRole, user.OrganizationID) if err != nil { - c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to resolve descendants: "+err.Error(), nil)) + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to resolve organization hierarchy: "+err.Error(), nil)) return nil, false } return orgIDs, true } - orgIDs, err := userService.GetHierarchicalOrganizationIDs(orgRole, user.OrganizationID) - if err != nil { - c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to resolve organization hierarchy: "+err.Error(), nil)) - return nil, false + // One or more org_ids: validate each, optionally expand each with descendants, + // dedupe to keep the fan-out minimal when sub-trees overlap. + result := make([]string, 0, len(orgIDsParam)) + seen := make(map[string]struct{}, len(orgIDsParam)) + for _, oid := range orgIDsParam { + if oid == "" { + continue + } + if orgRole != "owner" && !userService.IsOrganizationInHierarchy(orgRole, user.OrganizationID, oid) { + c.JSON(http.StatusForbidden, response.Forbidden("access denied: organization not in your hierarchy", nil)) + return nil, false + } + if includeDescendants { + targetType := userService.GetOrganizationType(oid) + expanded, err := userService.GetHierarchicalOrganizationIDs(targetType, oid) + if err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to resolve descendants: "+err.Error(), nil)) + return nil, false + } + for _, e := range expanded { + if _, ok := seen[e]; !ok { + seen[e] = struct{}{} + result = append(result, e) + } + } + } else { + if _, ok := seen[oid]; !ok { + seen[oid] = struct{}{} + result = append(result, oid) + } + } } - return orgIDs, true + return result, true } // ConfigureAlerts handles POST /api/alerts/config @@ -286,6 +298,24 @@ func DisableAlerts(c *gin.Context) { // (cf.
systems.go which also overrides to 50). Capped at 100 by the helper. const alertsListDefaultPageSize = 50 +// alertsListAllowedSortBy enumerates the user-selectable sort columns for the +// active /api/alerts list. Default is starts_at desc (most recent first). +// severity is sorted by criticality rank (critical > warning > info > other), +// not lexicographically. Anything outside this set falls back to starts_at. +var alertsListAllowedSortBy = map[string]bool{ + "starts_at": true, + "severity": true, + "alertname": true, +} + +// severityRank maps severity labels to a comparable integer (higher = more +// severe). Unknown values are absent from the map, so a lookup yields 0 and they sort below info. +var severityRank = map[string]int{ + "critical": 3, + "warning": 2, + "info": 1, +} + // GetAlerts handles GET /api/alerts // // Lists active alerts. Scope follows the same three modes as /alerts/totals: @@ -314,6 +344,17 @@ func GetAlerts(c *gin.Context) { pageSize = alertsListDefaultPageSize } + // helpers.GetSortingFromQuery defaults sort_direction to "asc"; for active + // alerts the natural default is "what's firing now" first, so we override + // to desc when the caller didn't pass an explicit direction. + sortBy, sortDirection := helpers.GetSortingFromQuery(c) + if !alertsListAllowedSortBy[sortBy] { + sortBy = "starts_at" + } + if c.Query("sort_direction") == "" { + sortDirection = "desc" + } + all, warnings := fanOutMimirAlerts(c.Request.Context(), orgIDs) all = filterAlerts(all, alertFilter{ @@ -323,18 +364,9 @@ func GetAlerts(c *gin.Context) { alertnames: c.QueryArray("alertname"), }) - // Sort by starts_at desc (most recent first); fingerprint as tiebreaker - // so pagination is stable across requests when timestamps tie.
- sort.SliceStable(all, func(i, j int) bool { - si, _ := all[i]["startsAt"].(string) - sj, _ := all[j]["startsAt"].(string) - if si != sj { - return si > sj - } - fi, _ := all[i]["fingerprint"].(string) - fj, _ := all[j]["fingerprint"].(string) - return fi < fj - }) + // Sort with fingerprint as a stable tiebreaker so pagination doesn't + // shift between requests when the primary key ties. + sortAlertsList(all, sortBy, sortDirection) totalCount := len(all) start := (page - 1) * pageSize @@ -356,16 +388,122 @@ func GetAlerts(c *gin.Context) { c.JSON(http.StatusOK, response.OK("alerts retrieved successfully", gin.H{ "alerts": pageAlerts, - "pagination": helpers.BuildPaginationInfo(page, pageSize, totalCount), + "pagination": helpers.BuildPaginationInfoWithSorting(page, pageSize, totalCount, sortBy, sortDirection), "warnings": warnings, })) } +// sortAlertsList orders an in-memory slice of Mimir alerts by the given column +// and direction. Always uses fingerprint as a stable secondary key so paging +// doesn't shuffle alerts that tie on the primary key. +func sortAlertsList(alerts []map[string]interface{}, sortBy, sortDirection string) { + desc := sortDirection == "desc" + sort.SliceStable(alerts, func(i, j int) bool { + var primaryLess bool + var primaryEqual bool + switch sortBy { + case "severity": + si := severityRank[severityOf(alerts[i])] + sj := severityRank[severityOf(alerts[j])] + primaryEqual = si == sj + primaryLess = si < sj + case "alertname": + ai := alertnameOf(alerts[i]) + aj := alertnameOf(alerts[j]) + primaryEqual = ai == aj + primaryLess = ai < aj + default: // starts_at + si, _ := alerts[i]["startsAt"].(string) + sj, _ := alerts[j]["startsAt"].(string) + primaryEqual = si == sj + primaryLess = si < sj + } + if !primaryEqual { + if desc { + return !primaryLess + } + return primaryLess + } + // Tiebreaker: fingerprint asc, deterministic regardless of direction. 
+ fi, _ := alerts[i]["fingerprint"].(string) + fj, _ := alerts[j]["fingerprint"].(string) + return fi < fj + }) +} + +func severityOf(alert map[string]interface{}) string { + labels, _ := alert["labels"].(map[string]interface{}) + s, _ := labels["severity"].(string) + return s +} + +func alertnameOf(alert map[string]interface{}) string { + labels, _ := alert["labels"].(map[string]interface{}) + s, _ := labels["alertname"].(string) + return s +} + +// alertFingerprintPattern restricts the fingerprint path param to safe chars. +// Alertmanager fingerprints are 16-char lowercase hex but we allow a slightly +// looser charset to accommodate test fixtures and any future format change. +var alertFingerprintPattern = regexp.MustCompile(`^[A-Za-z0-9._:-]{1,128}$`) + +// GetAlertActivity handles GET /api/alerts/:fingerprint/activity +// Returns the per-alert audit timeline (silence created/updated/removed) for +// the alert identified by fingerprint within the resolved tenant. Most recent +// first. Operator notes are stored as the comment of the silence the action +// produced, so the timeline is the source of truth for "what happened, when, +// by whom". 
+func GetAlertActivity(c *gin.Context) { + user, ok := helpers.GetUserFromContext(c) + if !ok { + return + } + fp := c.Param("fingerprint") + if !alertFingerprintPattern.MatchString(fp) { + c.JSON(http.StatusBadRequest, response.BadRequest("invalid fingerprint", nil)) + return + } + orgID, ok := resolveOrgID(c, user) + if !ok { + return + } + if !requireOrgID(c, orgID) { + return + } + + limit := 100 + if s := c.Query("limit"); s != "" { + if n, err := strconv.Atoi(s); err == nil && n > 0 { + limit = n + if limit > 500 { + limit = 500 + } + } + } + + repo := entities.NewLocalAlertActivityRepository() + entries, err := repo.ListByFingerprint(orgID, fp, limit) + if err != nil { + logger.Error().Err(err).Str("org_id", orgID).Str("fingerprint", fp).Msg("failed to list alert activity") + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to list alert activity", nil)) + return + } + c.JSON(http.StatusOK, response.OK("alert activity retrieved successfully", gin.H{ + "events": entries, + })) +} + // fanOutMimirAlerts fetches active alerts from Mimir for every tenant in scope // concurrently, with bounded concurrency and a global timeout. Per-tenant // failures (timeout, 5xx, parse error) are collected as warnings; the rest of -// the result is returned. Used by /api/alerts (list) and is shaped so future -// stats/aggregation endpoints can reuse it. +// the result is returned. +// +// Each alert is enriched with a top-level `system` object containing the +// owning system's `name` and `type` (product, e.g. "nsec") looked up in the +// local `systems` table by (org_id, system_key). This saves the frontend a +// per-row round-trip to /systems just to render the table cell. If the lookup +// fails or the alert has no system_key, the field is simply omitted. 
func fanOutMimirAlerts(parent context.Context, orgIDs []string) ([]map[string]interface{}, []string) { var ( all []map[string]interface{} @@ -410,6 +548,8 @@ func fanOutMimirAlerts(parent context.Context, orgIDs []string) ([]map[string]in return } + enrichAlertsWithSystemInfo(orgID, alerts) + mu.Lock() all = append(all, alerts...) mu.Unlock() @@ -419,6 +559,77 @@ func fanOutMimirAlerts(parent context.Context, orgIDs []string) ([]map[string]in return all, warnings } +// enrichAlertsWithSystemInfo decorates each alert with a `system` object +// (id, name, type, system_key) by issuing a single SELECT against the local +// systems table for the distinct system_key values it sees. Best-effort: +// a DB hiccup or an unmatched key just leaves the system field unset on +// that alert. The `id` is the local DB UUID — what the frontend uses to +// build the system-detail link (/systems/:id). +func enrichAlertsWithSystemInfo(orgID string, alerts []map[string]interface{}) { + if len(alerts) == 0 { + return + } + keys := make(map[string]struct{}, len(alerts)) + for _, a := range alerts { + labels, _ := a["labels"].(map[string]interface{}) + if k, ok := labels["system_key"].(string); ok && k != "" { + keys[k] = struct{}{} + } + } + if len(keys) == 0 { + return + } + keyList := make([]string, 0, len(keys)) + for k := range keys { + keyList = append(keyList, k) + } + rows, err := database.DB.Query( + `SELECT id, system_key, name, type FROM systems WHERE organization_id = $1 AND system_key = ANY($2)`, + orgID, pq.Array(keyList), + ) + if err != nil { + logger.Warn().Err(err).Str("org_id", orgID).Msg("failed to lookup system info for alert enrichment") + return + } + defer func() { _ = rows.Close() }() + + type sysInfo struct { + ID string + Name string + Type sql.NullString + } + infoBy := make(map[string]sysInfo, len(keyList)) + for rows.Next() { + var id, k, n string + var t sql.NullString + if err := rows.Scan(&id, &k, &n, &t); err != nil { + continue + } + infoBy[k] = 
sysInfo{ID: id, Name: n, Type: t} + } + + for _, a := range alerts { + labels, _ := a["labels"].(map[string]interface{}) + k, _ := labels["system_key"].(string) + if k == "" { + continue + } + info, ok := infoBy[k] + if !ok { + continue + } + s := map[string]interface{}{ + "id": info.ID, + "system_key": k, + "name": info.Name, + } + if info.Type.Valid { + s["type"] = info.Type.String + } + a["system"] = s + } +} + // GetAlertingConfig handles GET /api/alerts/config // By default returns structured JSON parsed from Mimir YAML. // Use ?format=yaml to get the raw (redacted) YAML. @@ -499,10 +710,10 @@ func GetAlertsTotals(c *gin.Context) { } var ( - active, critical, warning, info int - warnings []string - mu sync.Mutex - wg sync.WaitGroup + active, critical, warning, info, muted int + warnings []string + mu sync.Mutex + wg sync.WaitGroup ) ctx, cancel := context.WithTimeout(c.Request.Context(), alertsTotalsFanoutTimeout) @@ -543,7 +754,7 @@ func GetAlertsTotals(c *gin.Context) { return } - var localCritical, localWarning, localInfo int + var localCritical, localWarning, localInfo, localMuted int for _, alert := range alerts { labels, _ := alert["labels"].(map[string]interface{}) switch sev, _ := labels["severity"].(string); sev { @@ -554,6 +765,13 @@ func GetAlertsTotals(c *gin.Context) { case "info": localInfo++ } + // An alert is muted when Alertmanager has at least one + // active silence matching it (status.silencedBy non-empty). 
+ if status, ok := alert["status"].(map[string]interface{}); ok { + if sb, ok := status["silencedBy"].([]interface{}); ok && len(sb) > 0 { + localMuted++ + } + } } mu.Lock() @@ -561,6 +779,7 @@ func GetAlertsTotals(c *gin.Context) { critical += localCritical warning += localWarning info += localInfo + muted += localMuted mu.Unlock() }(orgID) } @@ -582,6 +801,7 @@ func GetAlertsTotals(c *gin.Context) { "critical": critical, "warning": warning, "info": info, + "muted": muted, "history": historyTotal, "warnings": warnings, })) @@ -838,11 +1058,22 @@ func GetSystemAlerts(c *gin.Context) { return } - // Filter alerts by this system's key + // Filter alerts by this system's key and decorate each with the same + // `system` enrichment shape as /api/alerts. The system was already loaded + // at the start of the handler so no extra query is needed. + sysInfo := map[string]interface{}{ + "id": system.ID, + "system_key": system.SystemKey, + "name": system.Name, + } + if system.Type != nil { + sysInfo["type"] = *system.Type + } filtered := make([]map[string]interface{}, 0, len(alerts)) for _, alert := range alerts { labels, _ := alert["labels"].(map[string]interface{}) if sk, ok := labels["system_key"].(string); ok && sk == system.SystemKey { + alert["system"] = sysInfo filtered = append(filtered, alert) } } @@ -935,11 +1166,47 @@ func CreateSystemAlertSilence(c *gin.Context) { return } + // Best-effort activity log: a write failure must not break the user's + // silence creation, so we only log a warning. The silence is the source + // of truth; activity is a denormalised UX convenience. 
+ logAlertActivity(c, getSystemAlertOrgID(system), req.Fingerprint, entities.AlertActivitySilenced, user, silenceResp.SilenceID, map[string]interface{}{ + "comment": normalizeAlertSilenceComment(req.Comment), + "duration_minutes": req.DurationMinutes, + "end_at": req.EndAt, + }) + c.JSON(http.StatusOK, response.OK("alert silenced successfully", gin.H{ "silence_id": silenceResp.SilenceID, })) } +// logAlertActivity writes one row to alert_activity. Fails open: a DB error +// is logged at warn level but does not surface to the caller, since the +// activity timeline is auxiliary to the primary action. Empty fingerprint +// silently no-ops because the row would be unreachable from any alert detail +// view. +func logAlertActivity(c *gin.Context, orgID, fingerprint, action string, user *models.User, silenceID string, details map[string]interface{}) { + if fingerprint == "" { + return + } + actorUserID := "" + if user != nil && user.LogtoID != nil { + actorUserID = *user.LogtoID + } + actorName := "" + if user != nil { + actorName = user.Name + } + repo := entities.NewLocalAlertActivityRepository() + if err := repo.Log(orgID, fingerprint, action, actorUserID, actorName, silenceID, details); err != nil { + logger.Warn().Err(err). + Str("org_id", orgID). + Str("fingerprint", fingerprint). + Str("action", action). + Msg("failed to write alert activity (non-fatal)") + } +} + // DeleteSystemAlertSilence handles DELETE /api/systems/:id/alerts/silences/:silence_id // Deletes a system-scoped silence in Alertmanager after validating its ownership. func DeleteSystemAlertSilence(c *gin.Context) { @@ -993,6 +1260,13 @@ func DeleteSystemAlertSilence(c *gin.Context) { return } + // Resolve the fingerprint of the alert this silence was originally tied to + // so the unsilence event lands on the right timeline. If we can't find it + // (e.g. silence pre-dates activity tracking), skip the activity write. 
+ activityRepo := entities.NewLocalAlertActivityRepository() + fingerprint, _ := activityRepo.FindFingerprintBySilenceID(orgID, silenceID) + logAlertActivity(c, orgID, fingerprint, entities.AlertActivityUnsilenced, user, silenceID, nil) + c.JSON(http.StatusOK, response.OK("silence disabled successfully", nil)) } @@ -1155,6 +1429,15 @@ func UpdateSystemAlertSilence(c *gin.Context) { return } + // Resolve fingerprint from the original silence creation event so the + // update lands on the right alert's timeline. + activityRepo := entities.NewLocalAlertActivityRepository() + fingerprint, _ := activityRepo.FindFingerprintBySilenceID(orgID, silenceID) + logAlertActivity(c, orgID, fingerprint, entities.AlertActivitySilenceUpdated, user, silenceResp.SilenceID, map[string]interface{}{ + "comment": normalizeAlertSilenceComment(req.Comment), + "end_at": req.EndAt, + }) + c.JSON(http.StatusOK, response.OK("silence updated successfully", gin.H{ "silence_id": silenceResp.SilenceID, })) diff --git a/backend/openapi.yaml b/backend/openapi.yaml index 0e3b2a73..18a06b43 100644 --- a/backend/openapi.yaml +++ b/backend/openapi.yaml @@ -424,6 +424,122 @@ components: description: Whether the VAT number exists in the specified entity type example: true + ActiveAlert: + type: object + description: | + Active alert returned by Mimir Alertmanager, enriched with local-DB + system info before being sent to the client. The fan-out includes + silenced and inhibited alerts (not just `active` ones), so callers can + render the muted/suppressed state. + properties: + fingerprint: + type: string + description: Alertmanager fingerprint (hex hash of labels) + example: "01cfde4b7fa6d1c7" + labels: + type: object + additionalProperties: + type: string + description: Alert labels (alertname, severity, system_key, ...) 
+ example: + alertname: "DiskFilling" + severity: "warning" + system_key: "NETH-FBB2-1A6E-7CAD-44A4-A772-B3EE-F0F6-F371" + annotations: + type: object + additionalProperties: + type: string + status: + type: object + properties: + state: + type: string + enum: [active, suppressed, unprocessed] + silencedBy: + type: array + items: + type: string + description: Active silence IDs muting this alert (non-empty → state is "suppressed") + inhibitedBy: + type: array + items: + type: string + startsAt: + type: string + format: date-time + endsAt: + type: string + format: date-time + generatorURL: + type: string + description: Source URL of the alert (set by the pushing agent), if any + system: + type: object + nullable: true + description: | + Local-DB enrichment: the owning system's id, name and product + type, resolved by (organization_id, labels.system_key). Omitted + when the system_key has no matching row in the local systems + table (e.g., system unregistered or alert pushed against a + stale key). The frontend uses `id` (local DB UUID) to link to + the system detail page (/systems/:id); `system_key` is the + Mimir tenant label and is shown in the UI for reference. + properties: + id: + type: string + format: uuid + description: Local DB UUID. Use this to build links to /systems/:id. + example: "35aa0d84-08c1-4013-b1fd-d5f6ef3e0541" + system_key: + type: string + description: System key as used in alert labels (Mimir tenant identifier on this system). + example: "NETH-FBB2-1A6E-7CAD-44A4-A772-B3EE-F0F6-F371" + name: + type: string + example: "cust1-sys-A" + type: + type: string + nullable: true + description: Product type (e.g. "nsec"). Null until the agent reports it on first inventory. + + AlertActivityEntry: + type: object + description: One event in the per-alert audit timeline (silence created/updated/removed). 
+ properties: + id: + type: integer + format: int64 + organization_id: + type: string + fingerprint: + type: string + description: Alertmanager fingerprint of the alert this event belongs to + action: + type: string + enum: [silenced, silence_updated, unsilenced] + description: Event kind. Note edits are surfaced as `silence_updated` (note = silence comment). + actor_user_id: + type: string + nullable: true + description: logto_id of the user who triggered the action + actor_name: + type: string + nullable: true + description: Display name of the actor (denormalized for cheap render) + silence_id: + type: string + nullable: true + description: Silence ID associated with this event + details: + type: object + additionalProperties: true + description: | + Free-form payload for the event. For `silenced` / `silence_updated`: + `{comment, end_at, duration_minutes?}`. For `unsilenced`: empty. + created_at: + type: string + format: date-time + AlertHistoryRecord: type: object description: A single resolved or inactive alert stored from an Alertmanager webhook @@ -8851,6 +8967,8 @@ paths: description: | Returns current alerts from Mimir for a specific system, filtered by the system's key. Suppressed alerts remain visible so silenced alerts can still be inspected in the system detail view. + Each alert is enriched with a `system` object (same shape as `/api/alerts`) — for this + endpoint it always points to the system identified by the path `id`. Requires `read:systems` permission. security: - BearerAuth: [] @@ -8882,7 +9000,7 @@ paths: alerts: type: array items: - type: object + $ref: '#/components/schemas/ActiveAlert' '401': $ref: '#/components/responses/Unauthorized' '403': @@ -11099,13 +11217,14 @@ paths: Returns active alert counts by severity (from Mimir, per-tenant) and total resolved alert history count (from DB). Requires `read:systems` permission. 
- **Three scope modes** (selected by query params): + **Scope modes** (selected by query params): - | `organization_id` | `include` | Result | + | `organization_id` | `include` | Result | |---|---|---| - | omitted | — | Caller's full hierarchy (recursive). For Customer it's just self. | - | `X` | omitted | Single tenant `X` only. Resellers/Distributors hold no alerts on their own tenant — those live on their customer tenants — so single-tenant queries on a non-leaf org typically return zero. | - | `X` | `descendants` | `X` plus everything under `X` (drill-down). Use this to view a sub-tree. | + | omitted | — | Caller's full hierarchy (recursive). For Customer it's just self. | + | `X` | omitted | Single tenant `X` only. Resellers/Distributors hold no alerts on their own tenant — those live on their customer tenants — so single-tenant queries on a non-leaf org typically return zero. | + | `X` (repeated for multi) | omitted | Union of all `organization_id` values passed. Each must be in the caller's hierarchy (Owner exempt). | + | `X` (single or multi) | `descendants` | Each org_id is expanded to itself + its sub-tree (deduplicated). Use this to drill into one or more sub-trees. | Active counts are aggregated across the resolved scope by fanning out to Mimir, one request per tenant, with bounded concurrency and a global timeout. Per-tenant @@ -11121,16 +11240,23 @@ paths: - name: organization_id in: query description: | - Target organization ID. Optional for all roles except Customer (where it is - ignored). Distributors/Resellers receive `403` if `X` is not in their hierarchy. + Target organization ID(s). Repeat the param to pass multiple values + (`?organization_id=A&organization_id=B`). Optional for all roles except + Customer (where it is ignored). Distributors/Resellers receive `403` if any + value is not in their hierarchy. 
schema: - type: string + type: array + items: + type: string + style: form + explode: true - name: include in: query description: | - Set to `descendants` together with `organization_id` to aggregate over the - target org's full sub-tree. Ignored when `organization_id` is omitted (the - caller's own hierarchy is already used) and when caller is a Customer. + Set to `descendants` together with `organization_id` to expand each value + to its full sub-tree (results deduplicated). Ignored when `organization_id` + is omitted (the caller's own hierarchy is already used) and when the caller + is a Customer. schema: type: string enum: [descendants] @@ -11163,6 +11289,9 @@ paths: info: type: integer description: Active info alerts in scope + muted: + type: integer + description: Active alerts currently silenced (Alertmanager `silencedBy` non-empty) history: type: integer description: Total resolved alerts in history (DB) in scope @@ -11202,12 +11331,18 @@ paths: parameters: - name: organization_id in: query - description: Target organization ID. Optional for all roles except Customer (where it is ignored). + description: | + Target organization ID(s). Repeat the param to pass multiple values. + Optional for all roles except Customer (where it is ignored). schema: - type: string + type: array + items: + type: string + style: form + explode: true - name: include in: query - description: Set to `descendants` together with `organization_id` to aggregate over the target org's full sub-tree. + description: Set to `descendants` together with `organization_id` to expand each value to its sub-tree. schema: type: string enum: [descendants] @@ -11292,8 +11427,14 @@ paths: parameters: - name: organization_id in: query + description: | + Target organization ID(s). Repeat the param to pass multiple values. 
schema: - type: string + type: array + items: + type: string + style: form + explode: true - name: include in: query schema: @@ -11402,12 +11543,18 @@ paths: parameters: - name: organization_id in: query - description: Target organization ID. Optional for all roles except Customer (where it is ignored). + description: | + Target organization ID(s). Repeat the param to pass multiple values. + Optional for all roles except Customer (where it is ignored). schema: - type: string + type: array + items: + type: string + style: form + explode: true - name: include in: query - description: Set to `descendants` together with `organization_id` to drill down on a sub-tree. + description: Set to `descendants` together with `organization_id` to expand each value to its sub-tree. schema: type: string enum: [descendants] @@ -11493,6 +11640,81 @@ paths: '403': $ref: '#/components/responses/Forbidden' + # =========================================== + # ALERTING ENDPOINTS (Per-alert audit timeline) + # =========================================== + + /alerts/{fingerprint}/activity: + parameters: + - name: fingerprint + in: path + required: true + description: | + Alertmanager fingerprint of the alert (hex hash of its labels). + Stable across re-firings of the same alert. + schema: + type: string + pattern: '^[A-Za-z0-9._:-]{1,128}$' + - name: organization_id + in: query + required: true + description: Tenant the alert belongs to. Required for non-Customer roles. + schema: + type: string + get: + operationId: getAlertActivity + tags: + - Backend - Alerts + summary: Per-alert audit timeline (silence events) + description: | + Returns the audit timeline for the alert identified by `fingerprint`, most recent + first. Events are written transparently as silences are created, updated, or + removed via the `/api/systems/:id/alerts/silences` endpoints. 
+ + Operator notes are stored as the silence `comment` (Alertmanager native), so a + note edit appears here as a `silence_updated` event whose `details` payload + includes the new comment. + + Requires `read:systems` permission. + security: + - BearerAuth: [] + parameters: + - name: limit + in: query + description: Max events to return. Default 100, max 500. + schema: + type: integer + minimum: 1 + maximum: 500 + default: 100 + responses: + '200': + description: Activity timeline + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: alert activity retrieved successfully + data: + type: object + properties: + events: + type: array + items: + $ref: '#/components/schemas/AlertActivityEntry' + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + # =========================================== # ALERTING ENDPOINTS (Backend - Configuration) # =========================================== @@ -11627,8 +11849,14 @@ paths: - Backend - Alerts summary: List active alerts description: | - Retrieves active alerts from Mimir for the caller's scope, paginated and sorted - by `startsAt` descending (fingerprint as tiebreaker for stable pagination). + Retrieves active alerts from Mimir for the caller's scope, paginated. Each + alert is enriched with a `system` object (`name`, `type`) looked up from the + local `systems` table, so the UI can render the system column without an + extra round-trip per row. + + Sortable by `starts_at` (default desc), `severity` (criticality rank: critical + > warning > info), or `alertname`. `fingerprint` is used as a stable tiebreaker + so pagination doesn't shift between requests. Scope follows the same three modes as `/alerts/totals`: - `organization_id` omitted → caller's full hierarchy (cross-tenant fan-out). 
@@ -11647,12 +11875,18 @@ paths: parameters: - name: organization_id in: query - description: Target organization ID. Optional for all roles except Customer (where it is ignored). + description: | + Target organization ID(s). Repeat the param to pass multiple values. + Optional for all roles except Customer (where it is ignored). schema: - type: string + type: array + items: + type: string + style: form + explode: true - name: include in: query - description: Set to `descendants` together with `organization_id` to drill down on a sub-tree. + description: Set to `descendants` together with `organization_id` to expand each value to its sub-tree. schema: type: string enum: [descendants] @@ -11671,6 +11905,19 @@ paths: minimum: 1 maximum: 100 default: 50 + - name: sort_by + in: query + description: Sort column (allowlist). + schema: + type: string + enum: [starts_at, severity, alertname] + default: starts_at + - name: sort_direction + in: query + schema: + type: string + enum: [asc, desc] + default: desc - name: state in: query description: Filter alerts by state. Supports multiple values. @@ -11731,7 +11978,7 @@ paths: alerts: type: array items: - type: object + $ref: '#/components/schemas/ActiveAlert' pagination: $ref: '#/components/schemas/Pagination' warnings: diff --git a/backend/services/alerting/client.go b/backend/services/alerting/client.go index f4eff6e8..6faaaec7 100644 --- a/backend/services/alerting/client.go +++ b/backend/services/alerting/client.go @@ -112,8 +112,14 @@ func GetAlerts(orgID string) ([]byte, error) { // GetAlertsCtx is the context-aware variant of GetAlerts. Use this when callers // need to enforce a deadline (e.g., fan-out over many tenants for /totals). +// +// We explicitly request silenced and inhibited alerts in addition to the +// default active set. Alertmanager's default response excludes silenced +// alerts entirely, which would make our `muted` counter on /alerts/totals +// always zero and hide muted rows from the list. 
Including them lets the UI +// surface the "Muted" badge alongside live ones. func GetAlertsCtx(ctx context.Context, orgID string) ([]byte, error) { - url := configuration.Config.MimirURL + "/alertmanager/api/v2/alerts" + url := configuration.Config.MimirURL + "/alertmanager/api/v2/alerts?active=true&silenced=true&inhibited=true" req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { From c04c5b55bbabff79f803ca865d9b71ab7036f458 Mon Sep 17 00:00:00 2001 From: Edoardo Spadoni Date: Tue, 12 May 2026 11:19:33 +0200 Subject: [PATCH 02/10] feat(alerts): per-org config with hierarchical merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds POST/GET/DELETE /alerts/config — every organization saves its own layer; the effective Mimir YAML for any tenant is the server-side merge of all layers walking up the hierarchy from the tenant to the Owner. The merge stays internal: /alerts/config exposes only the caller's own row, never an inherited or merged view (no leakage of upstream recipients or secrets to descendants). Model is flat and recipient-centric: enabled: {email, webhook, telegram} tri-state per layer email_recipients: [{address, severities[], language, format}] webhook_recipients: [{name, url, severities[]}] telegram_recipients: [{bot_token, chat_id, severities[]}] Per-recipient severities=[] means "all severities". Email recipients additionally carry language (en|it) and format (html|plain) which the template renderer turns into per-email_configs overrides: - format=html emits our html template + our text fallback (multipart) - format=plain emits our text template plus html: '' (the empty html is mandatory — Alertmanager otherwise falls back to its built-in HTML body and overrides ours with the generic "Sent by Alertmanager") Rendering fans out a receiver per severity (critical/warning/info); recipients with severities=[] land on every per-severity receiver. 
The builtin alert-history webhook is always attached at the top of the routes (continue: true) so history persists regardless of config. Additive-only contract: descendants can ADD recipients but cannot disable channels enabled by ancestors. The server normalises any explicit false in enabled.{email,webhook,telegram} from non-Owner layers to null on storage. Save+propagate is serialised per-org via an in-process mutex; per-tenant push failures land in warnings[] without failing the save. Body capped at 1 MiB; oversized requests get 413. Gated on the dedicated alerts resource (read:alerts for GET, manage:alerts for POST/DELETE) — admin/super only by default. Includes: - models/alerting.go: flat shape + Validate - services/alerting/{merge,template,embed,effective,provision,redaction}.go - migration 024_add_alert_config_layers - entities/local_alert_config_layers.go (repo) - middleware/body_limit.go - logger/helpers.go: LogBusinessOperationDetails for audit snapshots - methods/alerting.go: ConfigureAlerts/GetAlertingConfig/DisableAlerts - methods/{customers,distributors,resellers}.go: provision sig change - openapi.yaml: schemas + endpoints + 6 request examples + response examples - templates: per-language dispatchers (alert_{en,it}.html|txt|subject) plus telegram_{en,it}.message --- .../024_add_alert_config_layers.sql | 49 + .../024_add_alert_config_layers_rollback.sql | 1 + backend/entities/local_alert_config_layers.go | 171 +++ backend/logger/helpers.go | 16 + backend/main.go | 28 +- backend/methods/alerting.go | 461 +++++--- backend/methods/customers.go | 15 +- backend/methods/distributors.go | 15 +- backend/methods/resellers.go | 15 +- backend/middleware/body_limit.go | 35 + backend/models/alerting.go | 233 +++- backend/openapi.yaml | 991 ++++++++++++++---- backend/services/alerting/effective.go | 157 +++ backend/services/alerting/embed.go | 123 ++- backend/services/alerting/merge.go | 195 ++++ backend/services/alerting/merge_test.go | 135 +++
backend/services/alerting/provision.go | 53 +- backend/services/alerting/redaction.go | 70 ++ backend/services/alerting/template.go | 521 +++------ backend/services/alerting/template_test.go | 824 +++------------ .../alerting/templates/telegram_en.tmpl | 2 +- .../alerting/templates/telegram_it.tmpl | 2 +- 22 files changed, 2548 insertions(+), 1564 deletions(-) create mode 100644 backend/database/migrations/024_add_alert_config_layers.sql create mode 100644 backend/database/migrations/024_add_alert_config_layers_rollback.sql create mode 100644 backend/entities/local_alert_config_layers.go create mode 100644 backend/middleware/body_limit.go create mode 100644 backend/services/alerting/effective.go create mode 100644 backend/services/alerting/merge.go create mode 100644 backend/services/alerting/merge_test.go create mode 100644 backend/services/alerting/redaction.go diff --git a/backend/database/migrations/024_add_alert_config_layers.sql b/backend/database/migrations/024_add_alert_config_layers.sql new file mode 100644 index 00000000..a6e10269 --- /dev/null +++ b/backend/database/migrations/024_add_alert_config_layers.sql @@ -0,0 +1,49 @@ +-- Migration 024: alert_config_layers +-- +-- One row per organization carrying that org's alerting configuration as a +-- flat recipient-based JSON blob. The effective per-tenant Mimir YAML is +-- the server-side merge of all rows walking up the org hierarchy from the +-- tenant to the Owner: +-- +-- Owner.layer → Distributor.layer → Reseller.layer → Customer.layer +-- +-- The merge is internal — /alerts/config exposes only the caller's own +-- row, never the merged effective view or anyone else's row. +-- +-- Merge rules (additive-only for security-relevant fields): +-- - bool channel toggles (enabled.{email,webhook,telegram}): OR. A +-- descendant cannot disable a channel an ancestor enabled. Non-Owner +-- layers cannot store an explicit false (normalised to null on save). 
+-- - recipient lists (email/webhook/telegram): union with stable dedup. +-- Dedup keys: email→address, webhook→url, telegram→(bot_token,chat_id). +-- - per-recipient severities[]: union; if any contributor uses [] ("all +-- severities") the merged copy widens back to []. +-- +-- Mimir sees a flat YAML per tenant; the layered model is server-internal +-- and invisible to Alertmanager. + +CREATE TABLE IF NOT EXISTS alert_config_layers ( + organization_id VARCHAR(255) PRIMARY KEY, + + -- Serialized AlertingConfigLayer (Go struct): + -- { + -- "enabled": {"email": *bool, "webhook": *bool, "telegram": *bool}, + -- "email_recipients": [{address, severities[], language, format}], + -- "webhook_recipients": [{name, url, severities[]}], + -- "telegram_recipients": [{bot_token, chat_id, severities[]}] + -- } + -- Channel toggles are tri-state (null = "no opinion at this layer, + -- inherit from above"). Per-recipient severities=[] means "all". + config_json JSONB NOT NULL, + + -- Audit fields. updated_by_user_id stores the logto_id of the user who + -- last saved this layer. updated_by_name is denormalised for cheap UI + -- rendering of "who set this". + updated_by_user_id VARCHAR(255), + updated_by_name VARCHAR(255), + updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() +); + +COMMENT ON TABLE alert_config_layers IS 'Per-organization alerting config layer. Effective Mimir YAML for a tenant is the merge of all layers from Owner down to that tenant; merge is server-side only and never exposed via API.'; +COMMENT ON COLUMN alert_config_layers.config_json IS 'Serialized AlertingConfigLayer: { enabled:{email,webhook,telegram}, email_recipients[], webhook_recipients[], telegram_recipients[] }. Each recipient carries its own severities[]; email recipients additionally carry language+format. 
Channel toggles are nullable tri-state.'; diff --git a/backend/database/migrations/024_add_alert_config_layers_rollback.sql b/backend/database/migrations/024_add_alert_config_layers_rollback.sql new file mode 100644 index 00000000..7779d575 --- /dev/null +++ b/backend/database/migrations/024_add_alert_config_layers_rollback.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS alert_config_layers; diff --git a/backend/entities/local_alert_config_layers.go b/backend/entities/local_alert_config_layers.go new file mode 100644 index 00000000..0f1f4909 --- /dev/null +++ b/backend/entities/local_alert_config_layers.go @@ -0,0 +1,171 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. +SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package entities + +import ( + "database/sql" + "encoding/json" + "errors" + "fmt" + "time" + + "github.com/lib/pq" + + "github.com/nethesis/my/backend/database" + "github.com/nethesis/my/backend/models" +) + +// AlertConfigLayerRecord is one organization's saved alerting layer plus the +// audit metadata we render in the UI ("set by Mario, 5 minutes ago"). +type AlertConfigLayerRecord struct { + OrganizationID string `json:"organization_id"` + Config models.AlertingConfigLayer `json:"config"` + UpdatedByUserID *string `json:"updated_by_user_id,omitempty"` + UpdatedByName *string `json:"updated_by_name,omitempty"` + UpdatedAt time.Time `json:"updated_at"` + CreatedAt time.Time `json:"created_at"` +} + +// LocalAlertConfigLayersRepository persists hierarchical alerting config +// layers (one row per organization). +type LocalAlertConfigLayersRepository struct { + db *sql.DB +} + +func NewLocalAlertConfigLayersRepository() *LocalAlertConfigLayersRepository { + return &LocalAlertConfigLayersRepository{db: database.DB} +} + +// ErrAlertConfigLayerNotFound is returned by Get when no layer has ever been +// saved for the given org. Callers translate this to "empty layer / inherit +// from above" rather than a hard 404. 
+var ErrAlertConfigLayerNotFound = errors.New("alert config layer not found") + +// Get returns the saved layer for the given org, or ErrAlertConfigLayerNotFound +// when no row exists. Callers walking the hierarchy treat the missing-row case +// as "empty layer — inherit only". +func (r *LocalAlertConfigLayersRepository) Get(orgID string) (*AlertConfigLayerRecord, error) { + row := r.db.QueryRow( + `SELECT organization_id, config_json, updated_by_user_id, updated_by_name, updated_at, created_at + FROM alert_config_layers WHERE organization_id = $1`, + orgID, + ) + rec := &AlertConfigLayerRecord{} + var raw []byte + var by, byName sql.NullString + if err := row.Scan(&rec.OrganizationID, &raw, &by, &byName, &rec.UpdatedAt, &rec.CreatedAt); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrAlertConfigLayerNotFound + } + return nil, fmt.Errorf("get alert config layer: %w", err) + } + if err := json.Unmarshal(raw, &rec.Config); err != nil { + return nil, fmt.Errorf("decode alert config layer: %w", err) + } + if by.Valid { + rec.UpdatedByUserID = &by.String + } + if byName.Valid { + rec.UpdatedByName = &byName.String + } + return rec, nil +} + +// Upsert writes or replaces the layer for the given org. updated_at is +// refreshed; created_at is preserved by the ON CONFLICT path. +// +// Calls cfg.Validate() before writing as a defense-in-depth backstop: +// any write path bypassing the HTTP handler (admin tooling, future +// endpoints, migrations) still gets the same regex / format checks as +// the handler. DNS-aware webhook URL checks remain at the handler. 
+func (r *LocalAlertConfigLayersRepository) Upsert(orgID string, cfg models.AlertingConfigLayer, byUserID, byName string) (*AlertConfigLayerRecord, error) { + if err := cfg.Validate(); err != nil { + return nil, fmt.Errorf("validate layer: %w", err) + } + raw, err := json.Marshal(cfg) + if err != nil { + return nil, fmt.Errorf("encode alert config layer: %w", err) + } + row := r.db.QueryRow( + `INSERT INTO alert_config_layers (organization_id, config_json, updated_by_user_id, updated_by_name, updated_at, created_at) + VALUES ($1, $2::jsonb, NULLIF($3,''), NULLIF($4,''), NOW(), NOW()) + ON CONFLICT (organization_id) DO UPDATE + SET config_json = EXCLUDED.config_json, + updated_by_user_id = EXCLUDED.updated_by_user_id, + updated_by_name = EXCLUDED.updated_by_name, + updated_at = NOW() + RETURNING organization_id, config_json, updated_by_user_id, updated_by_name, updated_at, created_at`, + orgID, string(raw), byUserID, byName, + ) + rec := &AlertConfigLayerRecord{} + var rawOut []byte + var by, byNameOut sql.NullString + if err := row.Scan(&rec.OrganizationID, &rawOut, &by, &byNameOut, &rec.UpdatedAt, &rec.CreatedAt); err != nil { + return nil, fmt.Errorf("upsert alert config layer: %w", err) + } + if err := json.Unmarshal(rawOut, &rec.Config); err != nil { + return nil, fmt.Errorf("decode upsert response: %w", err) + } + if by.Valid { + rec.UpdatedByUserID = &by.String + } + if byNameOut.Valid { + rec.UpdatedByName = &byNameOut.String + } + return rec, nil +} + +// Delete removes the layer for the given org. Idempotent: returns nil even +// when no row matched. Used by the org-deletion cascade. +func (r *LocalAlertConfigLayersRepository) Delete(orgID string) error { + _, err := r.db.Exec(`DELETE FROM alert_config_layers WHERE organization_id = $1`, orgID) + if err != nil { + return fmt.Errorf("delete alert config layer: %w", err) + } + return nil +} + +// GetByOrgIDs fetches the layers for a list of orgs in a single query, +// returned as a map keyed by org_id. 
Orgs without a row are simply absent +// from the map (treated as "empty layer" by callers). +// +// Used by the merge path: when computing the effective config for tenant T, +// we resolve T's hierarchy chain (Owner→...→T) and bulk-fetch the layers in +// one round-trip. +func (r *LocalAlertConfigLayersRepository) GetByOrgIDs(orgIDs []string) (map[string]*AlertConfigLayerRecord, error) { + if len(orgIDs) == 0 { + return map[string]*AlertConfigLayerRecord{}, nil + } + rows, err := r.db.Query( + `SELECT organization_id, config_json, updated_by_user_id, updated_by_name, updated_at, created_at + FROM alert_config_layers WHERE organization_id = ANY($1)`, + pq.Array(orgIDs), + ) + if err != nil { + return nil, fmt.Errorf("get layers by org_ids: %w", err) + } + defer func() { _ = rows.Close() }() + out := make(map[string]*AlertConfigLayerRecord, len(orgIDs)) + for rows.Next() { + rec := &AlertConfigLayerRecord{} + var raw []byte + var by, byName sql.NullString + if err := rows.Scan(&rec.OrganizationID, &raw, &by, &byName, &rec.UpdatedAt, &rec.CreatedAt); err != nil { + return nil, fmt.Errorf("scan layer row: %w", err) + } + if err := json.Unmarshal(raw, &rec.Config); err != nil { + return nil, fmt.Errorf("decode layer row: %w", err) + } + if by.Valid { + rec.UpdatedByUserID = &by.String + } + if byName.Valid { + rec.UpdatedByName = &byName.String + } + out[rec.OrganizationID] = rec + } + return out, nil +} diff --git a/backend/logger/helpers.go b/backend/logger/helpers.go index 89702779..a38db694 100644 --- a/backend/logger/helpers.go +++ b/backend/logger/helpers.go @@ -119,6 +119,18 @@ func LogTokenExchange(c *gin.Context, component, tokenType string, success bool, // LogBusinessOperation logs business operations (CRUD operations) func LogBusinessOperation(c *gin.Context, component, operation, entityType, entityID string, success bool, err error) { + LogBusinessOperationDetails(c, component, operation, entityType, entityID, success, err, nil) +} + +// 
LogBusinessOperationDetails is the same as LogBusinessOperation but accepts +// an optional `details` map of structured fields that get attached to the log +// event (e.g. for audit diff "before"/"after" snapshots). Pass nil for the +// no-extra-fields case. +// +// Use this for operations where simple "operation succeeded" logging is +// insufficient and you need the audit log to record WHAT changed — e.g. +// alerting policy edits where forensics need to reconstruct the prior state. +func LogBusinessOperationDetails(c *gin.Context, component, operation, entityType, entityID string, success bool, err error, details map[string]interface{}) { logger := RequestLogger(c, component) event := logger.Info() @@ -132,6 +144,10 @@ func LogBusinessOperation(c *gin.Context, component, operation, entityType, enti Str("entity_id", entityID). Bool("success", success) + for k, v := range details { + event.Interface(k, v) + } + if err != nil { event.Err(err) } diff --git a/backend/main.go b/backend/main.go index edd39982..58a75f28 100644 --- a/backend/main.go +++ b/backend/main.go @@ -269,7 +269,9 @@ func main() { } // =========================================== - // ALERTS - resource-based permission validation (read:systems for GET, manage:systems for POST/DELETE) + // ALERTS - operations stay on `systems` (read:systems / manage:systems); + // configuration policy is gated by the dedicated `alerts` resource so + // only Admin/Super can rewrite the layered MSP policy. 
// =========================================== alertsGroup := customAuthWithAudit.Group("/alerts", middleware.RequireResourcePermission("systems")) { @@ -287,10 +289,26 @@ func main() { // Per-alert audit timeline (silence created/updated/removed events for the alert detail drawer) alertsGroup.GET("/:fingerprint/activity", methods.GetAlertActivity) - // Configuration management - alertsGroup.GET("/config", methods.GetAlertingConfig) // Get current alerting configuration - alertsGroup.POST("/config", methods.ConfigureAlerts) // Configure alert routing (manage:systems required) - alertsGroup.DELETE("/config", methods.DisableAlerts) // Disable all alerts (manage:systems required) + // Configuration management (per-org layered model) — gated on the + // dedicated `alerts` resource. GET → read:alerts, POST/DELETE → manage:alerts. + // The handler always operates on the caller's own organization layer; merged + // effective views never leave the backend (only the local server-side render + // to Mimir consumes them). + // + // MaxBodySize(1 MiB) caps the JSON payload before binding to prevent + // memory-exhaustion DoS via crafted oversized layers. With the + // per-field `max=N` constraints in models.AlertingConfigLayer the + // realistic worst case is well under 64 KB; 1 MiB leaves comfortable + // headroom for legitimate use of the full recipient lists. 
+ configGroup := alertsGroup.Group("/config", + middleware.RequireResourcePermission("alerts"), + middleware.MaxBodySize(1<<20), + ) + { + configGroup.GET("", methods.GetAlertingConfig) // Caller's own layer (no inherited / merged view leaks to descendants) + configGroup.POST("", methods.ConfigureAlerts) // Save caller's layer + propagate to descendants (manage:alerts required) + configGroup.DELETE("", methods.DisableAlerts) // Remove caller's layer + propagate to descendants (manage:alerts required) + } } // =========================================== diff --git a/backend/methods/alerting.go b/backend/methods/alerting.go index 51ac26e3..e7fa0dde 100644 --- a/backend/methods/alerting.go +++ b/backend/methods/alerting.go @@ -25,7 +25,6 @@ import ( "github.com/gin-gonic/gin" "github.com/lib/pq" - "github.com/nethesis/my/backend/configuration" "github.com/nethesis/my/backend/database" "github.com/nethesis/my/backend/entities" "github.com/nethesis/my/backend/helpers" @@ -39,12 +38,6 @@ import ( const defaultSystemAlertSilenceDurationMinutes = 60 const defaultSystemAlertSilenceComment = "silenced from my" -// systemKeyPattern restricts SystemOverride.SystemKey to characters that are safe -// to embed verbatim in an Alertmanager matcher (which is rendered inside a -// double-quoted YAML scalar). This prevents a user from injecting additional -// matcher labels or breaking out of the YAML scalar with quotes or backslashes. -var systemKeyPattern = regexp.MustCompile(`^[A-Za-z0-9_:.\-]+$`) - // webhookHostDenylist rejects URLs whose host resolves to loopback, link-local, // cloud metadata service, RFC1918 private ranges, or unspecified addresses. // See validateWebhookURL. 
@@ -176,121 +169,273 @@ func resolveOrgScope(c *gin.Context, user *models.User) ([]string, bool) { return result, true } -// ConfigureAlerts handles POST /api/alerts/config -func ConfigureAlerts(c *gin.Context) { - user, ok := helpers.GetUserFromContext(c) - if !ok { - return +// configPropagationFanoutTimeout caps how long a POST /alerts/config call +// will wait for re-rendering+pushing the effective config across descendant +// tenants. Tuned so a Owner save with hundreds of tenants completes in +// reasonable time without holding the request open too long. +const configPropagationFanoutTimeout = 30 * time.Second + +// configPropagationFanoutConcurrency limits simultaneous in-flight Mimir +// pushes to avoid opening hundreds of sockets when an Owner saves. +const configPropagationFanoutConcurrency = 10 + +// alertLayerMutexes guards per-organization layer save+propagate operations. +// Two parallel POSTs/DELETEs for the same org would otherwise race at the +// Mimir push step: the DB upsert atomically last-write-wins, but the two +// fan-outs run concurrently and the slower-arriving push can land AFTER +// the faster one — leaving Mimir with stale state while the DB holds the +// newer layer. We serialise per-org to make save+propagate a critical +// section. Different orgs are independent (no global lock). +// +// Single-process scope. If/when the backend is deployed multi-instance, +// swap this for a Postgres advisory lock keyed on the same org_id. +var alertLayerMutexes sync.Map // map[string]*sync.Mutex + +func acquireOrgLayerLock(orgID string) func() { + mu, _ := alertLayerMutexes.LoadOrStore(orgID, &sync.Mutex{}) + m := mu.(*sync.Mutex) + m.Lock() + return m.Unlock +} + +// snapshotLayerForAudit produces a JSON-serialisable, secret-redacted snapshot +// of a layer record suitable for inclusion in audit log details. The unredacted +// layer is on disk in alert_config_layers; the audit log only needs to record +// what changed, not the secrets themselves. 
+func snapshotLayerForAudit(rec *entities.AlertConfigLayerRecord) map[string]interface{} { + if rec == nil { + return nil + } + cfg := alerting.RedactLayerForAudit(rec.Config) + return map[string]interface{}{ + "organization_id": rec.OrganizationID, + "config": cfg, + "updated_by_user_id": rec.UpdatedByUserID, + "updated_by_name": rec.UpdatedByName, + "updated_at": rec.UpdatedAt, } +} - orgID, ok := resolveOrgID(c, user) - if !ok { - return +// snapshotLayerBodyForAudit captures the inbound layer body (post-Normalize) +// before persistence, so the audit "after" reflects what we intend to write. +// Same redaction policy as snapshotLayerForAudit. +func snapshotLayerBodyForAudit(orgID string, layer models.AlertingConfigLayer) map[string]interface{} { + cfg := alerting.RedactLayerForAudit(layer) + return map[string]interface{}{ + "organization_id": orgID, + "config": cfg, } - if !requireOrgID(c, orgID) { +} + +// ConfigureAlerts handles POST /api/alerts/config — writes the CALLER's +// alerting layer (one row per organization in alert_config_layers) and +// propagates the change by re-rendering and re-pushing the effective Mimir +// config for every tenant in the caller's hierarchy. +// +// Per the additive model, descendants can ADD recipients/severity rules but +// cannot disable channels enabled by ancestors: NormalizeLayerForRole strips +// any explicit *bool=&false from non-Owner layers before storage. +// +// Returns a `warnings[]` array listing per-tenant push failures (timeout, +// 5xx, etc.). The caller's layer is saved regardless of push outcome — Mimir +// can be reconciled by saving again. 
+func ConfigureAlerts(c *gin.Context) { + user, ok := helpers.GetUserFromContext(c) + if !ok { return } - var req models.AlertingConfig + var req models.AlertingConfigLayer if err := c.ShouldBindJSON(&req); err != nil { + // MaxBytesReader (registered via middleware.MaxBodySize on the route) + // surfaces as a "http: request body too large" error here; map to 413 + // so the client can distinguish "too big" from "malformed". + if strings.Contains(err.Error(), "request body too large") { + c.JSON(http.StatusRequestEntityTooLarge, response.Error(http.StatusRequestEntityTooLarge, "request body exceeds the configured maximum", nil)) + return + } c.JSON(http.StatusBadRequest, response.BadRequest("invalid request body: "+err.Error(), nil)) return } - // Validate severity values - validSeverities := map[string]bool{"critical": true, "warning": true, "info": true} - for _, severity := range req.Severities { - if !validSeverities[severity.Severity] { - c.JSON(http.StatusBadRequest, response.BadRequest("invalid severity level: "+severity.Severity+". allowed: critical, warning, info", nil)) - return - } - if err := validateWebhookReceivers(severity.WebhookReceivers); err != nil { - c.JSON(http.StatusBadRequest, response.BadRequest(err.Error(), nil)) - return - } + // Webhook URLs go through DNS-aware validation (denylist resolution, + // loopback/private-range rejection). The model's stateless Validate + // covers format/structure for everything else (email format, severity + // enum, language, format). + if err := validateWebhookRecipients(req.WebhookRecipients); err != nil { + c.JSON(http.StatusBadRequest, response.BadRequest(err.Error(), nil)) + return } - // Validate per-system overrides: SystemKey is rendered verbatim into an - // Alertmanager matcher, so it must be restricted to a safe character set. 
- for _, sys := range req.Systems { - if !systemKeyPattern.MatchString(sys.SystemKey) { - c.JSON(http.StatusBadRequest, response.BadRequest("invalid system_key: only alphanumeric characters and _ : . - are allowed", nil)) - return - } - if err := validateWebhookReceivers(sys.WebhookReceivers); err != nil { - c.JSON(http.StatusBadRequest, response.BadRequest(err.Error(), nil)) - return - } - } + // Enforce additive-only contract: descendants cannot encode "disable + // channel" by writing explicit false. Owner is exempt (top of the chain + // can globally turn a channel off, though OR with descendants may bring + // it back). + alerting.NormalizeLayerForRole(&req, user.OrgRole) - if err := validateWebhookReceivers(req.WebhookReceivers); err != nil { - c.JSON(http.StatusBadRequest, response.BadRequest(err.Error(), nil)) - return + // Serialise save+propagate per-org: prevents two concurrent saves from + // racing at the Mimir push step, where the slower-arriving fan-out can + // land AFTER the faster one and leave Mimir with stale state while the + // DB holds the newer layer. + releaseLock := acquireOrgLayerLock(user.OrganizationID) + defer releaseLock() + + // Persist the caller's layer. Capture the previous layer (if any) BEFORE + // the upsert so the audit log records the actual diff that was applied. 
+ layerRepo := entities.NewLocalAlertConfigLayersRepository() + prevLayer, prevErr := layerRepo.Get(user.OrganizationID) + if prevErr != nil && !errors.Is(prevErr, entities.ErrAlertConfigLayerNotFound) { + logger.Warn().Err(prevErr).Str("org_id", user.OrganizationID).Msg("failed to read previous layer for audit; continuing") + prevLayer = nil } - // Validate email template language - if req.EmailTemplateLang != "" && !slices.Contains(alerting.ValidTemplateLangs, req.EmailTemplateLang) { - c.JSON(http.StatusBadRequest, response.BadRequest("invalid email_template_lang: allowed values are "+strings.Join(alerting.ValidTemplateLangs, ", "), nil)) + updatedBy := "" + if user.LogtoID != nil { + updatedBy = *user.LogtoID + } + if _, err := layerRepo.Upsert(user.OrganizationID, req, updatedBy, user.Name); err != nil { + logger.Error().Err(err).Str("org_id", user.OrganizationID).Msg("failed to save alert config layer") + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to save alert config: "+err.Error(), nil)) return } - cfg := configuration.Config - yamlConfig, err := alerting.RenderConfig( - cfg.SMTPHost, cfg.SMTPPort, cfg.SMTPUsername, cfg.SMTPPassword, cfg.SMTPFrom, cfg.SMTPTLS, - cfg.AlertingHistoryWebhookURL, cfg.AlertingHistoryWebhookSecret, - &req, - ) + // Propagate: the caller's save affects the effective config of all + // descendants in their hierarchy (including self). Walk the descendant + // list, fan-out re-render+push to Mimir for each. + userService := local.NewUserService() + descendants, err := userService.GetHierarchicalOrganizationIDs(user.OrgRole, user.OrganizationID) if err != nil { - c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to render alertmanager config: "+err.Error(), nil)) + // Layer saved but couldn't enumerate descendants — return success on + // save with a warning; reconciliation possible by re-saving. 
+ logger.Warn().Err(err).Str("org_id", user.OrganizationID).Msg("layer saved but hierarchy enumeration failed") + auditDetails := map[string]interface{}{ + "before": snapshotLayerForAudit(prevLayer), + "after": snapshotLayerBodyForAudit(user.OrganizationID, req), + } + logger.LogBusinessOperationDetails(c, "alerts", "save_layer", "alert_config_layer", user.OrganizationID, true, nil, auditDetails) + c.JSON(http.StatusOK, response.OK("alerting layer saved (propagation skipped)", gin.H{ + "warnings": []string{fmt.Sprintf("hierarchy: %s", err.Error())}, + })) return } - templateFiles, err := alerting.BuildTemplateFiles(req.EmailTemplateLang, cfg.AppURL) - if err != nil { - c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to load alert email templates: "+err.Error(), nil)) - return + warnings := propagateAlertingConfigToTenants(c.Request.Context(), descendants) + auditDetails := map[string]interface{}{ + "before": snapshotLayerForAudit(prevLayer), + "after": snapshotLayerBodyForAudit(user.OrganizationID, req), + "affected_tenants": len(descendants), + "propagation_warnings": len(warnings), } + logger.LogBusinessOperationDetails(c, "alerts", "save_layer", "alert_config_layer", user.OrganizationID, true, nil, auditDetails) + c.JSON(http.StatusOK, response.OK("alerting configuration updated successfully", gin.H{ + "warnings": warnings, + "propagated_to": len(descendants) - len(warnings), + "affected_tenants": len(descendants), + })) +} - if err := alerting.PushConfig(orgID, yamlConfig, templateFiles); err != nil { - c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to push config to mimir: "+err.Error(), nil)) - return +// propagateAlertingConfigToTenants re-renders and re-pushes the effective +// Mimir config for each tenant in the list, with bounded concurrency and a +// global timeout. Per-tenant errors are collected as warnings (string +// `org <tenant>: <error>`) and returned; non-erroring tenants are pushed +// successfully.
Always returns a non-nil slice. +func propagateAlertingConfigToTenants(parent context.Context, tenants []string) []string { + warnings := []string{} + if len(tenants) == 0 { + return warnings } + var ( + mu sync.Mutex + wg sync.WaitGroup + ) + ctx, cancel := context.WithTimeout(parent, configPropagationFanoutTimeout) + defer cancel() + sem := make(chan struct{}, configPropagationFanoutConcurrency) - c.JSON(http.StatusOK, response.OK("alerting configuration updated successfully", nil)) + for _, tenant := range tenants { + wg.Add(1) + go func(tenant string) { + defer wg.Done() + select { + case sem <- struct{}{}: + defer func() { <-sem }() + case <-ctx.Done(): + mu.Lock() + warnings = append(warnings, fmt.Sprintf("org %s: timed out waiting for slot", tenant)) + mu.Unlock() + return + } + if err := alerting.RenderAndPushEffective(ctx, tenant); err != nil { + logger.Warn().Err(err).Str("org_id", tenant).Msg("config propagation failed") + mu.Lock() + warnings = append(warnings, fmt.Sprintf("org %s: %s", tenant, err.Error())) + mu.Unlock() + } + }(tenant) + } + wg.Wait() + return warnings } -// DisableAlerts handles DELETE /api/alerts/config +// DisableAlerts handles DELETE /api/alerts/config — removes the CALLER's +// alerting layer entirely. The effective config of all descendant tenants +// is re-rendered as the merge of the remaining ancestor layers (so the +// caller's contribution disappears but ancestor recipients/severity rules +// are preserved). To completely silence a tenant's alerting, every layer +// in its chain must drop its contribution; alternatively the Owner can do +// it globally by removing their own layer. func DisableAlerts(c *gin.Context) { user, ok := helpers.GetUserFromContext(c) if !ok { return } - orgID, ok := resolveOrgID(c, user) - if !ok { - return + // Same critical section as ConfigureAlerts: serialise per-org so a + // concurrent save+delete race cannot leave Mimir with stale state. 
+ releaseLock := acquireOrgLayerLock(user.OrganizationID) + defer releaseLock() + + layerRepo := entities.NewLocalAlertConfigLayersRepository() + + // Capture pre-delete snapshot for audit so the log records what was + // removed, not just "delete_layer". + prevLayer, prevErr := layerRepo.Get(user.OrganizationID) + if prevErr != nil && !errors.Is(prevErr, entities.ErrAlertConfigLayerNotFound) { + logger.Warn().Err(prevErr).Str("org_id", user.OrganizationID).Msg("failed to read previous layer for audit; continuing") + prevLayer = nil } - if !requireOrgID(c, orgID) { + + if err := layerRepo.Delete(user.OrganizationID); err != nil { + logger.Error().Err(err).Str("org_id", user.OrganizationID).Msg("failed to delete alert config layer") + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to delete alert config: "+err.Error(), nil)) return } - cfg := configuration.Config - yamlConfig, err := alerting.RenderConfig( - cfg.SMTPHost, cfg.SMTPPort, cfg.SMTPUsername, cfg.SMTPPassword, cfg.SMTPFrom, cfg.SMTPTLS, - cfg.AlertingHistoryWebhookURL, cfg.AlertingHistoryWebhookSecret, - nil, - ) + userService := local.NewUserService() + descendants, err := userService.GetHierarchicalOrganizationIDs(user.OrgRole, user.OrganizationID) if err != nil { - c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to render blackhole config: "+err.Error(), nil)) + logger.Warn().Err(err).Str("org_id", user.OrganizationID).Msg("layer deleted but hierarchy enumeration failed") + auditDetails := map[string]interface{}{"before": snapshotLayerForAudit(prevLayer), "after": nil} + logger.LogBusinessOperationDetails(c, "alerts", "delete_layer", "alert_config_layer", user.OrganizationID, true, nil, auditDetails) + c.JSON(http.StatusOK, response.OK("alerting layer removed (propagation skipped)", gin.H{ + "warnings": []string{fmt.Sprintf("hierarchy: %s", err.Error())}, + })) return } - if err := alerting.PushConfig(orgID, yamlConfig, nil); err != nil { - 
c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to push config to mimir: "+err.Error(), nil)) - return + warnings := propagateAlertingConfigToTenants(c.Request.Context(), descendants) + auditDetails := map[string]interface{}{ + "before": snapshotLayerForAudit(prevLayer), + "after": nil, + "affected_tenants": len(descendants), + "propagation_warnings": len(warnings), } - - c.JSON(http.StatusOK, response.OK("all alerts disabled successfully", nil)) + logger.LogBusinessOperationDetails(c, "alerts", "delete_layer", "alert_config_layer", user.OrganizationID, true, nil, auditDetails) + c.JSON(http.StatusOK, response.OK("alerting layer removed successfully", gin.H{ + "warnings": warnings, + "propagated_to": len(descendants) - len(warnings), + "affected_tenants": len(descendants), + })) } // alertsListDefaultPageSize matches the per-list default the project uses @@ -630,53 +775,51 @@ func enrichAlertsWithSystemInfo(orgID string, alerts []map[string]interface{}) { } } -// GetAlertingConfig handles GET /api/alerts/config -// By default returns structured JSON parsed from Mimir YAML. -// Use ?format=yaml to get the raw (redacted) YAML. +// GetAlertingConfig handles GET /api/alerts/config — returns the CALLER's +// own alerting layer. Returns an empty layer (with audit metadata absent) +// when the caller has never saved one; the frontend renders the empty-state +// form on top of it. +// +// Nothing else is exposed: no inherited ancestor layers, no merged +// effective view. Every organization sees only its own configuration, +// regardless of role. The merge happens server-side at render time and +// stays inside the backend. 
func GetAlertingConfig(c *gin.Context) { user, ok := helpers.GetUserFromContext(c) if !ok { return } - orgID, ok := resolveOrgID(c, user) - if !ok { - return - } - if !requireOrgID(c, orgID) { - return - } - - body, err := alerting.GetConfig(orgID) - if err != nil { - c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to fetch alerting config from mimir: "+err.Error(), nil)) - return - } - - // No config exists for this tenant (Mimir 404 or empty body). Return null - // so the frontend can show the "no configuration found" empty state. - if len(body) == 0 { - c.JSON(http.StatusOK, response.OK("alerting configuration retrieved successfully", gin.H{ - "config": nil, - })) + repo := entities.NewLocalAlertConfigLayersRepository() + rec, err := repo.Get(user.OrganizationID) + if err != nil && !errors.Is(err, entities.ErrAlertConfigLayerNotFound) { + logger.Error().Err(err).Str("org_id", user.OrganizationID).Msg("failed to load alert config layer") + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to load alert config", nil)) return } - if c.Query("format") == "yaml" { - c.JSON(http.StatusOK, response.OK("alerting configuration retrieved successfully", gin.H{ - "config": alerting.RedactSensitiveConfig(string(body)), + if rec == nil { + // First-time view: emit an empty layer body so the UI can render + // without a null-check, plus null audit fields the UI uses to detect + // the "never saved" state. 
+ c.JSON(http.StatusOK, response.OK("alerting layer retrieved successfully", gin.H{ + "enabled": models.ChannelToggles{}, + "email_recipients": []models.EmailRecipient{}, + "webhook_recipients": []models.WebhookRecipient{}, + "telegram_recipients": []models.TelegramRecipient{}, + "updated_by_name": nil, + "updated_at": nil, })) return } - cfg, err := alerting.ParseConfig(string(body)) - if err != nil { - c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to parse alerting config: "+err.Error(), nil)) - return - } - - c.JSON(http.StatusOK, response.OK("alerting configuration retrieved successfully", gin.H{ - "config": cfg, + c.JSON(http.StatusOK, response.OK("alerting layer retrieved successfully", gin.H{ + "enabled": rec.Config.Enabled, + "email_recipients": rec.Config.EmailRecipients, + "webhook_recipients": rec.Config.WebhookRecipients, + "telegram_recipients": rec.Config.TelegramRecipients, + "updated_by_name": rec.UpdatedByName, + "updated_at": rec.UpdatedAt, })) } @@ -1634,19 +1777,31 @@ func parseDateRange(c *gin.Context) (*time.Time, *time.Time, error) { return from, to, nil } -// validateWebhookReceivers enforces that every webhook URL is a plain http/https +// validateWebhookRecipients enforces that every webhook URL is a plain http/https // URL pointing to a publicly-routable host. This protects Mimir's Alertmanager // (which dispatches alert payloads from inside the internal network) from being // abused as a blind SSRF relay to loopback, metadata, or private-range hosts. 
-func validateWebhookReceivers(receivers []models.WebhookReceiver) error { - for _, r := range receivers { +func validateWebhookRecipients(recipients []models.WebhookRecipient) error { + for _, r := range recipients { if err := validateWebhookURL(r.URL); err != nil { - return fmt.Errorf("webhook receiver %q: %w", r.Name, err) + return fmt.Errorf("webhook recipient %q: %w", r.Name, err) } } return nil } +// fqdnPattern restricts non-IP-literal webhook hostnames to RFC 1123 label +// syntax (letters, digits, inner hyphens, optional trailing dot). Note that it +// still matches alternative IP encodings (decimal "2130706433", octal +// "0177.0.0.1", hex "0x7f.0.0.1") that some resolvers (notably glibc) would +// interpret as 127.0.0.1: all-digit forms are rejected by the letter check in +// validateWebhookURL, the rest only by the checks on the resolved addresses. +var fqdnPattern = regexp.MustCompile(`^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*\.?$`) + +// cgnatRange is the carrier-grade NAT range (RFC6598). Not covered by +// net.IP.IsPrivate() and a real bypass risk on cloud and ISP networks. +var cgnatRange = &net.IPNet{IP: net.IPv4(100, 64, 0, 0).To4(), Mask: net.CIDRMask(10, 32)} + func validateWebhookURL(raw string) error { u, err := url.Parse(raw) if err != nil { @@ -1671,31 +1826,55 @@ func validateWebhookURL(raw string) error { } } - // Reject IP literals pointing to private/loopback/link-local/multicast/unspecified. - // Also handles IPv6-mapped IPv4 (e.g. ::ffff:127.0.0.1) via ip.To4() unmasking. - if ip := net.ParseIP(host); ip != nil { - if err := rejectNonPublicIP(ip); err != nil { - return fmt.Errorf("webhook url host %q: %w", host, err) + // Strict input shape. Either: + // 1) a valid IP literal (v4 or v6), or + // 2) a hostname in RFC 1123 label syntax (alphanumeric-prefixed labels, optional trailing dot) + // AND containing at least one alphabetic character.
+ // The "at least one letter" rule rejects all-digit hosts that some resolvers + // (notably macOS BSD libc) interpret as alternative IP encodings. Concrete + // case caught: "0177.0.0.1" — net.ParseIP returns nil, fqdnPattern matches, + // but BSD getaddrinfo strips the leading zero and resolves to "177.0.0.1" + // (a public IP unrelated to the user's intent of 127.0.0.1). Without the + // letter requirement, the URL would be saved with an unexpected destination. + parsedIP := net.ParseIP(host) + if parsedIP == nil { + if !fqdnPattern.MatchString(host) { + return fmt.Errorf("webhook url host %q is not a canonical hostname or IP literal", host) + } + if !strings.ContainsAny(host, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") { + return fmt.Errorf("webhook url host %q must be either an IP literal or a hostname with at least one letter", host) } } - // DNS resolution: resolve the hostname and reject if ANY address is private. - // This mitigates DNS rebinding attacks where the hostname initially resolves - // to a public IP at validation time but later resolves to an internal IP when - // Alertmanager delivers the webhook. - if net.ParseIP(host) == nil { - addrs, err := net.LookupHost(host) - if err != nil { - return fmt.Errorf("webhook url host %q: dns resolution failed: %w", host, err) + // IP literal path: reject loopback / private / link-local / multicast / + // unspecified / CGNAT. Handles IPv6-mapped IPv4 via ip.To4() unmasking. 
+ if parsedIP != nil { + if err := rejectNonPublicIP(parsedIP); err != nil { + return fmt.Errorf("webhook url host %q: %w", host, err) } - for _, addr := range addrs { - ip := net.ParseIP(addr) - if ip == nil { - return fmt.Errorf("webhook url host %q resolved to unparseable address %q", host, addr) - } - if err := rejectNonPublicIP(ip); err != nil { - return fmt.Errorf("webhook url host %q resolved to non-public address %s: %w", host, addr, err) - } + return nil + } + + // FQDN path: resolve and reject if ANY returned address is non-public. + // NOTE on DNS rebinding: this validation is point-in-time; Mimir's + // Alertmanager re-resolves the hostname when delivering the webhook. + // A short-TTL record can resolve to a public IP here and to a private + // one at delivery. Authoritative mitigation requires either pinning the + // resolved IP into the URL pushed to Mimir (breaks legitimate cloud-LB + // hosts whose IPs rotate) or running the egress through a proxy that + // re-validates per-request. Today we accept the residual risk and rely + // on Mimir-side network ACLs to backstop egress to private ranges. + addrs, err := net.LookupHost(host) + if err != nil { + return fmt.Errorf("webhook url host %q: dns resolution failed: %w", host, err) + } + for _, addr := range addrs { + ip := net.ParseIP(addr) + if ip == nil { + return fmt.Errorf("webhook url host %q resolved to unparseable address %q", host, addr) + } + if err := rejectNonPublicIP(ip); err != nil { + return fmt.Errorf("webhook url host %q resolved to non-public address %s: %w", host, addr, err) } } @@ -1703,8 +1882,8 @@ func validateWebhookURL(raw string) error { } // rejectNonPublicIP returns an error if the IP is loopback, private, link-local, -// multicast, or unspecified. For IPv6-mapped IPv4 addresses (::ffff:A.B.C.D), -// the underlying IPv4 is checked. +// multicast, unspecified, or in the carrier-grade NAT range (RFC6598). 
For +// IPv6-mapped IPv4 addresses (::ffff:A.B.C.D), the underlying IPv4 is checked. func rejectNonPublicIP(ip net.IP) error { // Unmask IPv6-mapped IPv4 so checks work on the real address. if v4 := ip.To4(); v4 != nil { @@ -1715,5 +1894,9 @@ func rejectNonPublicIP(ip net.IP) error { ip.IsMulticast() || ip.IsUnspecified() { return fmt.Errorf("address is not publicly routable") } + // IsPrivate covers RFC1918 only — not CGNAT (100.64.0.0/10). + if v4 := ip.To4(); v4 != nil && cgnatRange.Contains(v4) { + return fmt.Errorf("address is in the carrier-grade NAT range (100.64.0.0/10)") + } return nil } diff --git a/backend/methods/customers.go b/backend/methods/customers.go index c579bc5e..d8fc69cf 100644 --- a/backend/methods/customers.go +++ b/backend/methods/customers.go @@ -79,15 +79,14 @@ func CreateCustomer(c *gin.Context) { // Log the action logger.LogBusinessOperation(c, "customers", "create", "customer", customer.ID, true, nil) - // Auto-provision default alerting configuration so the built-in history webhook - // is active from day one. Mail and webhook notifications are always disabled on - // creation; the customer's email (if present) is stored as a pre-filled - // recipient but must be explicitly enabled. Failure is logged but does not block - // customer creation. + // Auto-provision default alerting configuration so the built-in history + // webhook is active from day one and any ancestor layers propagate to + // Mimir for the new tenant. The new customer starts with no layer of + // its own; admins opt in to notifications by saving a layer via POST + // /alerts/config. Failure is logged but does not block customer + // creation. 
if customer.LogtoID != nil && *customer.LogtoID != "" { - defaultEmail, _ := customer.CustomData["email"].(string) - defaultLang, _ := customer.CustomData["language"].(string) - if err := alerting.ProvisionDefaultConfig(*customer.LogtoID, defaultEmail, defaultLang); err != nil { + if err := alerting.ProvisionDefaultConfig(*customer.LogtoID); err != nil { logger.Warn(). Err(err). Str("customer_id", customer.ID). diff --git a/backend/methods/distributors.go b/backend/methods/distributors.go index 379b8ada..7792714c 100644 --- a/backend/methods/distributors.go +++ b/backend/methods/distributors.go @@ -79,15 +79,14 @@ func CreateDistributor(c *gin.Context) { // Log the action logger.LogBusinessOperation(c, "distributors", "create", "distributor", distributor.ID, true, nil) - // Auto-provision default alerting configuration so the built-in history webhook - // is active from day one. Mail and webhook notifications are always disabled on - // creation; the distributor's email (if present) is stored as a pre-filled - // recipient but must be explicitly enabled. Failure is logged but does not block - // distributor creation. + // Auto-provision default alerting configuration so the built-in history + // webhook is active from day one and any ancestor layers (Owner already + // configured) propagate to Mimir for the new tenant. The new distributor + // starts with no layer of its own; admins opt in to notifications by + // saving a layer via POST /alerts/config. Failure is logged but does + // not block distributor creation. if distributor.LogtoID != nil && *distributor.LogtoID != "" { - defaultEmail, _ := distributor.CustomData["email"].(string) - defaultLang, _ := distributor.CustomData["language"].(string) - if err := alerting.ProvisionDefaultConfig(*distributor.LogtoID, defaultEmail, defaultLang); err != nil { + if err := alerting.ProvisionDefaultConfig(*distributor.LogtoID); err != nil { logger.Warn(). Err(err). Str("distributor_id", distributor.ID). 
diff --git a/backend/methods/resellers.go b/backend/methods/resellers.go index bdf45345..e1f9e0b0 100644 --- a/backend/methods/resellers.go +++ b/backend/methods/resellers.go @@ -79,15 +79,14 @@ func CreateReseller(c *gin.Context) { // Log the action logger.LogBusinessOperation(c, "resellers", "create", "reseller", reseller.ID, true, nil) - // Auto-provision default alerting configuration so the built-in history webhook - // is active from day one. Mail and webhook notifications are always disabled on - // creation; the reseller's email (if present) is stored as a pre-filled - // recipient but must be explicitly enabled. Failure is logged but does not block - // reseller creation. + // Auto-provision default alerting configuration so the built-in history + // webhook is active from day one and any ancestor layers (Owner / + // Distributor) propagate to Mimir for the new tenant. The new reseller + // starts with no layer of its own; admins opt in to notifications by + // saving a layer via POST /alerts/config. Failure is logged but does + // not block reseller creation. if reseller.LogtoID != nil && *reseller.LogtoID != "" { - defaultEmail, _ := reseller.CustomData["email"].(string) - defaultLang, _ := reseller.CustomData["language"].(string) - if err := alerting.ProvisionDefaultConfig(*reseller.LogtoID, defaultEmail, defaultLang); err != nil { + if err := alerting.ProvisionDefaultConfig(*reseller.LogtoID); err != nil { logger.Warn(). Err(err). Str("reseller_id", reseller.ID). diff --git a/backend/middleware/body_limit.go b/backend/middleware/body_limit.go new file mode 100644 index 00000000..3fdcc77a --- /dev/null +++ b/backend/middleware/body_limit.go @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2026 Nethesis S.r.l. 
+ * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package middleware + +import ( + "net/http" + + "github.com/gin-gonic/gin" + + "github.com/nethesis/my/backend/response" +) + +// MaxBodySize wraps the request body in http.MaxBytesReader so that handlers +// downstream (typically c.ShouldBindJSON) cannot allocate more than `limit` +// bytes when decoding the body. Use on routes whose payload should never +// exceed a known cap, to prevent memory-exhaustion DoS via crafted oversized +// JSON. +// +// The middleware itself never writes a response or aborts: on overflow the body +// read fails and the handler must map that error to 413 (see PayloadTooLarge). +func MaxBodySize(limit int64) gin.HandlerFunc { + return func(c *gin.Context) { + c.Request.Body = http.MaxBytesReader(c.Writer, c.Request.Body, limit) + c.Next() + } +} + +// PayloadTooLarge writes a structured 413 response and aborts. Helper for +// handlers that catch http.MaxBytesReader errors during binding. +func PayloadTooLarge(c *gin.Context, msg string) { + c.AbortWithStatusJSON(http.StatusRequestEntityTooLarge, response.Error(http.StatusRequestEntityTooLarge, msg, nil)) +} diff --git a/backend/models/alerting.go b/backend/models/alerting.go index 2bd11531..1fa4ffa4 100644 --- a/backend/models/alerting.go +++ b/backend/models/alerting.go @@ -5,57 +5,188 @@ SPDX-License-Identifier: AGPL-3.0-or-later package models -// WebhookReceiver represents a generic webhook receiver with name and URL -type WebhookReceiver struct { - Name string `json:"name" binding:"required,max=100"` - URL string `json:"url" binding:"required,url,max=2048"` -} - -// TelegramReceiver represents a Telegram bot notification target. -// BotToken is the secret token obtained from @BotFather. -// ChatID is the numeric identifier of the target chat (user, group, or channel).
-type TelegramReceiver struct {
-	BotToken string `json:"bot_token" binding:"required"`
-	ChatID   int64  `json:"chat_id" binding:"required"`
-}
-
-// SeverityOverride defines mail/webhook/telegram settings for a specific severity level
-type SeverityOverride struct {
-	Severity          string             `json:"severity" binding:"required,oneof=critical warning info"`
-	MailEnabled       *bool              `json:"mail_enabled"`
-	WebhookEnabled    *bool              `json:"webhook_enabled"`
-	TelegramEnabled   *bool              `json:"telegram_enabled"`
-	MailAddresses     []string           `json:"mail_addresses,omitempty" binding:"max=50,dive,email"`
-	WebhookReceivers  []WebhookReceiver  `json:"webhook_receivers,omitempty" binding:"max=20"`
-	TelegramReceivers []TelegramReceiver `json:"telegram_receivers,omitempty" binding:"max=20"`
-}
-
-// SystemOverride defines mail/webhook/telegram settings for a specific system_key
-type SystemOverride struct {
-	SystemKey         string             `json:"system_key" binding:"required"`
-	MailEnabled       *bool              `json:"mail_enabled"`
-	WebhookEnabled    *bool              `json:"webhook_enabled"`
-	TelegramEnabled   *bool              `json:"telegram_enabled"`
-	MailAddresses     []string           `json:"mail_addresses,omitempty" binding:"max=50,dive,email"`
-	WebhookReceivers  []WebhookReceiver  `json:"webhook_receivers,omitempty" binding:"max=20"`
-	TelegramReceivers []TelegramReceiver `json:"telegram_receivers,omitempty" binding:"max=20"`
-}
-
-// AlertingConfig is the main configuration structure for alerting
-type AlertingConfig struct {
-	// Global settings
-	MailEnabled       bool               `json:"mail_enabled"`
-	WebhookEnabled    bool               `json:"webhook_enabled"`
-	TelegramEnabled   bool               `json:"telegram_enabled"`
-	MailAddresses     []string           `json:"mail_addresses" binding:"max=50,dive,email"`
-	WebhookReceivers  []WebhookReceiver  `json:"webhook_receivers" binding:"max=20"`
-	TelegramReceivers []TelegramReceiver `json:"telegram_receivers" binding:"max=20"`
-	// Per-severity overrides
-	Severities []SeverityOverride `json:"severities,omitempty" binding:"max=10"`
-	// Per-system_key overrides
-	Systems []SystemOverride `json:"systems,omitempty" binding:"max=500"`
-	// Email template language: "en" (default) or "it"
-	EmailTemplateLang string `json:"email_template_lang,omitempty"`
+import (
+	"fmt"
+	"net/mail"
+	"net/url"
+	"strings"
+)
+
+// validSeverities is the canonical Alertmanager severity set we route on.
+// Recipients carry a `severities[]` field; empty means "all severities".
+// The keys mirror the `oneof=critical warning info` binding tag on the
+// recipient structs; validateSeverities consults this set.
+var validSeverities = map[string]struct{}{
+	"critical": {},
+	"warning":  {},
+	"info":     {},
+}
+
+// validLanguages restricts EmailRecipient.Language to the set of templates
+// shipped under services/alerting/templates/. The keys mirror the
+// `oneof=en it` binding tag on EmailRecipient.Language.
+var validLanguages = map[string]struct{}{
+	"en": {},
+	"it": {},
+}
+
+// validFormats restricts EmailRecipient.Format. "html" emits only `html:`
+// in the rendered email_configs entry (Alertmanager generates an
+// equivalent text/plain alternative automatically but we keep the wire
+// behavior explicit). "plain" emits only `text:` so spartan mailboxes get
+// a stripped-down body. The keys mirror the `oneof=html plain` binding tag
+// on EmailRecipient.Format.
+var validFormats = map[string]struct{}{
+	"html":  {},
+	"plain": {},
+}
+
+// ChannelToggles is the per-layer enable/disable triplet. *bool keeps
+// "not set at this layer / inherit from above" distinguishable from
+// "explicitly false". The handler normalises any explicit `false` from
+// non-Owner layers to nil before persisting (additive contract: only
+// Owner can disable a channel globally).
+type ChannelToggles struct {
+	Email    *bool `json:"email"`    // nil = not set at this layer (inherit)
+	Webhook  *bool `json:"webhook"`  // nil = not set at this layer (inherit)
+	Telegram *bool `json:"telegram"` // nil = not set at this layer (inherit)
+}
+
+// EmailRecipient is a single email destination with its own routing scope.
+// Severities=[] means "all severities" — the recipient lands on the global
+// receiver in Alertmanager. A non-empty subset narrows the route to those
+// severities via an extra matcher (`severity="X"` or `severity=~"X|Y"`).
+// Language/Format pick the email template variant per recipient: one
+// recipient in the merged config can pin one language while another pins
+// a different one (we render one email_configs per recipient with
+// template overrides).
+type EmailRecipient struct {
+	Address    string   `json:"address" binding:"required,email,max=320"`
+	Severities []string `json:"severities" binding:"max=3,dive,oneof=critical warning info"`
+	Language   string   `json:"language,omitempty" binding:"omitempty,oneof=en it"`
+	Format     string   `json:"format,omitempty" binding:"omitempty,oneof=html plain"`
+}
+
+// WebhookRecipient is a generic outbound HTTP receiver with optional
+// severity narrowing. Name is purely descriptive (rendered into the
+// receiver name in Alertmanager YAML). URL is validated against a
+// denylist of private/loopback/metadata destinations at the handler.
+type WebhookRecipient struct {
+	Name       string   `json:"name" binding:"required,max=100"`
+	URL        string   `json:"url" binding:"required,url,max=2048"`
+	Severities []string `json:"severities" binding:"max=3,dive,oneof=critical warning info"`
+}
+
+// TelegramRecipient is a Telegram bot destination with optional severity
+// narrowing. BotToken is bearer-equivalent and never returned outside the
+// owning org (irrelevant in the current API since /alerts/config returns
+// only the caller's own layer, but the storage layer still treats it as
+// sensitive: encrypted-at-rest by the database, redacted from any future
+// admin-only inspection path).
+type TelegramRecipient struct {
+	BotToken   string   `json:"bot_token" binding:"required,max=256"`
+	ChatID     int64    `json:"chat_id" binding:"required"`
+	Severities []string `json:"severities" binding:"max=3,dive,oneof=critical warning info"`
+}
+
+// AlertingConfigLayer is the per-organization layer persisted in
+// alert_config_layers.config_json. It doubles as the internal type
+// produced by MergeForRender when the renderer builds the effective
+// per-tenant Mimir YAML — the merged result is conceptually "a layer
+// where each entry knows its own scope (severities[]) and per-recipient
+// rendering hints (language/format)".
+//
+// The API surface (POST/GET /alerts/config) is exactly this struct.
+// Nothing about the layered model — neither inherited ancestor recipients
+// nor the merged effective preview — ever leaves the owning org. The
+// server performs the merge at render time only.
+//
+// The binding tags cap the lists at 50 email, 20 webhook and 20 telegram
+// recipients.
+type AlertingConfigLayer struct {
+	Enabled            ChannelToggles      `json:"enabled"`
+	EmailRecipients    []EmailRecipient    `json:"email_recipients" binding:"max=50"`
+	WebhookRecipients  []WebhookRecipient  `json:"webhook_recipients" binding:"max=20"`
+	TelegramRecipients []TelegramRecipient `json:"telegram_recipients" binding:"max=20"`
+}
+
+// Validate runs stateless format/structure checks that must hold for every
+// write path into alert_config_layers. The handler also runs DNS-aware
+// webhook URL checks; this Validate is the storage-layer backstop that
+// guarantees that, regardless of where the layer originates (HTTP handler,
+// provisioning path, admin tool, future endpoint), the persisted bytes
+// satisfy the contract.
+//
+// It returns the first violation found, prefixed with the offending list
+// and index (e.g. `email_recipients[2]: ...`), or nil when the layer is
+// acceptable.
+//
+// NOTE(review): the list-size caps, the string length limits and the
+// `required` constraints on WebhookRecipient.Name and
+// TelegramRecipient.ChatID exist only as binding tags and are not
+// re-checked by this method — confirm whether the backstop should cover
+// them too.
+func (c *AlertingConfigLayer) Validate() error {
+	// Email recipients: address syntax first, then severity scope, then the
+	// optional template hints (an empty hint means "use the default").
+	for idx := range c.EmailRecipients {
+		rcpt := c.EmailRecipients[idx]
+		err := validateEmailFormat(rcpt.Address)
+		if err == nil {
+			err = validateSeverities(rcpt.Severities)
+		}
+		if err != nil {
+			return fmt.Errorf("email_recipients[%d]: %w", idx, err)
+		}
+		if _, known := validLanguages[rcpt.Language]; rcpt.Language != "" && !known {
+			return fmt.Errorf("email_recipients[%d]: unknown language %q", idx, rcpt.Language)
+		}
+		if _, known := validFormats[rcpt.Format]; rcpt.Format != "" && !known {
+			return fmt.Errorf("email_recipients[%d]: unknown format %q", idx, rcpt.Format)
+		}
+	}
+
+	// Webhook recipients: static URL checks only; the URL error also names
+	// the recipient so the operator can find it.
+	for idx := range c.WebhookRecipients {
+		hook := c.WebhookRecipients[idx]
+		if err := validateStaticWebhookURL(hook.URL); err != nil {
+			return fmt.Errorf("webhook_recipients[%d] %q: %w", idx, hook.Name, err)
+		}
+		if err := validateSeverities(hook.Severities); err != nil {
+			return fmt.Errorf("webhook_recipients[%d]: %w", idx, err)
+		}
+	}
+
+	// Telegram recipients: a whitespace-only token counts as empty.
+	for idx := range c.TelegramRecipients {
+		bot := c.TelegramRecipients[idx]
+		if strings.TrimSpace(bot.BotToken) == "" {
+			return fmt.Errorf("telegram_recipients[%d]: bot_token is empty", idx)
+		}
+		if err := validateSeverities(bot.Severities); err != nil {
+			return fmt.Errorf("telegram_recipients[%d]: %w", idx, err)
+		}
+	}
+	return nil
+}
+
+// validateSeverities reports the first entry of s that is not a member of
+// validSeverities. A nil or empty slice is accepted.
+func validateSeverities(s []string) error {
+	for i := 0; i < len(s); i++ {
+		if _, known := validSeverities[s[i]]; known {
+			continue
+		}
+		return fmt.Errorf("invalid severity %q", s[i])
+	}
+	return nil
+}
+
+// validateStaticWebhookURL runs every check that does NOT require name
+// resolution: scheme is http/https, no userinfo, host is non-empty and
+// well-formed (IP literal or canonical FQDN). Network-aware checks
+// (denylist resolution, IP private/loopback rejection of resolved DNS
+// answers) are run by the handler.
+//
+// NOTE(review): the function body only verifies that the host is
+// non-empty; it does not check that the host is a well-formed IP literal
+// or FQDN — confirm whether that check is meant to live here or in the
+// handler.
+func validateStaticWebhookURL(raw string) error {
+	u, err := url.Parse(raw)
+	if err != nil {
+		return fmt.Errorf("invalid webhook url: %w", err)
+	}
+	// Compare the scheme case-insensitively, but echo it as written in the error.
+	scheme := strings.ToLower(u.Scheme)
+	if scheme != "http" && scheme != "https" {
+		return fmt.Errorf("webhook url must use http or https, got %q", u.Scheme)
+	}
+	// Embedded credentials (user[:password]@host) are never accepted.
+	if u.User != nil {
+		return fmt.Errorf("webhook url must not contain credentials")
+	}
+	// Hostname() strips the port, so "http://:8080" is rejected here too.
+	if u.Hostname() == "" {
+		return fmt.Errorf("webhook url is missing a host")
+	}
+	return nil
+}
+
+// validateEmailFormat checks that s, after trimming surrounding whitespace,
+// is a single bare email address (local@domain).
+//
+// mail.ParseAddress on its own also accepts RFC 5322 forms that carry a
+// display name, angle brackets or a trailing comment, e.g.
+// `Ops <ops@example.com>`. Those are rejected here by requiring the parsed
+// address to be identical to the trimmed input. As a side effect, quoted
+// local parts (`"a b"@example.com`) are rejected as well, because the
+// parser returns them unquoted.
+//
+// It returns nil when the address is acceptable and a descriptive error
+// otherwise.
+func validateEmailFormat(s string) error {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		return fmt.Errorf("email is empty")
+	}
+	addr, err := mail.ParseAddress(s)
+	if err != nil {
+		return fmt.Errorf("invalid email %q: %w", s, err)
+	}
+	if addr.Address != s {
+		return fmt.Errorf("invalid email %q: must be a bare address without display name, angle brackets or comments", s)
+	}
+	return nil
 }
 
 // AlertStatus represents the status metadata for an active alert from Alertmanager.
diff --git a/backend/openapi.yaml b/backend/openapi.yaml
index 18a06b43..91bcb48b 100644
--- a/backend/openapi.yaml
+++ b/backend/openapi.yaml
@@ -142,19 +142,61 @@ components:
         data:
           $ref: '#/components/schemas/ErrorData'
 
-    SuccessResponse:
+    ChannelToggles:
       type: object
+      description: |
+        Per-layer enable/disable for each notification channel. Each value
+        is tri-state:
+          * `true` — explicitly enabled at this layer
+          * `false` — explicitly disabled (Owner only; non-Owner false is
+            normalised to null on save)
+          * `null` — no opinion at this layer; effective state inherits
+            from the merge of any ancestor layer that took a
+            position. If no layer in the chain enables a channel,
+            the channel stays off.
properties: - code: - type: integer - example: 200 - message: - type: string - data: - type: object + email: + type: boolean nullable: true - - WebhookReceiver: + webhook: + type: boolean + nullable: true + telegram: + type: boolean + nullable: true + EmailRecipient: + type: object + required: + - address + properties: + address: + type: string + format: email + maxLength: 320 + severities: + type: array + maxItems: 3 + items: + type: string + enum: [critical, warning, info] + description: | + Severity scope for this recipient. Empty (or omitted) means + "all severities" — the recipient lands on every per-severity + receiver. A non-empty subset narrows delivery to those severities. + language: + type: string + enum: [en, it] + description: | + Rendering language for this recipient's email body and subject. + Defaults to `en` when omitted. + format: + type: string + enum: [html, plain] + description: | + Body format preference. `html` (default) emits multipart with + an html primary body and text alternative; `plain` emits only + a text body. + WebhookRecipient: type: object required: - name @@ -162,12 +204,25 @@ components: properties: name: type: string - description: Descriptive name for this webhook receiver + maxLength: 100 + description: Descriptive label for the webhook target (UI only). url: type: string format: uri - description: Webhook URL - TelegramReceiver: + maxLength: 2048 + description: | + Webhook URL. Validation rejects non-public destinations (loopback, + RFC1918, RFC6598 CGNAT, link-local, multicast, cloud metadata) and + requires a canonical hostname (containing at least one letter) or + a valid IP literal. Only http/https schemes are accepted; URLs + cannot embed userinfo. 
+ severities: + type: array + maxItems: 3 + items: + type: string + enum: [critical, warning, info] + TelegramRecipient: type: object required: - bot_token @@ -175,85 +230,18 @@ components: properties: bot_token: type: string - description: Telegram bot token obtained from @BotFather + maxLength: 256 example: "123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11" chat_id: type: integer format: int64 - description: Numeric Telegram chat ID (user, group, or channel) example: -1001234567890 - SeverityOverride: - type: object - required: - - severity - properties: - severity: - type: string - enum: [critical, warning, info] - mail_enabled: - type: boolean - description: Override global mail_enabled for this severity - nullable: true - webhook_enabled: - type: boolean - description: Override global webhook_enabled for this severity - nullable: true - telegram_enabled: - type: boolean - description: Override global telegram_enabled for this severity - nullable: true - mail_addresses: - type: array - description: Email addresses for this severity. Inherits global addresses if empty. - items: - type: string - format: email - webhook_receivers: - type: array - description: Webhook receivers for this severity. Inherits global receivers if empty. - items: - $ref: '#/components/schemas/WebhookReceiver' - telegram_receivers: - type: array - description: Telegram receivers for this severity. Inherits global receivers if empty. 
- items: - $ref: '#/components/schemas/TelegramReceiver' - SystemOverride: - type: object - required: - - system_key - properties: - system_key: - type: string - description: The system_key label value to match - mail_enabled: - type: boolean - description: Override global mail_enabled for this system - nullable: true - webhook_enabled: - type: boolean - description: Override global webhook_enabled for this system - nullable: true - telegram_enabled: - type: boolean - description: Override global telegram_enabled for this system - nullable: true - mail_addresses: + severities: type: array - description: Email addresses for this system. Inherits global addresses if empty. + maxItems: 3 items: type: string - format: email - webhook_receivers: - type: array - description: Webhook receivers for this system. Inherits global receivers if empty. - items: - $ref: '#/components/schemas/WebhookReceiver' - telegram_receivers: - type: array - description: Telegram receivers for this system. Inherits global receivers if empty. - items: - $ref: '#/components/schemas/TelegramReceiver' + enum: [critical, warning, info] AlertmanagerSilenceStatus: type: object properties: @@ -311,80 +299,40 @@ components: status: $ref: '#/components/schemas/AlertmanagerSilenceStatus' - AlertingConfig: + AlertingConfigLayer: type: object description: | - Generic alerting configuration. Mimir YAML is the source of truth; this JSON is derived from it. - Priority order (highest first): per-system overrides > per-severity overrides > global settings. - A built-in history webhook (ALERTING_HISTORY_WEBHOOK_URL) is always active and cannot be disabled. + The caller's alerting configuration. This is what POST /alerts/config + accepts and what GET /alerts/config returns; the merged effective + config that backs Mimir is computed server-side and never exposed. 
+ + Each recipient carries its own `severities[]` (empty = all + severities); email recipients additionally carry per-recipient + `language` and `format`. Channel toggles under `enabled` are + tri-state: null = no opinion at this layer (inherit), true = + enabled, false = disabled (Owner only — non-Owner false is + normalised to null on save). + + Body size: the request body is capped at 1 MiB on POST. Oversized + requests are rejected with HTTP 413 before binding. properties: - mail_enabled: - type: boolean - description: Globally enable or disable email notifications - default: false - webhook_enabled: - type: boolean - description: Globally enable or disable custom webhook notifications - default: false - telegram_enabled: - type: boolean - description: Globally enable or disable Telegram notifications - default: false - mail_addresses: - type: array - description: Global default email recipients - items: - type: string - format: email - webhook_receivers: - type: array - description: Global default webhook receivers - items: - $ref: '#/components/schemas/WebhookReceiver' - telegram_receivers: + enabled: + $ref: '#/components/schemas/ChannelToggles' + email_recipients: type: array - description: Global Telegram bot receivers + maxItems: 50 items: - $ref: '#/components/schemas/TelegramReceiver' - severities: + $ref: '#/components/schemas/EmailRecipient' + webhook_recipients: type: array - description: Per-severity overrides. Empty list = use global settings for all severities. + maxItems: 20 items: - $ref: '#/components/schemas/SeverityOverride' - systems: + $ref: '#/components/schemas/WebhookRecipient' + telegram_recipients: type: array - description: Per-system_key overrides (highest priority). Empty list = no per-system overrides. + maxItems: 20 items: - $ref: '#/components/schemas/SystemOverride' - email_template_lang: - type: string - enum: [en, it] - default: en - description: Language for email and Telegram notification templates. 
"en" (English, default) or "it" (Italian). - example: - mail_enabled: true - webhook_enabled: true - telegram_enabled: true - mail_addresses: ["global@example.com"] - webhook_receivers: - - name: "slack" - url: "https://hooks.slack.com/services/T00/B00/XXX" - telegram_receivers: - - bot_token: "123456:ABC-DEF1234ghIkl-zyx57W2v1u123ew11" - chat_id: -1001234567890 - email_template_lang: "en" - severities: - - severity: "critical" - mail_enabled: true - webhook_enabled: true - mail_addresses: ["oncall@example.com"] - - severity: "warning" - mail_enabled: false - webhook_enabled: false - systems: - - system_key: "ns8-prod-01" - mail_enabled: false - webhook_enabled: true + $ref: '#/components/schemas/TelegramRecipient' ErrorData: type: object @@ -549,6 +497,10 @@ components: format: int64 description: Auto-incrementing record ID example: 1 + organization_id: + type: string + description: Tenant the alert belongs to (logto_id of the owning org) + example: "m4m3mdjdiizs" system_key: type: string description: System key extracted from alert labels @@ -9001,6 +8953,65 @@ paths: type: array items: $ref: '#/components/schemas/ActiveAlert' + examples: + ActiveAndSuppressedOnSystem: + summary: Two firing alerts on the same system, one silenced + description: | + The endpoint always includes silenced alerts so the UI can + show the muted state in the system detail view. `state` is + `"suppressed"` when at least one active silence matches the + alert; the matching silence IDs are listed in `silencedBy`. + value: + code: 200 + message: "alerts retrieved successfully" + data: + alerts: + - fingerprint: "0a9d04bb6eed523f" + labels: + alertname: "DiskFilling" + severity: "warning" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + annotations: + summary: "/var is 92% full" + description: "Disk usage exceeded warning threshold." 
+ status: + state: "suppressed" + silencedBy: + - "d9f91c6e-1b33-484e-befa-bfb41020e178" + inhibitedBy: [] + startsAt: "2026-05-12T08:14:00Z" + endsAt: "2026-05-12T08:44:00Z" + system: + id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + name: "test-sys" + type: "ns8" + - fingerprint: "11a9302b0fa6526e" + labels: + alertname: "HighCPU" + severity: "critical" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + annotations: + summary: "CPU usage 98%" + description: "Sustained high CPU." + status: + state: "active" + silencedBy: [] + inhibitedBy: [] + startsAt: "2026-05-12T08:20:00Z" + endsAt: "2026-05-12T08:50:00Z" + system: + id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + name: "test-sys" + type: "ns8" + NoActiveAlerts: + summary: System has no firing alerts + value: + code: 200 + message: "alerts retrieved successfully" + data: + alerts: [] '401': $ref: '#/components/responses/Unauthorized' '403': @@ -9050,6 +9061,44 @@ paths: type: array items: $ref: '#/components/schemas/AlertmanagerSilence' + examples: + ActiveSilence: + summary: One active silence for the system + description: | + The silence matches on `system_key` (server-injected) plus the + labels that uniquely identified the alert at silence creation + time. `silencedBy` on the active alert references this same + `id`. 
+ value: + code: 200 + message: "silences retrieved successfully" + data: + silences: + - id: "d9f91c6e-1b33-484e-befa-bfb41020e178" + matchers: + - name: "system_key" + value: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + isRegex: false + - name: "alertname" + value: "DiskFilling" + isRegex: false + - name: "severity" + value: "warning" + isRegex: false + startsAt: "2026-05-12T08:16:36Z" + endsAt: "2026-05-12T09:16:36Z" + updatedAt: "2026-05-12T08:16:36Z" + createdBy: "R1C1 Admin " + comment: "silenced during maintenance window" + status: + state: "active" + NoSilences: + summary: No silences for the system + value: + code: 200 + message: "silences retrieved successfully" + data: + silences: [] '401': $ref: '#/components/responses/Unauthorized' '403': @@ -9106,6 +9155,27 @@ paths: format: date-time description: Optional explicit end time (RFC3339). Takes precedence over duration_minutes. example: "2024-01-01T02:00:00Z" + examples: + ExplicitEndAt: + summary: Silence until a specific date/time + description: | + When `end_at` is set, the silence expires at that moment + regardless of `duration_minutes`. The backend resolves the + alert by `fingerprint`, attaches the system's authoritative + `system_key` to the matchers, and creates the silence. + value: + fingerprint: "0a9d04bb6eed523f" + comment: "silenced during maintenance window" + end_at: "2026-05-12T09:16:36Z" + DurationBased: + summary: Silence for the next 60 minutes + description: | + Without `end_at`, `duration_minutes` applies. If both are + omitted, the silence defaults to 60 minutes from creation. 
+ value: + fingerprint: "0a9d04bb6eed523f" + comment: "investigating" + duration_minutes: 60 responses: '200': description: Alert silence created successfully @@ -9126,6 +9196,19 @@ paths: silence_id: type: string example: "4e6f0c30-c383-4e22-9443-0d7b6a8bd40b" + examples: + Created: + summary: Silence created + description: | + `silence_id` is the Alertmanager-assigned UUID; use it to + look up, update, or delete the silence later. The + corresponding `silenced` event is appended to the alert's + activity timeline (`GET /alerts/{fingerprint}/activity`). + value: + code: 200 + message: "alert silenced successfully" + data: + silence_id: "d9f91c6e-1b33-484e-befa-bfb41020e178" '400': $ref: '#/components/responses/BadRequest' '401': @@ -9181,6 +9264,32 @@ paths: properties: silence: $ref: '#/components/schemas/AlertmanagerSilence' + examples: + ActiveSilence: + summary: Silence found and active + value: + code: 200 + message: "silence retrieved successfully" + data: + silence: + id: "d9f91c6e-1b33-484e-befa-bfb41020e178" + matchers: + - name: "system_key" + value: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + isRegex: false + - name: "alertname" + value: "DiskFilling" + isRegex: false + - name: "severity" + value: "warning" + isRegex: false + startsAt: "2026-05-12T08:16:36Z" + endsAt: "2026-05-12T09:16:36Z" + updatedAt: "2026-05-12T08:16:36Z" + createdBy: "R1C1 Admin " + comment: "silenced during maintenance window" + status: + state: "active" '401': $ref: '#/components/responses/Unauthorized' '403': @@ -9231,6 +9340,17 @@ paths: format: date-time description: New end time (RFC3339). Must be in the future. example: "2024-01-01T04:00:00Z" + examples: + ExtendEndTime: + summary: Extend the silence by 3 more hours + description: | + Alertmanager treats an update as "create a new silence with + the same matchers and start_at, then drop the old one", so + the response carries a new `silence_id`. The activity + timeline records this as a `silence_updated` event. 
+ value: + comment: "extended for maintenance window" + end_at: "2026-05-12T12:16:36Z" responses: '200': description: Silence updated successfully @@ -9251,6 +9371,14 @@ paths: silence_id: type: string example: "4e6f0c30-c383-4e22-9443-0d7b6a8bd40b" + examples: + Updated: + summary: New silence id after update + value: + code: 200 + message: "silence updated successfully" + data: + silence_id: "f1e1c2a4-7e57-4b1a-aaa0-2b96c8b5a3aa" '400': $ref: '#/components/responses/BadRequest' '401': @@ -9299,6 +9427,19 @@ paths: message: type: string example: "silence disabled successfully" + examples: + Disabled: + summary: Silence removed + description: | + The silence is expired (not hard-deleted) so it disappears + from `GET /silences` but stays referenced in the alert's + activity timeline as an `unsilenced` event. On the wire + the response also carries `"data": null`; the example + omits it to match the declared schema which only includes + `code` and `message`. + value: + code: 200 + message: "silence disabled successfully" '400': $ref: '#/components/responses/BadRequest' '401': @@ -9409,6 +9550,78 @@ paths: $ref: '#/components/schemas/AlertHistoryRecord' pagination: $ref: '#/components/schemas/Pagination' + examples: + ResolvedAlertsForSystem: + summary: Two resolved alerts on this system + description: | + Records are scoped to the path system's `system_key`. The + response is identical in shape to `GET /alerts/history` but + filtered to one system without needing to pass it as a + query param. 
+ value: + code: 200 + message: "alert history retrieved successfully" + data: + alerts: + - id: 55 + organization_id: "m4m3mdjdiizs" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + alertname: "PlainBodyTest" + severity: "critical" + status: "resolved" + fingerprint: "11a9302b0fa6526e" + starts_at: "2026-05-12T07:46:50Z" + ends_at: "2026-05-12T07:51:50Z" + summary: "plain body check" + labels: + alertname: "PlainBodyTest" + severity: "critical" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + annotations: + summary: "plain body check" + description: "resolved" + receiver: "severity-critical-receiver" + created_at: "2026-05-12T07:52:00Z" + - id: 54 + organization_id: "m4m3mdjdiizs" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + alertname: "HistFlowTest" + severity: "critical" + status: "resolved" + fingerprint: "9c1a23e87f4d0a11" + starts_at: "2026-05-12T08:01:07Z" + ends_at: "2026-05-12T08:06:06Z" + summary: "history flow check" + labels: + alertname: "HistFlowTest" + severity: "critical" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + annotations: + summary: "history flow check" + description: "resolved" + receiver: "severity-critical-receiver" + created_at: "2026-05-12T08:11:30Z" + pagination: + page: 1 + page_size: 50 + total_count: 2 + total_pages: 1 + has_next: false + has_prev: false + EmptyHistory: + summary: No history rows for this system yet + value: + code: 200 + message: "alert history retrieved successfully" + data: + alerts: [] + pagination: + page: 1 + page_size: 50 + total_count: 0 + total_pages: 0 + has_next: false + has_prev: false '400': $ref: '#/components/responses/BadRequest' '401': @@ -11633,6 +11846,79 @@ paths: $ref: '#/components/schemas/AlertHistoryRecord' pagination: $ref: '#/components/schemas/Pagination' + examples: + TwoResolvedAlerts: + summary: Two resolved alerts on the same system + description: | + Result for a Customer caller. 
Same `system_key` appears in both + rows because they were fired against the same NS8 host. Each + row is a discrete event (firing → resolved) captured by the + history webhook at dispatch time; `created_at` records when + the row landed in `alert_history`. + value: + code: 200 + message: alert history retrieved successfully + data: + alerts: + - id: 55 + organization_id: "m4m3mdjdiizs" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + alertname: "PlainBodyTest" + severity: "critical" + status: "resolved" + fingerprint: "11a9302b0fa6526e" + starts_at: "2026-05-12T07:46:50Z" + ends_at: "2026-05-12T07:51:50Z" + summary: "plain body check" + labels: + alertname: "PlainBodyTest" + severity: "critical" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + annotations: + summary: "plain body check" + description: "checking html:'' fix" + receiver: "severity-critical-receiver" + created_at: "2026-05-12T07:52:00Z" + - id: 54 + organization_id: "m4m3mdjdiizs" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + alertname: "HistFlowTest" + severity: "critical" + status: "resolved" + fingerprint: "9c1a23e87f4d0a11" + starts_at: "2026-05-12T08:01:07Z" + ends_at: "2026-05-12T08:06:06Z" + summary: "history flow check" + labels: + alertname: "HistFlowTest" + severity: "critical" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + annotations: + summary: "history flow check" + description: "resolved" + receiver: "severity-critical-receiver" + created_at: "2026-05-12T08:11:30Z" + pagination: + page: 1 + page_size: 50 + total_count: 2 + total_pages: 1 + has_next: false + has_prev: false + EmptyHistory: + summary: No history rows match + value: + code: 200 + message: alert history retrieved successfully + data: + alerts: [] + pagination: + page: 1 + page_size: 50 + total_count: 0 + total_pages: 0 + has_next: false + has_prev: false '400': $ref: '#/components/responses/BadRequest' '401': @@ -11708,6 +11994,51 @@ paths: type: array items: 
$ref: '#/components/schemas/AlertActivityEntry' + examples: + SilenceCreatedThenRemoved: + summary: A silence was created and later removed + description: | + Events are most-recent first. Both rows share the same + `silence_id` because they describe the same silence's + lifecycle. `actor_user_id` is the logto_id of the operator + who performed the action; `details` carries the silence + metadata captured at action time (comment, end_at, etc.). + value: + code: 200 + message: alert activity retrieved successfully + data: + events: + - id: 5 + organization_id: "m4m3mdjdiizs" + fingerprint: "0a9d04bb6eed523f" + action: "unsilenced" + actor_user_id: "c5gpnoo2do48" + actor_name: "R1C1 Admin" + silence_id: "d9f91c6e-1b33-484e-befa-bfb41020e178" + details: {} + created_at: "2026-05-12T08:20:38.410596Z" + - id: 4 + organization_id: "m4m3mdjdiizs" + fingerprint: "0a9d04bb6eed523f" + action: "silenced" + actor_user_id: "c5gpnoo2do48" + actor_name: "R1C1 Admin" + silence_id: "d9f91c6e-1b33-484e-befa-bfb41020e178" + details: + comment: "silenced during maintenance window" + end_at: "2026-05-12T09:16:36Z" + duration_minutes: 0 + created_at: "2026-05-12T08:16:36.661832Z" + EmptyTimeline: + summary: No silence events yet + description: | + The alert has fired but has never been silenced. The events + array is empty (not `null`). + value: + code: 200 + message: alert activity retrieved successfully + data: + events: [] '400': $ref: '#/components/responses/BadRequest' '401': @@ -11724,63 +12055,222 @@ paths: operationId: configureAlerts tags: - Backend - Alerts - summary: Configure alert routing + summary: Save the caller's alerting layer description: | - Accepts a generic AlertingConfig JSON, renders it into a valid Alertmanager YAML configuration, - and pushes it to Mimir. SMTP settings are injected server-side. - A built-in history webhook (ALERTING_HISTORY_WEBHOOK_URL) is always included and cannot be disabled. 
- Priority order: per-system overrides > per-severity overrides > global settings. - Address fallback: if an override's address list is empty, global addresses are used. + Saves the CALLER's alerting configuration layer (one row per + organization in alert_config_layers). The body is an + `AlertingConfigLayer`: three channel toggles plus three recipient + lists. Each recipient carries its own `severities[]`; email + recipients additionally carry `language` and `format`. + + After save, the effective per-tenant Mimir YAML is recomputed + server-side (merge of all layers walking up to the Owner) and + pushed to every tenant in the caller's hierarchy with bounded + concurrency. Per-tenant push failures are returned in `warnings[]`; + the caller's layer is saved regardless of push outcome (Mimir can + be reconciled by saving again). + + Additive-only contract: descendants can ADD recipients but cannot + disable channels enabled by ancestors. The server normalises any + explicit `false` in `enabled.{email,webhook,telegram}` from + non-Owner layers to null on storage. + + Save+propagate is serialised per-organization (in-process mutex) to + prevent two concurrent saves from racing at the Mimir push step. + Body is capped at 1 MiB; oversized payloads are rejected with 413. + + Requires `manage:alerts` permission. security: - BearerAuth: [] - parameters: - - name: organization_id - in: query - description: Target organization ID. Required for Owner, Distributor, and Reseller roles. Customer role uses their own organization automatically. 
- schema: - type: string requestBody: required: true content: application/json: schema: - $ref: '#/components/schemas/AlertingConfig' - responses: - '200': - description: Alerting configuration updated - content: - application/json: - schema: - $ref: '#/components/schemas/SuccessResponse' + $ref: '#/components/schemas/AlertingConfigLayer' + examples: + OwnerGlobalBaseline: + summary: Owner — global baseline + description: | + Owner enables email + webhook globally, sets a NOC + recipient on all severities in Italian HTML, plus a SIEM + webhook on every severity. Every descendant inherits. + value: + enabled: { email: true, webhook: true, telegram: false } + email_recipients: + - address: "noc@msp.example" + severities: [] + language: "it" + format: "html" + webhook_recipients: + - name: "central-siem" + url: "https://siem.example/api/alerts" + severities: [] + telegram_recipients: [] + DescendantAddRecipient: + summary: Reseller — additively add a recipient + description: | + Reseller does NOT touch channel toggles (null = "no + opinion"); it just adds a local NOC mailbox in English + for critical+warning. Merged with Owner's recipients. + value: + enabled: { email: null, webhook: null, telegram: null } + email_recipients: + - address: "noc@reseller.example" + severities: ["critical", "warning"] + language: "en" + format: "html" + webhook_recipients: [] + telegram_recipients: [] + CustomerMixedFormatAndLang: + summary: Customer — mixed languages and formats per recipient + description: | + Different recipients can request different bodies. The + on-call inbox wants plain text (alerts piped into a + ticketing tool); the manager wants HTML in Italian. 
+ value: + enabled: { email: null, webhook: null, telegram: null } + email_recipients: + - address: "oncall@customer.example" + severities: ["critical"] + language: "en" + format: "plain" + - address: "manager@customer.example" + severities: [] + language: "it" + format: "html" + webhook_recipients: [] + telegram_recipients: [] + CustomerWebhookCriticalOnly: + summary: Customer — Slack webhook only for `critical` + description: | + Customer adds a Slack webhook scoped to critical. The + rendered Alertmanager route puts this webhook only on + the critical receiver. + value: + enabled: { email: null, webhook: null, telegram: null } + email_recipients: [] + webhook_recipients: + - name: "ops-slack" + url: "https://hooks.slack.com/services/T000/B000/XXX" + severities: ["critical"] + telegram_recipients: [] + TelegramAllSeverities: + summary: Customer — Telegram channel on every severity + description: | + Single Telegram bot pushing to a channel for every + severity (severities=[]). Telegram messages are + currently always rendered in English. + value: + enabled: { email: null, webhook: null, telegram: true } + email_recipients: [] + webhook_recipients: [] + telegram_recipients: + - bot_token: "123456:ABC-DEF1234ghIkl" + chat_id: -1001234567890 + severities: [] + InheritPurely: + summary: Descendant — explicit "inherit everything" + description: | + Saving an empty layer is meaningful: it just records + audit metadata (who/when) without contributing + recipients or toggles. 
+ value: + enabled: { email: null, webhook: null, telegram: null } + email_recipients: [] + webhook_recipients: [] + telegram_recipients: [] + responses: + '200': + description: Layer saved (and propagation attempted) + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: alerting configuration updated successfully + data: + type: object + properties: + affected_tenants: + type: integer + description: Number of tenants in caller's hierarchy whose effective config was recomputed + propagated_to: + type: integer + description: Of `affected_tenants`, how many were successfully pushed to Mimir + warnings: + type: array + description: | + Per-tenant push errors. Always present; empty when every push succeeded. + Each entry: `org <org_id>: <error>`. + items: + type: string '400': $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' + '413': + description: Request body exceeds the configured maximum (1 MiB). + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 413 + message: + type: string + example: request body exceeds the configured maximum '500': $ref: '#/components/responses/InternalServerError' delete: operationId: disableAlerts tags: - Backend - Alerts - summary: Disable all alerts - description: Replaces the Alertmanager config with a blackhole-only configuration, silencing all alerts for the organization. The built-in history webhook remains active. + summary: Remove the caller's alerting layer + description: | + Removes the CALLER's layer from alert_config_layers. The effective + config of all descendant tenants is recomputed without the caller's + contribution and re-pushed to Mimir; ancestor layers are preserved. + To completely silence a tenant, every layer in its chain must drop + its contribution. + + Requires `manage:alerts` permission.
security: - BearerAuth: [] - parameters: - - name: organization_id - in: query - description: Target organization ID. Required for Owner, Distributor, and Reseller roles. - schema: - type: string responses: '200': - description: All alerts disabled + description: Layer removed (and propagation attempted) content: application/json: schema: - $ref: '#/components/schemas/SuccessResponse' + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: alerting layer removed successfully + data: + type: object + properties: + affected_tenants: + type: integer + propagated_to: + type: integer + warnings: + type: array + items: + type: string '400': $ref: '#/components/responses/BadRequest' '401': @@ -11793,27 +12283,24 @@ paths: operationId: getAlertingConfig tags: - Backend - Alerts - summary: Get current alerting configuration + summary: Get the caller's alerting layer description: | - Retrieves the current alerting configuration from Mimir (source of truth). - By default returns structured JSON (AlertingConfig). Use ?format=yaml to get the raw redacted YAML. + Returns the CALLER's own alerting configuration layer. No inherited + ancestor layers, no merged effective view: every organization sees + only its own configuration. The server-side merge that backs the + Mimir YAML stays inside the backend. + + When the caller has never saved a layer the response body contains + an empty layer (toggles all null, recipient lists empty) and the + two audit fields set to null. The UI uses that state to render a + first-save form. + + Requires `read:alerts` permission. security: - BearerAuth: [] - parameters: - - name: organization_id - in: query - description: Target organization ID. Required for Owner, Distributor, and Reseller roles. - schema: - type: string - - name: format - in: query - description: Response format. Omit for JSON (default) or set to "yaml" for raw YAML. 
- schema: - type: string - enum: [yaml] responses: '200': - description: Current alerting configuration + description: Caller's layer content: application/json: schema: @@ -11824,23 +12311,52 @@ paths: example: 200 message: type: string - example: alerting configuration retrieved successfully + example: alerting layer retrieved successfully data: - type: object - properties: - config: - oneOf: - - $ref: '#/components/schemas/AlertingConfig' - - type: string - description: Raw redacted YAML (when ?format=yaml) - '400': - $ref: '#/components/responses/BadRequest' + allOf: + - $ref: '#/components/schemas/AlertingConfigLayer' + - type: object + properties: + updated_by_name: + type: string + nullable: true + updated_at: + type: string + format: date-time + nullable: true + examples: + Configured: + summary: Caller has saved a layer + value: + code: 200 + message: "alerting layer retrieved successfully" + data: + enabled: { email: true, webhook: null, telegram: null } + email_recipients: + - address: "noc@reseller.example" + severities: ["critical"] + language: "it" + format: "html" + webhook_recipients: [] + telegram_recipients: [] + updated_by_name: "Reseller Admin" + updated_at: "2026-05-09T10:14:00Z" + FirstTime: + summary: Caller has not saved a layer yet + value: + code: 200 + message: "alerting layer retrieved successfully" + data: + enabled: { email: null, webhook: null, telegram: null } + email_recipients: [] + webhook_recipients: [] + telegram_recipients: [] + updated_by_name: null + updated_at: null '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - '500': - $ref: '#/components/responses/InternalServerError' /alerts: get: @@ -11989,6 +12505,67 @@ paths: a string `org : `. items: type: string + examples: + ActiveAlertWithEnrichment: + summary: One active warning across the caller's hierarchy + description: | + A single warning alert returned with local-DB system enrichment. 
+ `state="active"` means Mimir has not been told to silence it; an + actively-muted alert would have `state="suppressed"` and a + non-empty `silencedBy`. + value: + code: 200 + message: alerts retrieved successfully + data: + alerts: + - fingerprint: "0a9d04bb6eed523f" + labels: + alertname: "DiskFilling" + severity: "warning" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + annotations: + summary: "/var is 92% full on test-sys" + description: "Disk usage exceeded the warning threshold." + status: + state: "active" + silencedBy: [] + inhibitedBy: [] + startsAt: "2026-05-12T08:14:00Z" + endsAt: "2026-05-12T08:44:00Z" + system: + id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + name: "test-sys" + type: "ns8" + pagination: + page: 1 + page_size: 50 + total_count: 1 + total_pages: 1 + has_next: false + has_prev: false + warnings: [] + PartialFanoutWarning: + summary: One tenant timed out during fan-out + description: | + When `organization_id` is omitted (or `include=descendants` is + used), the request fans out to every tenant in scope. A single + slow Mimir does not fail the whole request — the rest of the + results are returned and the failing tenant lands in `warnings`. + value: + code: 200 + message: alerts retrieved successfully + data: + alerts: [] + pagination: + page: 1 + page_size: 50 + total_count: 0 + total_pages: 0 + has_next: false + has_prev: false + warnings: + - "org pt8gqs6y5wpr: context deadline exceeded" '400': $ref: '#/components/responses/BadRequest' '401': diff --git a/backend/services/alerting/effective.go b/backend/services/alerting/effective.go new file mode 100644 index 00000000..cc8b36af --- /dev/null +++ b/backend/services/alerting/effective.go @@ -0,0 +1,157 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. 
SPDX-License-Identifier: AGPL-3.0-or-later
*/

package alerting

import (
	"context"
	"database/sql"
	"errors"
	"fmt"

	"github.com/nethesis/my/backend/configuration"
	"github.com/nethesis/my/backend/database"
	"github.com/nethesis/my/backend/entities"
	"github.com/nethesis/my/backend/logger"
	"github.com/nethesis/my/backend/models"
)

// ErrChainTooDeep is returned by ResolveAncestorChain when an org's parent
// chain exceeds the maximum hop count without reaching the Owner. Callers
// treat this as a hard error rather than continuing with a possibly
// truncated chain — silently dropping upstream layers can underspecify the
// effective config (recipients missing, channels accidentally disabled).
var ErrChainTooDeep = fmt.Errorf("ancestor chain exceeds max depth")

// MaxChainDepth caps how deep ResolveAncestorChain will walk. The
// application's hierarchy is fixed at 4 levels (Owner -> Distributor ->
// Reseller -> Customer), so 8 leaves comfortable margin while still
// catching pathological data (cycles, runaway parent chains).
const MaxChainDepth = 8

// ResolveAncestorChain returns the list of org IDs from the Owner (top) down
// to the given tenant (inclusive). The chain is built by walking the
// `custom_data->>'createdBy'` field of distributors/resellers/customers up
// to the org that has no parent recorded (the Owner). The Owner's org id is
// included as the first element so the merge picks up its layer when present.
//
// Cycle protection: capped at MaxChainDepth hops. If a chain would exceed
// the cap (cycle or genuinely too deep), returns ErrChainTooDeep so the
// caller can fail closed rather than render an incomplete config.
+func ResolveAncestorChain(tenantOrgID string) ([]string, error) { + if tenantOrgID == "" { + return nil, fmt.Errorf("tenantOrgID is required") + } + chain := []string{tenantOrgID} + current := tenantOrgID + for i := 0; i < MaxChainDepth; i++ { + parent, err := lookupCreatedBy(current) + if err != nil { + return nil, fmt.Errorf("walk hierarchy at %s: %w", current, err) + } + if parent == "" { + return chain, nil + } + // Detect a cycle defensively: same id seen twice would loop forever. + for _, seen := range chain { + if seen == parent { + return chain, nil + } + } + chain = append([]string{parent}, chain...) + current = parent + } + // MaxChainDepth exhausted without hitting an Owner (no-parent) row. + // Either a cycle slipped through the in-loop check or the hierarchy is + // deeper than the model supports. Either way, fail closed. + return nil, fmt.Errorf("%w (tenant=%s, depth=%d)", ErrChainTooDeep, tenantOrgID, MaxChainDepth) +} + +// lookupCreatedBy returns the createdBy field of the org matching orgID +// across the three org tables, or "" when not found (which is how we +// recognize the Owner — its org id is referenced in createdBy of others +// but doesn't have a row in any of the three tables). +func lookupCreatedBy(orgID string) (string, error) { + row := database.DB.QueryRow( + `SELECT custom_data->>'createdBy' FROM distributors WHERE logto_id = $1 AND deleted_at IS NULL + UNION ALL + SELECT custom_data->>'createdBy' FROM resellers WHERE logto_id = $1 AND deleted_at IS NULL + UNION ALL + SELECT custom_data->>'createdBy' FROM customers WHERE logto_id = $1 AND deleted_at IS NULL + LIMIT 1`, + orgID, + ) + var parent *string + err := row.Scan(&parent) + if err != nil { + // sql.ErrNoRows means: orgID is not in any of the three tables. + // Treat this as "no parent" (likely the Owner). Other errors bubble. 
+ if err.Error() == "sql: no rows in result set" { + return "", nil + } + return "", err + } + if parent == nil { + return "", nil + } + return *parent, nil +} + +// computeEffectiveLayer is the package-private entry point that walks the +// tenant's ancestor chain, fetches every layer in a single round-trip, and +// merges them in order from Owner to tenant. Empty layers (orgs with no row +// in alert_config_layers) contribute nothing but don't break the chain. +// +// Package-private intentionally: the merged view never leaves the backend. +// Only RenderAndPushEffective uses it, to drive the Mimir YAML push. +func computeEffectiveLayer(tenantOrgID string) (models.AlertingConfigLayer, error) { + chain, err := ResolveAncestorChain(tenantOrgID) + if err != nil { + return models.AlertingConfigLayer{}, err + } + repo := entities.NewLocalAlertConfigLayersRepository() + layersByOrg, err := repo.GetByOrgIDs(chain) + if err != nil { + return models.AlertingConfigLayer{}, err + } + + ordered := make([]models.AlertingConfigLayer, 0, len(chain)) + for _, oid := range chain { + rec, ok := layersByOrg[oid] + if !ok { + continue + } + ordered = append(ordered, rec.Config) + } + + return MergeLayers(ordered), nil +} + +// RenderAndPushEffective re-computes and pushes the effective Mimir +// alertmanager config for one tenant. Used by the propagation path: when +// any layer in a tenant's chain is saved, RenderAndPushEffective is invoked +// for every affected tenant to keep Mimir in sync. 
+func RenderAndPushEffective(ctx context.Context, tenantOrgID string) error { + effective, err := computeEffectiveLayer(tenantOrgID) + if err != nil { + return fmt.Errorf("compute effective for %s: %w", tenantOrgID, err) + } + cfg := configuration.Config + yamlConfig, err := RenderConfig( + cfg.SMTPHost, cfg.SMTPPort, cfg.SMTPUsername, cfg.SMTPPassword, cfg.SMTPFrom, cfg.SMTPTLS, + cfg.AlertingHistoryWebhookURL, cfg.AlertingHistoryWebhookSecret, + &effective, + ) + if err != nil { + return fmt.Errorf("render YAML for %s: %w", tenantOrgID, err) + } + templateFiles, err := BuildTemplateFiles(cfg.AppURL) + if err != nil { + return fmt.Errorf("build templates for %s: %w", tenantOrgID, err) + } + if err := PushConfig(tenantOrgID, yamlConfig, templateFiles); err != nil { + return fmt.Errorf("push config for %s: %w", tenantOrgID, err) + } + logger.Debug().Str("tenant", tenantOrgID).Msg("effective alerting config pushed to mimir") + return nil +} diff --git a/backend/services/alerting/embed.go b/backend/services/alerting/embed.go index b27a3276..122e23ee 100644 --- a/backend/services/alerting/embed.go +++ b/backend/services/alerting/embed.go @@ -15,67 +15,82 @@ import ( var templateFS embed.FS // ValidTemplateLangs lists supported email template languages. +// +// All supported languages are shipped with every tenant push because the +// merged effective config can mix languages across recipients (one email +// recipient in en, another in it). The renderer picks per-recipient which +// dispatcher to reference. var ValidTemplateLangs = []string{"en", "it"} -// BuildTemplateFiles returns all Alertmanager template file contents for -// the given language, plus a generated dispatcher template that routes -// firing/resolved notifications to the correct language-specific template, -// plus the Telegram message template for the same language. -// lang defaults to "en" when empty. 
-// appURL is substituted into the ${APP_URL} placeholder inside the templates, -// used by the "view system" CTA to build a link to the frontend. -func BuildTemplateFiles(lang, appURL string) (map[string]string, error) { - if lang == "" { - lang = "en" - } - - names := []string{ - "firing_" + lang + ".html", - "resolved_" + lang + ".html", - "firing_" + lang + ".txt", - "resolved_" + lang + ".txt", - "telegram_" + lang + ".tmpl", - } - - files := make(map[string]string, len(names)+1) - for _, name := range names { - content, err := templateFS.ReadFile("templates/" + name) - if err != nil { - return nil, fmt.Errorf("loading alert template %s: %w", name, err) +// BuildTemplateFiles returns the complete bundle of Alertmanager template +// files for every supported language plus per-language dispatcher templates +// (alert_.html / alert_.txt / alert_.subject). +// +// appURL is substituted into the ${APP_URL} placeholder inside the +// language-specific templates, used by the "view system" CTA to build a +// link to the frontend. 
+// +// Output layout: +// +// firing_en.html / firing_en.txt / firing_en.subject (defined in firing_en.html) +// resolved_en.html / resolved_en.txt / resolved_en.subject +// telegram_en.tmpl +// (same for it) +// _dispatcher.tmpl — defines alert_en.{html,txt,subject} and alert_it.{html,txt,subject} +func BuildTemplateFiles(appURL string) (map[string]string, error) { + files := map[string]string{} + for _, lang := range ValidTemplateLangs { + names := []string{ + "firing_" + lang + ".html", + "resolved_" + lang + ".html", + "firing_" + lang + ".txt", + "resolved_" + lang + ".txt", + "telegram_" + lang + ".tmpl", + } + for _, name := range names { + content, err := templateFS.ReadFile("templates/" + name) + if err != nil { + return nil, fmt.Errorf("loading alert template %s: %w", name, err) + } + files[name] = strings.ReplaceAll(string(content), "${APP_URL}", appURL) } - files[name] = strings.ReplaceAll(string(content), "${APP_URL}", appURL) } - - // Dispatcher routes firing/resolved to the correct language template. - files["_dispatcher.tmpl"] = buildDispatcher(lang) - + files["_dispatcher.tmpl"] = buildDispatcher() return files, nil } -// buildDispatcher generates a small Alertmanager template file that dispatches -// to the correct firing/resolved template based on .Status. -func buildDispatcher(lang string) string { +// buildDispatcher generates a per-language dispatcher template file that +// routes firing/resolved notifications to the right language-specific +// template. Names are suffixed with `_` so multiple languages can +// coexist in the same Mimir-loaded template set without colliding on the +// unqualified `alert.html` / `alert.txt` / `alert.subject` names. +// +// Each email_configs entry in the rendered YAML picks its dispatcher via +// `{{ template "alert_.html" . }}` (and equivalents for text/subject). 
+func buildDispatcher() string { var sb strings.Builder - fmt.Fprintf(&sb, - "{{ define \"alert.html\" }}"+ - "{{ if eq .Status \"firing\" }}{{ template \"firing_%s.html\" . }}"+ - "{{ else }}{{ template \"resolved_%s.html\" . }}{{ end }}"+ - "{{ end }}\n", - lang, lang, - ) - fmt.Fprintf(&sb, - "{{ define \"alert.txt\" }}"+ - "{{ if eq .Status \"firing\" }}{{ template \"firing_%s.txt\" . }}"+ - "{{ else }}{{ template \"resolved_%s.txt\" . }}{{ end }}"+ - "{{ end }}\n", - lang, lang, - ) - fmt.Fprintf(&sb, - "{{ define \"alert.subject\" }}"+ - "{{ if eq .Status \"firing\" }}{{ template \"firing_%s.subject\" . }}"+ - "{{ else }}{{ template \"resolved_%s.subject\" . }}{{ end }}"+ - "{{ end }}\n", - lang, lang, - ) + for _, lang := range ValidTemplateLangs { + fmt.Fprintf(&sb, + "{{ define \"alert_%s.html\" }}"+ + "{{ if eq .Status \"firing\" }}{{ template \"firing_%s.html\" . }}"+ + "{{ else }}{{ template \"resolved_%s.html\" . }}{{ end }}"+ + "{{ end }}\n", + lang, lang, lang, + ) + fmt.Fprintf(&sb, + "{{ define \"alert_%s.txt\" }}"+ + "{{ if eq .Status \"firing\" }}{{ template \"firing_%s.txt\" . }}"+ + "{{ else }}{{ template \"resolved_%s.txt\" . }}{{ end }}"+ + "{{ end }}\n", + lang, lang, lang, + ) + fmt.Fprintf(&sb, + "{{ define \"alert_%s.subject\" }}"+ + "{{ if eq .Status \"firing\" }}{{ template \"firing_%s.subject\" . }}"+ + "{{ else }}{{ template \"resolved_%s.subject\" . }}{{ end }}"+ + "{{ end }}\n", + lang, lang, lang, + ) + } return sb.String() } diff --git a/backend/services/alerting/merge.go b/backend/services/alerting/merge.go new file mode 100644 index 00000000..8b80afd9 --- /dev/null +++ b/backend/services/alerting/merge.go @@ -0,0 +1,195 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. 
+SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package alerting + +import ( + "strconv" + "strings" + + "github.com/nethesis/my/backend/models" +) + +// MergeLayers combines a sequence of organization-level config layers into +// the effective AlertingConfigLayer that the renderer turns into Mimir YAML. +// +// `layers` is ordered from least specific (Owner) to most specific (the +// tenant whose effective config we are computing). Order matters for dedup +// collisions (first occurrence wins for language/format) and for the +// "any layer says all severities" widening rule. +// +// Behaviour summary (security-critical: descendants can ADD, never REMOVE): +// - bool channel toggles: OR — if any layer enables a channel, effective +// is enabled. A nil or false in a deeper layer never disables what an +// ancestor enabled. +// - recipient lists: union with stable dedup. Dedup keys are +// email→address, webhook→URL, telegram→(bot_token, chat_id). +// - severities[] per recipient: union; if any contributing copy has +// severities=[] ("all severities"), the merged copy widens back to []. +// - language/format on a deduped email recipient: first-occurrence wins +// (Owner intent is preserved; descendants cannot retitle ancestor mail). +// +// IMPORTANT: the merged result is server-internal. It feeds the Mimir YAML +// renderer and nothing else. /alerts/config never returns a merged view to +// any client — descendants only see their own layer. +func MergeLayers(layers []models.AlertingConfigLayer) models.AlertingConfigLayer { + out := models.AlertingConfigLayer{ + EmailRecipients: []models.EmailRecipient{}, + WebhookRecipients: []models.WebhookRecipient{}, + TelegramRecipients: []models.TelegramRecipient{}, + } + + // OR accumulators for the three toggles; promoted to *bool at the end. 
+ emailEnabled := false + webhookEnabled := false + telegramEnabled := false + + // Index into out.* lists by dedup key so collisions can update the + // existing entry's severities (union) without reordering. + emailIdx := map[string]int{} + webhookIdx := map[string]int{} + telegramIdx := map[string]int{} + + for _, layer := range layers { + if layer.Enabled.Email != nil && *layer.Enabled.Email { + emailEnabled = true + } + if layer.Enabled.Webhook != nil && *layer.Enabled.Webhook { + webhookEnabled = true + } + if layer.Enabled.Telegram != nil && *layer.Enabled.Telegram { + telegramEnabled = true + } + + for _, r := range layer.EmailRecipients { + addr := strings.TrimSpace(r.Address) + if addr == "" { + continue + } + if i, seen := emailIdx[addr]; seen { + out.EmailRecipients[i].Severities = unionSeverities(out.EmailRecipients[i].Severities, r.Severities) + continue + } + emailIdx[addr] = len(out.EmailRecipients) + out.EmailRecipients = append(out.EmailRecipients, models.EmailRecipient{ + Address: addr, + Severities: normalizeSeverities(r.Severities), + Language: r.Language, + Format: r.Format, + }) + } + for _, r := range layer.WebhookRecipients { + url := strings.TrimSpace(r.URL) + if url == "" { + continue + } + if i, seen := webhookIdx[url]; seen { + out.WebhookRecipients[i].Severities = unionSeverities(out.WebhookRecipients[i].Severities, r.Severities) + continue + } + webhookIdx[url] = len(out.WebhookRecipients) + out.WebhookRecipients = append(out.WebhookRecipients, models.WebhookRecipient{ + Name: r.Name, + URL: url, + Severities: normalizeSeverities(r.Severities), + }) + } + for _, r := range layer.TelegramRecipients { + key := telegramKey(r) + if i, seen := telegramIdx[key]; seen { + out.TelegramRecipients[i].Severities = unionSeverities(out.TelegramRecipients[i].Severities, r.Severities) + continue + } + telegramIdx[key] = len(out.TelegramRecipients) + out.TelegramRecipients = append(out.TelegramRecipients, models.TelegramRecipient{ + BotToken: 
r.BotToken, + ChatID: r.ChatID, + Severities: normalizeSeverities(r.Severities), + }) + } + } + + out.Enabled = models.ChannelToggles{ + Email: boolPtr(emailEnabled), + Webhook: boolPtr(webhookEnabled), + Telegram: boolPtr(telegramEnabled), + } + return out +} + +// NormalizeLayerForRole sanitises a layer about to be saved for a given org +// role so that descendants cannot encode subtractive settings. +// +// For any role except owner we drop *bool=&false on the three channel +// toggles. The user's intent ("disable email for my tenant") doesn't fit the +// additive model — only Owner can globally turn a channel off, and even +// then descendant layers may bring it back via OR. nil is the correct +// "no opinion" representation; we silently rewrite false → nil to keep the +// stored layer consistent with the contract. +func NormalizeLayerForRole(layer *models.AlertingConfigLayer, orgRole string) { + if layer == nil { + return + } + if strings.EqualFold(orgRole, "owner") { + return + } + if layer.Enabled.Email != nil && !*layer.Enabled.Email { + layer.Enabled.Email = nil + } + if layer.Enabled.Webhook != nil && !*layer.Enabled.Webhook { + layer.Enabled.Webhook = nil + } + if layer.Enabled.Telegram != nil && !*layer.Enabled.Telegram { + layer.Enabled.Telegram = nil + } +} + +// unionSeverities merges two severities slices with widening semantics: +// if either side encodes "all severities" (empty slice), the result is also +// empty (= all). Otherwise the union of the two sets is returned in the +// canonical order (critical, warning, info). +func unionSeverities(a, b []string) []string { + if len(a) == 0 || len(b) == 0 { + return []string{} + } + seen := map[string]struct{}{} + for _, v := range a { + seen[v] = struct{}{} + } + for _, v := range b { + seen[v] = struct{}{} + } + return canonicalSeverityOrder(seen) +} + +// normalizeSeverities returns a copy of `s` in canonical order with duplicates +// dropped and unknown values stripped. 
Empty (or all-unknown) → empty slice, +// which the renderer interprets as "all severities". +func normalizeSeverities(s []string) []string { + if len(s) == 0 { + return []string{} + } + seen := map[string]struct{}{} + for _, v := range s { + seen[v] = struct{}{} + } + return canonicalSeverityOrder(seen) +} + +func canonicalSeverityOrder(set map[string]struct{}) []string { + out := make([]string, 0, len(set)) + for _, sev := range []string{"critical", "warning", "info"} { + if _, ok := set[sev]; ok { + out = append(out, sev) + } + } + return out +} + +func telegramKey(r models.TelegramRecipient) string { + return r.BotToken + "|" + strconv.FormatInt(r.ChatID, 10) +} + +func boolPtr(b bool) *bool { return &b } diff --git a/backend/services/alerting/merge_test.go b/backend/services/alerting/merge_test.go new file mode 100644 index 00000000..541c4cd0 --- /dev/null +++ b/backend/services/alerting/merge_test.go @@ -0,0 +1,135 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. +SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package alerting + +import ( + "reflect" + "testing" + + "github.com/nethesis/my/backend/models" +) + +func ptrTrue() *bool { + v := true + return &v +} + +func ptrFalse() *bool { + v := false + return &v +} + +func TestMergeLayers_Empty(t *testing.T) { + out := MergeLayers(nil) + if out.Enabled.Email == nil || *out.Enabled.Email { + t.Errorf("expected email disabled, got %#v", out.Enabled.Email) + } + if len(out.EmailRecipients) != 0 || len(out.WebhookRecipients) != 0 || len(out.TelegramRecipients) != 0 { + t.Errorf("expected empty lists, got %#v", out) + } +} + +func TestMergeLayers_BoolsOR(t *testing.T) { + out := MergeLayers([]models.AlertingConfigLayer{ + {Enabled: models.ChannelToggles{Email: ptrFalse()}}, + {Enabled: models.ChannelToggles{Email: ptrTrue()}}, + }) + if !*out.Enabled.Email { + t.Errorf("OR semantics: expected email=true (one layer turned it on)") + } +} + +func TestMergeLayers_EmailDedupByAddress(t *testing.T) { + out := 
MergeLayers([]models.AlertingConfigLayer{ + {EmailRecipients: []models.EmailRecipient{{Address: "a@x.com", Severities: []string{"critical"}, Language: "en"}}}, + {EmailRecipients: []models.EmailRecipient{{Address: "a@x.com", Severities: []string{"warning"}, Language: "it"}}}, + }) + if len(out.EmailRecipients) != 1 { + t.Fatalf("expected 1 deduped recipient, got %d", len(out.EmailRecipients)) + } + got := out.EmailRecipients[0] + if !reflect.DeepEqual(got.Severities, []string{"critical", "warning"}) { + t.Errorf("expected merged severities critical+warning, got %#v", got.Severities) + } + if got.Language != "en" { + t.Errorf("expected first-occurrence language en, got %q", got.Language) + } +} + +func TestMergeLayers_AllSeveritiesWidens(t *testing.T) { + out := MergeLayers([]models.AlertingConfigLayer{ + {EmailRecipients: []models.EmailRecipient{{Address: "a@x.com", Severities: []string{"critical"}}}}, + {EmailRecipients: []models.EmailRecipient{{Address: "a@x.com", Severities: []string{}}}}, + }) + if len(out.EmailRecipients) != 1 { + t.Fatalf("expected 1 deduped recipient, got %d", len(out.EmailRecipients)) + } + if len(out.EmailRecipients[0].Severities) != 0 { + t.Errorf("[] widening: expected empty (all severities), got %#v", out.EmailRecipients[0].Severities) + } +} + +func TestMergeLayers_WebhookDedupByURL(t *testing.T) { + out := MergeLayers([]models.AlertingConfigLayer{ + {WebhookRecipients: []models.WebhookRecipient{{Name: "owner-slack", URL: "https://hooks.example/x"}}}, + {WebhookRecipients: []models.WebhookRecipient{{Name: "reseller-slack", URL: "https://hooks.example/x"}}}, + }) + if len(out.WebhookRecipients) != 1 { + t.Fatalf("expected 1 deduped webhook, got %d", len(out.WebhookRecipients)) + } + if out.WebhookRecipients[0].Name != "owner-slack" { + t.Errorf("first-occurrence name expected, got %q", out.WebhookRecipients[0].Name) + } +} + +func TestMergeLayers_TelegramDedupByBotAndChat(t *testing.T) { + out := 
MergeLayers([]models.AlertingConfigLayer{ + {TelegramRecipients: []models.TelegramRecipient{{BotToken: "tok1", ChatID: -100}}}, + {TelegramRecipients: []models.TelegramRecipient{{BotToken: "tok1", ChatID: -100}}}, + {TelegramRecipients: []models.TelegramRecipient{{BotToken: "tok2", ChatID: -100}}}, + }) + if len(out.TelegramRecipients) != 2 { + t.Fatalf("expected 2 telegram recipients (different bots), got %d", len(out.TelegramRecipients)) + } +} + +func TestNormalizeLayerForRole_StripsFalseForNonOwner(t *testing.T) { + layer := models.AlertingConfigLayer{ + Enabled: models.ChannelToggles{Email: ptrFalse(), Webhook: ptrTrue()}, + } + NormalizeLayerForRole(&layer, "Reseller") + if layer.Enabled.Email != nil { + t.Errorf("non-Owner explicit false must be normalised to nil, got %#v", layer.Enabled.Email) + } + if layer.Enabled.Webhook == nil || !*layer.Enabled.Webhook { + t.Errorf("non-Owner explicit true must be preserved, got %#v", layer.Enabled.Webhook) + } +} + +func TestNormalizeLayerForRole_OwnerKeepsFalse(t *testing.T) { + layer := models.AlertingConfigLayer{ + Enabled: models.ChannelToggles{Telegram: ptrFalse()}, + } + NormalizeLayerForRole(&layer, "Owner") + if layer.Enabled.Telegram == nil || *layer.Enabled.Telegram { + t.Errorf("Owner explicit false must be preserved, got %#v", layer.Enabled.Telegram) + } +} + +func TestNormalizeSeverities_CanonicalOrderAndDrop(t *testing.T) { + got := normalizeSeverities([]string{"info", "critical", "bogus", "critical"}) + want := []string{"critical", "info"} + if !reflect.DeepEqual(got, want) { + t.Errorf("normalizeSeverities: got %v, want %v", got, want) + } +} + +func TestUnionSeverities_EmptyWidens(t *testing.T) { + got := unionSeverities([]string{"critical"}, []string{}) + if len(got) != 0 { + t.Errorf("empty side widens: got %v, want []", got) + } +} diff --git a/backend/services/alerting/provision.go b/backend/services/alerting/provision.go index 8c3c4cd8..c96301fa 100644 --- a/backend/services/alerting/provision.go 
+++ b/backend/services/alerting/provision.go @@ -11,62 +11,47 @@ import ( "github.com/nethesis/my/backend/configuration" "github.com/nethesis/my/backend/logger" - "github.com/nethesis/my/backend/models" ) // provisionRetryDelays controls the backoff between retry attempts when // pushing the default config to Mimir fails with a transient error. var provisionRetryDelays = []time.Duration{1 * time.Second, 3 * time.Second, 5 * time.Second} -// ProvisionDefaultConfig pushes a minimal default alerting configuration to Mimir -// for the given organization. The built-in history webhook is always active so -// resolved alerts are persisted in the alert_history table. +// ProvisionDefaultConfig is called when a new organization is created. It +// pushes the effective merged config for that org's tenant to Mimir so any +// layers already saved by ancestors (Owner/Distributor/Reseller) take +// effect immediately. The new org itself starts with no layer of its own; +// the admin opts in to notifications by saving a layer via POST /alerts/config. // -// If defaultEmail is non-empty it is stored as the default mail recipient so it -// appears pre-filled in the UI, but mail notifications are always disabled on -// creation. Webhook notifications are also always disabled. -// Both must be explicitly enabled by the user after creation. -// -// defaultLang sets the email template language: "it" or "en". Invalid or empty -// values default to English. -// -// This is typically called when a new organization is created, to ensure that -// alerts received before the user configures alerting manually are still -// captured in the history and that the empty-receiver fallback is never used. -func ProvisionDefaultConfig(orgID, defaultEmail, defaultLang string) error { +// The built-in history webhook is always active so resolved alerts are +// persisted in alert_history regardless of admin choices. 
+func ProvisionDefaultConfig(orgID string) error { if orgID == "" { return fmt.Errorf("orgID is required") } - // Normalize language: accept only "it" or "en", fall back to "en" otherwise. - lang := "" - switch defaultLang { - case "it", "en": - lang = defaultLang + // Compute the effective merged config from any ancestor layers that + // exist. Fail closed: a misconfigured hierarchy (cycle, missing parent + // row, transient DB error) must NOT silently provision a less-protected + // config than the Owner intended. The org creation flow can retry; the + // alternative — "fall back to local defaults" — risks losing Owner-set + // recipients/severity rules during a window we cannot otherwise detect. + effective, err := computeEffectiveLayer(orgID) + if err != nil { + return fmt.Errorf("compute effective config at provision: %w", err) } cfg := configuration.Config - defaultAlerting := &models.AlertingConfig{ - MailEnabled: false, - WebhookEnabled: false, - MailAddresses: []string{}, - WebhookReceivers: []models.WebhookReceiver{}, - EmailTemplateLang: lang, - } - if defaultEmail != "" { - defaultAlerting.MailAddresses = []string{defaultEmail} - } - yamlConfig, err := RenderConfig( cfg.SMTPHost, cfg.SMTPPort, cfg.SMTPUsername, cfg.SMTPPassword, cfg.SMTPFrom, cfg.SMTPTLS, cfg.AlertingHistoryWebhookURL, cfg.AlertingHistoryWebhookSecret, - defaultAlerting, + &effective, ) if err != nil { return fmt.Errorf("rendering default alerting config: %w", err) } - templateFiles, err := BuildTemplateFiles(lang, cfg.AppURL) + templateFiles, err := BuildTemplateFiles(cfg.AppURL) if err != nil { return fmt.Errorf("building default alerting templates: %w", err) } diff --git a/backend/services/alerting/redaction.go b/backend/services/alerting/redaction.go new file mode 100644 index 00000000..d8297935 --- /dev/null +++ b/backend/services/alerting/redaction.go @@ -0,0 +1,70 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. 
+SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package alerting + +import ( + "net/url" + + "github.com/nethesis/my/backend/models" +) + +// RedactedSecretPlaceholder is the literal returned in place of any sensitive +// value (telegram bot token, webhook URL path/query carrying a bearer secret) +// in audit-log snapshots. Audit rows are queried by admins; the unredacted +// values live in alert_config_layers and are read only by the renderer. +const RedactedSecretPlaceholder = "[REDACTED]" + +// RedactLayerForAudit returns a copy of `layer` with secrets scrubbed for +// safe inclusion in audit log details. Used exclusively by the audit +// snapshot helpers — the API never exposes layers other than the caller's +// own and there is no /effective endpoint, so this is the only path on +// which a layer's bytes leave their owning context. +// +// Specifically: +// - telegram_recipients[].bot_token → "[REDACTED]" +// - webhook_recipients[].url → scheme://host/[REDACTED] (path/query stripped) +// +// Email addresses are NOT scrubbed: they're already user-typed PII the +// admin is authorised to see, and they double as the dedup key. 
+func RedactLayerForAudit(layer models.AlertingConfigLayer) models.AlertingConfigLayer { + out := layer + if len(layer.WebhookRecipients) > 0 { + out.WebhookRecipients = make([]models.WebhookRecipient, len(layer.WebhookRecipients)) + for i, w := range layer.WebhookRecipients { + out.WebhookRecipients[i] = models.WebhookRecipient{ + Name: w.Name, + URL: maskWebhookURL(w.URL), + Severities: w.Severities, + } + } + } + if len(layer.TelegramRecipients) > 0 { + out.TelegramRecipients = make([]models.TelegramRecipient, len(layer.TelegramRecipients)) + for i, t := range layer.TelegramRecipients { + out.TelegramRecipients[i] = models.TelegramRecipient{ + BotToken: RedactedSecretPlaceholder, + ChatID: t.ChatID, + Severities: t.Severities, + } + } + } + return out +} + +// maskWebhookURL keeps scheme + host + port (so the audit log records where +// the webhook went) but strips path, query, and fragment which routinely +// carry bearer-equivalent secrets (e.g. Slack incoming webhook IDs). +func maskWebhookURL(raw string) string { + u, err := url.Parse(raw) + if err != nil || u.Host == "" { + return RedactedSecretPlaceholder + } + masked := url.URL{Scheme: u.Scheme, Host: u.Host} + if u.Path != "" || u.RawQuery != "" || u.Fragment != "" { + return masked.String() + "/" + RedactedSecretPlaceholder + } + return masked.String() +} diff --git a/backend/services/alerting/template.go b/backend/services/alerting/template.go index afa4eb71..c8c7e77d 100644 --- a/backend/services/alerting/template.go +++ b/backend/services/alerting/template.go @@ -8,13 +8,10 @@ package alerting import ( "bytes" "fmt" - "regexp" "strconv" "strings" "text/template" - "gopkg.in/yaml.v3" - "github.com/nethesis/my/backend/models" ) @@ -27,16 +24,37 @@ func yamlEscape(s string) string { return s } -var validSeverityKey = regexp.MustCompile(`^[a-zA-Z0-9_]+$`) - // routeEntry represents a single child route in the Alertmanager routing tree. 
+// MatcherKey/Value is the primary match expression (severity="X"); empty +// MatcherKey means the catch-all fallback route. type routeEntry struct { - MatcherKey string // "system_key" or "severity"; empty = global fallback + MatcherKey string MatcherValue string - ReceiverName string // "blackhole" when notifications are disabled + ReceiverName string +} + +// emailEntry is a single email destination with its own per-recipient +// template overrides driven by the recipient's language and format +// preferences. format="" or "html" emits our html template plus our text +// template (multipart, html primary, text fallback). format="plain" emits +// our text template plus an empty `html:` value — the empty html: is mandatory because +// Alertmanager otherwise falls back to its built-in HTML template, which +// would override ours with the generic "Sent by Alertmanager" body. +type emailEntry struct { + To string + Language string // "en" or "it" (resolved; never empty) + UseHTML bool // true → include our html template; false → emit html: '' } -// telegramEntry represents a single Telegram notification target inside a receiver. +// webhookEntry is a single webhook destination as it appears inside a +// receiver's webhook_configs. +type webhookEntry struct { + URL string +} + +// telegramEntry is a single Telegram destination as it appears inside a +// receiver's telegram_configs. Telegram messages currently always render +// in English; extend with a per-recipient Language field when needed. type telegramEntry struct { BotToken string ChatID int64 @@ -45,8 +63,8 @@ type telegramEntry struct { // receiverEntry represents a named Alertmanager receiver.
type receiverEntry struct { Name string - Emails []string - Webhooks []string + Emails []emailEntry + Webhooks []webhookEntry Telegrams []telegramEntry } @@ -61,9 +79,11 @@ type templateData struct { HistoryWebhookToken string Routes []routeEntry Receivers []receiverEntry - // EmailTemplateLang is set when custom email templates are configured ("en" or "it"). - // An empty value means Alertmanager's built-in default templates are used. - EmailTemplateLang string + // HasEmailReceivers is true when at least one receiver carries email + // destinations. The Alertmanager `templates:` block is emitted only + // then — pushing template files for tenants with no email recipients + // would just be dead weight in Mimir. + HasEmailReceivers bool } const alertmanagerTemplate = `global: @@ -118,20 +138,22 @@ receivers: {{- if .Emails }} email_configs: {{- range .Emails }} - - to: '{{ yamlEscape . }}' + - to: '{{ yamlEscape .To }}' send_resolved: true -{{- if $.EmailTemplateLang }} - html: '{{ "{{" }} template "alert.html" . {{ "}}" }}' - text: '{{ "{{" }} template "alert.txt" . {{ "}}" }}' - headers: - Subject: '{{ "{{" }} template "alert.subject" . {{ "}}" }}' +{{- if .UseHTML }} + html: '{{ "{{" }} template "alert_{{ .Language }}.html" . {{ "}}" }}' +{{- else }} + html: '' {{- end }} + text: '{{ "{{" }} template "alert_{{ .Language }}.txt" . {{ "}}" }}' + headers: + Subject: '{{ "{{" }} template "alert_{{ .Language }}.subject" . {{ "}}" }}' {{- end }} {{- end }} {{- if .Webhooks }} webhook_configs: {{- range .Webhooks }} - - url: '{{ yamlEscape . }}' + - url: '{{ yamlEscape .URL }}' send_resolved: true {{- end }} {{- end }} @@ -142,127 +164,135 @@ receivers: chat_id: {{ .ChatID }} send_resolved: true parse_mode: 'HTML' - message: '{{ "{{" }} template "telegram.message" . {{ "}}" }}' + message: '{{ "{{" }} template "telegram_en.message" . 
{{ "}}" }}' {{- end }} {{- end }} {{- end }} -{{- if .EmailTemplateLang }} +{{- if .HasEmailReceivers }} templates: - - 'firing_{{ .EmailTemplateLang }}.html' - - 'resolved_{{ .EmailTemplateLang }}.html' - - 'firing_{{ .EmailTemplateLang }}.txt' - - 'resolved_{{ .EmailTemplateLang }}.txt' + - 'firing_en.html' + - 'resolved_en.html' + - 'firing_en.txt' + - 'resolved_en.txt' + - 'firing_it.html' + - 'resolved_it.html' + - 'firing_it.txt' + - 'resolved_it.txt' - '_dispatcher.tmpl' - - 'telegram_{{ .EmailTemplateLang }}.tmpl' + - 'telegram_en.tmpl' + - 'telegram_it.tmpl' {{- else }} templates: [] {{- end }} ` -// effectiveSettings resolves mail/webhook/telegram settings for a given system_key and -// severity, applying override priority: system > severity > global. -// Returns (mailEnabled, webhookEnabled, telegramEnabled, emails, webhooks, telegrams). -func effectiveSettings(cfg *models.AlertingConfig, systemKey, severity string) (bool, bool, bool, []string, []string, []telegramEntry) { - mailEnabled := cfg.MailEnabled - webhookEnabled := cfg.WebhookEnabled - telegramEnabled := cfg.TelegramEnabled - emails := cfg.MailAddresses - webhooks := make([]string, 0, len(cfg.WebhookReceivers)) - for _, w := range cfg.WebhookReceivers { - webhooks = append(webhooks, w.URL) +// matchesSeverity reports whether a recipient with the given Severities[] +// configuration should receive alerts at the given severity. severities=[] +// means "all severities" — the recipient lands on every per-severity +// receiver. 
+func matchesSeverity(severities []string, target string) bool { + if len(severities) == 0 { + return true } - telegrams := make([]telegramEntry, 0, len(cfg.TelegramReceivers)) - for _, tg := range cfg.TelegramReceivers { - telegrams = append(telegrams, telegramEntry{BotToken: tg.BotToken, ChatID: tg.ChatID}) - } - - // Check severity override first (lower priority than system) - for _, sv := range cfg.Severities { - if sv.Severity == severity { - if sv.MailEnabled != nil { - mailEnabled = *sv.MailEnabled - } - if sv.WebhookEnabled != nil { - webhookEnabled = *sv.WebhookEnabled - } - if sv.TelegramEnabled != nil { - telegramEnabled = *sv.TelegramEnabled - } - if len(sv.MailAddresses) > 0 { - emails = sv.MailAddresses - } - if len(sv.WebhookReceivers) > 0 { - webhooks = make([]string, 0, len(sv.WebhookReceivers)) - for _, w := range sv.WebhookReceivers { - webhooks = append(webhooks, w.URL) - } - } - if len(sv.TelegramReceivers) > 0 { - telegrams = make([]telegramEntry, 0, len(sv.TelegramReceivers)) - for _, tg := range sv.TelegramReceivers { - telegrams = append(telegrams, telegramEntry{BotToken: tg.BotToken, ChatID: tg.ChatID}) - } - } - break + for _, s := range severities { + if s == target { + return true } } + return false +} - // Check system override (highest priority) - for _, sys := range cfg.Systems { - if sys.SystemKey == systemKey { - if sys.MailEnabled != nil { - mailEnabled = *sys.MailEnabled - } - if sys.WebhookEnabled != nil { - webhookEnabled = *sys.WebhookEnabled - } - if sys.TelegramEnabled != nil { - telegramEnabled = *sys.TelegramEnabled - } - if len(sys.MailAddresses) > 0 { - emails = sys.MailAddresses - } - if len(sys.WebhookReceivers) > 0 { - webhooks = make([]string, 0, len(sys.WebhookReceivers)) - for _, w := range sys.WebhookReceivers { - webhooks = append(webhooks, w.URL) - } - } - if len(sys.TelegramReceivers) > 0 { - telegrams = make([]telegramEntry, 0, len(sys.TelegramReceivers)) - for _, tg := range sys.TelegramReceivers { - 
telegrams = append(telegrams, telegramEntry{BotToken: tg.BotToken, ChatID: tg.ChatID}) - } - } - break - } +// resolveLanguage returns the language to render email templates with. An +// empty recipient.Language falls back to "en"; an unrecognised value falls +// back to "en" too (the model validator rejects unknown values before +// storage, this guards against any direct in-memory tampering). +func resolveLanguage(lang string) string { + switch lang { + case "it": + return "it" + default: + return "en" } +} - return mailEnabled, webhookEnabled, telegramEnabled, emails, webhooks, telegrams +// resolveUseHTML returns true when the rendered email_config should +// reference our html template. For format="plain" the renderer emits an +// empty `html:` value instead, suppressing Alertmanager's default HTML +// fallback so only our text body is delivered. +func resolveUseHTML(format string) bool { + return format != "plain" } -// buildReceiver creates a receiverEntry with effective email, webhook, and telegram lists. -func buildReceiver(name string, mailEnabled, webhookEnabled, telegramEnabled bool, emails, webhooks []string, telegrams []telegramEntry) *receiverEntry { - r := &receiverEntry{Name: name} - if mailEnabled { - r.Emails = emails +// buildReceiver materialises a receiver entry for one severity bucket. +// Drops categories whose channel toggle is off at the global layer; drops +// the entire receiver (caller substitutes blackhole) when every list ends +// up empty.
+func buildReceiver( + name string, + severity string, + cfg *models.AlertingConfigLayer, +) receiverEntry { + r := receiverEntry{Name: name} + + emailOn := cfg.Enabled.Email != nil && *cfg.Enabled.Email + webhookOn := cfg.Enabled.Webhook != nil && *cfg.Enabled.Webhook + telegramOn := cfg.Enabled.Telegram != nil && *cfg.Enabled.Telegram + + if emailOn { + for _, rcp := range cfg.EmailRecipients { + if !matchesSeverity(rcp.Severities, severity) { + continue + } + r.Emails = append(r.Emails, emailEntry{ + To: rcp.Address, + Language: resolveLanguage(rcp.Language), + UseHTML: resolveUseHTML(rcp.Format), + }) + } } - if webhookEnabled { - r.Webhooks = webhooks + if webhookOn { + for _, rcp := range cfg.WebhookRecipients { + if !matchesSeverity(rcp.Severities, severity) { + continue + } + r.Webhooks = append(r.Webhooks, webhookEntry{URL: rcp.URL}) + } } - if telegramEnabled { - r.Telegrams = telegrams + if telegramOn { + for _, rcp := range cfg.TelegramRecipients { + if !matchesSeverity(rcp.Severities, severity) { + continue + } + r.Telegrams = append(r.Telegrams, telegramEntry{ + BotToken: rcp.BotToken, + ChatID: rcp.ChatID, + }) + } } return r } -// RenderConfig renders the Alertmanager YAML configuration from AlertingConfig -// and SMTP settings. If cfg is nil, it produces a blackhole-only config. -// historyWebhookURL is always included as a non-bypassable builtin receiver. -// historyWebhookToken is the Bearer token for the history webhook (optional). -func RenderConfig(smtpHost string, smtpPort int, smtpUser, smtpPass, smtpFrom string, smtpTLS bool, historyWebhookURL, historyWebhookToken string, cfg *models.AlertingConfig) (string, error) { +// RenderConfig renders the Alertmanager YAML configuration for one tenant +// from a merged AlertingConfigLayer. The renderer fans out per-severity +// routes (critical/warning/info), each pointing to a dedicated receiver +// whose lists are restricted to the recipients in scope for that severity. 
+// +// Recipients with severities=[] land on every per-severity receiver +// (they apply to "all severities"). Empty buckets are routed to blackhole. +// +// historyWebhookURL is always included as a non-bypassable builtin +// receiver attached at the top of the routes via continue=true so every +// alert is mirrored to the history sink regardless of user config. +func RenderConfig( + smtpHost string, + smtpPort int, + smtpUser, smtpPass, smtpFrom string, + smtpTLS bool, + historyWebhookURL, historyWebhookToken string, + cfg *models.AlertingConfigLayer, +) (string, error) { smarthost := smtpHost if smtpPort > 0 { smarthost = smtpHost + ":" + strconv.Itoa(smtpPort) @@ -279,75 +309,33 @@ func RenderConfig(smtpHost string, smtpPort int, smtpUser, smtpPass, smtpFrom st } if cfg != nil { - // Validate severity keys - for _, sv := range cfg.Severities { - if !validSeverityKey.MatchString(sv.Severity) { - return "", fmt.Errorf("invalid severity key: %q", sv.Severity) - } - } - - // Set email template language (default to "en" when mail is used) - lang := cfg.EmailTemplateLang - if lang == "" { - lang = "en" - } - data.EmailTemplateLang = lang - - // Per-system routes - for _, sys := range cfg.Systems { - mailOn, webhookOn, telegramOn, emails, webhooks, telegrams := effectiveSettings(cfg, sys.SystemKey, "") - recvName := "system-" + sys.SystemKey + "-receiver" - if !mailOn && !webhookOn && !telegramOn { - recvName = "blackhole" - } - data.Routes = append(data.Routes, routeEntry{ - MatcherKey: "system_key", - MatcherValue: sys.SystemKey, - ReceiverName: recvName, - }) - if recvName != "blackhole" { - data.Receivers = append(data.Receivers, *buildReceiver(recvName, mailOn, webhookOn, telegramOn, emails, webhooks, telegrams)) - } - } - - // Per-severity routes - for _, sv := range cfg.Severities { - mailOn, webhookOn, telegramOn, emails, webhooks, telegrams := effectiveSettings(cfg, "", sv.Severity) - recvName := "severity-" + sv.Severity + "-receiver" - if !mailOn && 
!webhookOn && !telegramOn { + for _, severity := range []string{"critical", "warning", "info"} { + recv := buildReceiver("severity-"+severity+"-receiver", severity, cfg) + recvName := recv.Name + if len(recv.Emails) == 0 && len(recv.Webhooks) == 0 && len(recv.Telegrams) == 0 { recvName = "blackhole" } data.Routes = append(data.Routes, routeEntry{ MatcherKey: "severity", - MatcherValue: sv.Severity, + MatcherValue: severity, ReceiverName: recvName, }) if recvName != "blackhole" { - data.Receivers = append(data.Receivers, *buildReceiver(recvName, mailOn, webhookOn, telegramOn, emails, webhooks, telegrams)) + if len(recv.Emails) > 0 { + data.HasEmailReceivers = true + } + data.Receivers = append(data.Receivers, recv) } } - // Global fallback route - globalRecvName := "global-receiver" - if !cfg.MailEnabled && !cfg.WebhookEnabled && !cfg.TelegramEnabled { - globalRecvName = "blackhole" - } + // Catch-all fallback: alerts that escape the three per-severity + // matchers (missing/unknown severity label) go to blackhole rather + // than leaking to an undefined receiver. 
data.Routes = append(data.Routes, routeEntry{ MatcherKey: "", - ReceiverName: globalRecvName, + MatcherValue: "", + ReceiverName: "blackhole", }) - if globalRecvName != "blackhole" { - globalEmails := cfg.MailAddresses - globalWebhooks := make([]string, 0, len(cfg.WebhookReceivers)) - for _, w := range cfg.WebhookReceivers { - globalWebhooks = append(globalWebhooks, w.URL) - } - globalTelegrams := make([]telegramEntry, 0, len(cfg.TelegramReceivers)) - for _, tg := range cfg.TelegramReceivers { - globalTelegrams = append(globalTelegrams, telegramEntry{BotToken: tg.BotToken, ChatID: tg.ChatID}) - } - data.Receivers = append(data.Receivers, *buildReceiver(globalRecvName, cfg.MailEnabled, cfg.WebhookEnabled, cfg.TelegramEnabled, globalEmails, globalWebhooks, globalTelegrams)) - } } funcMap := template.FuncMap{"yamlEscape": yamlEscape} @@ -358,196 +346,7 @@ func RenderConfig(smtpHost string, smtpPort int, smtpUser, smtpPass, smtpFrom st var buf bytes.Buffer if err := tmpl.Execute(&buf, data); err != nil { - return "", err + return "", fmt.Errorf("rendering alertmanager template: %w", err) } - return buf.String(), nil } - -// --- YAML parsing structs (used only by ParseConfig) --- - -type amEmailConfig struct { - To string `yaml:"to"` -} -type amWebhookConfig struct { - URL string `yaml:"url"` -} -type amTelegramConfig struct { - BotToken string `yaml:"bot_token"` - ChatID int64 `yaml:"chat_id"` -} -type amReceiver struct { - Name string `yaml:"name"` - EmailConfigs []amEmailConfig `yaml:"email_configs"` - WebhookConfigs []amWebhookConfig `yaml:"webhook_configs"` - TelegramConfigs []amTelegramConfig `yaml:"telegram_configs"` -} -type amRoute struct { - Receiver string `yaml:"receiver"` - Continue bool `yaml:"continue"` - Matchers []string `yaml:"matchers"` - Routes []amRoute `yaml:"routes"` -} -type amConfig struct { - Route amRoute `yaml:"route"` - Receivers []amReceiver `yaml:"receivers"` - Templates []string `yaml:"templates"` -} - -// parseMatcherValue extracts the 
value from a matcher string like `key="value"`. -func parseMatcherValue(matcher string) (key, value string) { - // Supports both key="value" and key=value - idx := strings.Index(matcher, "=") - if idx < 0 { - return "", "" - } - key = strings.TrimSpace(matcher[:idx]) - value = strings.Trim(strings.TrimSpace(matcher[idx+1:]), `"'`) - return key, value -} - -// ParseConfig parses an Alertmanager YAML configuration (as stored in Mimir) -// back into an AlertingConfig struct. Returns nil if the config is blackhole-only. -func ParseConfig(yamlStr string) (*models.AlertingConfig, error) { - // Mimir wraps the config under alertmanager_config key - var wrapper struct { - AlertmanagerConfig string `yaml:"alertmanager_config"` - } - if err := yaml.Unmarshal([]byte(yamlStr), &wrapper); err == nil && wrapper.AlertmanagerConfig != "" { - yamlStr = wrapper.AlertmanagerConfig - } - - var am amConfig - if err := yaml.Unmarshal([]byte(yamlStr), &am); err != nil { - return nil, fmt.Errorf("parsing alertmanager config: %w", err) - } - - // Build receiver lookup: name -> amReceiver - receiverMap := make(map[string]amReceiver, len(am.Receivers)) - for _, r := range am.Receivers { - receiverMap[r.Name] = r - } - - cfg := &models.AlertingConfig{} - hasAnyConfig := false - - for _, route := range am.Route.Routes { - recv := route.Receiver - - // Skip builtin-history (internal, not user-configurable) - if recv == "builtin-history" { - continue - } - - // Determine what this route matches - var matchKey, matchValue string - for _, m := range route.Matchers { - k, v := parseMatcherValue(m) - if k == "system_key" || k == "severity" { - matchKey = k - matchValue = v - break - } - } - - mailEnabled := recv != "blackhole" - webhookEnabled := recv != "blackhole" - telegramEnabled := recv != "blackhole" - var emails []string - var webhooks []WebhookEntry - var telegramReceivers []models.TelegramReceiver - - if recv != "blackhole" { - r, ok := receiverMap[recv] - if ok { - for _, ec := range 
r.EmailConfigs { - emails = append(emails, ec.To) - } - for _, wc := range r.WebhookConfigs { - // Infer name from receiver name (best effort) - name := strings.TrimSuffix(recv, "-receiver") - webhooks = append(webhooks, WebhookEntry{Name: name, URL: wc.URL}) - } - for _, tc := range r.TelegramConfigs { - telegramReceivers = append(telegramReceivers, models.TelegramReceiver{BotToken: tc.BotToken, ChatID: tc.ChatID}) - } - } - mailEnabled = len(emails) > 0 - webhookEnabled = len(webhooks) > 0 - telegramEnabled = len(telegramReceivers) > 0 - } - - switch matchKey { - case "system_key": - hasAnyConfig = true - bMailEnabled := mailEnabled - bWebhookEnabled := webhookEnabled - bTelegramEnabled := telegramEnabled - override := models.SystemOverride{ - SystemKey: matchValue, - MailEnabled: &bMailEnabled, - WebhookEnabled: &bWebhookEnabled, - TelegramEnabled: &bTelegramEnabled, - } - override.MailAddresses = append(override.MailAddresses, emails...) - for _, w := range webhooks { - override.WebhookReceivers = append(override.WebhookReceivers, models.WebhookReceiver{Name: w.Name, URL: w.URL}) - } - override.TelegramReceivers = append(override.TelegramReceivers, telegramReceivers...) - cfg.Systems = append(cfg.Systems, override) - - case "severity": - hasAnyConfig = true - bMailEnabled := mailEnabled - bWebhookEnabled := webhookEnabled - bTelegramEnabled := telegramEnabled - override := models.SeverityOverride{ - Severity: matchValue, - MailEnabled: &bMailEnabled, - WebhookEnabled: &bWebhookEnabled, - TelegramEnabled: &bTelegramEnabled, - } - override.MailAddresses = append(override.MailAddresses, emails...) - for _, w := range webhooks { - override.WebhookReceivers = append(override.WebhookReceivers, models.WebhookReceiver{Name: w.Name, URL: w.URL}) - } - override.TelegramReceivers = append(override.TelegramReceivers, telegramReceivers...) 
- cfg.Severities = append(cfg.Severities, override) - - default: - // Global fallback route - hasAnyConfig = true - cfg.MailEnabled = mailEnabled - cfg.WebhookEnabled = webhookEnabled - cfg.TelegramEnabled = telegramEnabled - cfg.MailAddresses = append(cfg.MailAddresses, emails...) - for _, w := range webhooks { - cfg.WebhookReceivers = append(cfg.WebhookReceivers, models.WebhookReceiver{Name: w.Name, URL: w.URL}) - } - cfg.TelegramReceivers = append(cfg.TelegramReceivers, telegramReceivers...) - } - } - - if !hasAnyConfig { - return nil, nil - } - - // Detect email template language from the templates list. - for _, t := range am.Templates { - if strings.Contains(t, "_en.") { - cfg.EmailTemplateLang = "en" - break - } else if strings.Contains(t, "_it.") { - cfg.EmailTemplateLang = "it" - break - } - } - - return cfg, nil -} - -// WebhookEntry is a temporary struct used during YAML parsing. -type WebhookEntry struct { - Name string - URL string -} diff --git a/backend/services/alerting/template_test.go b/backend/services/alerting/template_test.go index e4005834..9b2e0910 100644 --- a/backend/services/alerting/template_test.go +++ b/backend/services/alerting/template_test.go @@ -9,752 +9,202 @@ import ( "strings" "testing" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "gopkg.in/yaml.v3" - "github.com/nethesis/my/backend/models" ) -// smtpArgs returns the common SMTP args used across tests. -func smtpArgs() (string, int, string, string, string, bool) { - return "smtp.example.com", 587, "user", "pass", "from@example.com", true -} - -func boolPtr(b bool) *bool { return &b } - -// isValidYAML checks that s is parseable YAML. 
-func isValidYAML(t *testing.T, s string) { +func renderForTest(t *testing.T, cfg *models.AlertingConfigLayer) string { t.Helper() - var out interface{} - require.NoError(t, yaml.Unmarshal([]byte(s), &out), "YAML must be valid") -} - -// --- RenderConfig tests --- - -func TestRenderConfig_NilCfg_BlackholeOnly(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", nil) - require.NoError(t, err) - isValidYAML(t, out) - - assert.Contains(t, out, "receiver: 'blackhole'") - assert.NotContains(t, out, "builtin-history") - assert.NotContains(t, out, "system_key=") - assert.NotContains(t, out, "severity=") -} - -func TestRenderConfig_NilCfg_WithHistoryURL(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - out, err := RenderConfig(host, port, user, pass, from, tls, "http://history.example.com/hook", "", nil) - require.NoError(t, err) - isValidYAML(t, out) - - assert.Contains(t, out, "builtin-history") - assert.Contains(t, out, "url: 'http://history.example.com/hook'") - assert.Contains(t, out, "continue: true") - // No user routes - assert.NotContains(t, out, "system_key=") - assert.NotContains(t, out, "severity=") -} - -func TestRenderConfig_GlobalMailOnly(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"admin@example.com"}, - } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - assert.Contains(t, out, "receiver: 'global-receiver'") - assert.Contains(t, out, "to: 'admin@example.com'") - assert.NotContains(t, out, "webhook_configs") -} - -func TestRenderConfig_GlobalWebhookOnly(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - WebhookEnabled: true, - WebhookReceivers: []models.WebhookReceiver{ - {Name: "slack", URL: "https://hooks.slack.com/abc"}, - }, + 
out, err := RenderConfig( + "smtp.example", 587, "u", "p", "from@example", true, + "", "", + cfg, + ) + if err != nil { + t.Fatalf("RenderConfig error: %v", err) } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - assert.Contains(t, out, "receiver: 'global-receiver'") - assert.Contains(t, out, "url: 'https://hooks.slack.com/abc'") - assert.NotContains(t, out, "email_configs") + return out } -func TestRenderConfig_GlobalDisabled_BlackholeRoute(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - MailEnabled: false, - WebhookEnabled: false, +func TestRenderConfig_NilCfg_BlackholeOnly(t *testing.T) { + out := renderForTest(t, nil) + if !strings.Contains(out, "receiver: 'blackhole'") { + t.Errorf("expected blackhole receiver, got:\n%s", out) } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - // Global route must point to blackhole - assert.Contains(t, out, "- receiver: 'blackhole'") - // No named global-receiver - assert.NotContains(t, out, "global-receiver") -} - -func TestRenderConfig_HistoryAlwaysFires_EvenWhenDisabled(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - MailEnabled: false, - WebhookEnabled: false, + // No per-severity routes when cfg is nil. 
+ if strings.Contains(out, "severity=") { + t.Errorf("nil cfg should not emit severity matchers, got:\n%s", out) } - out, err := RenderConfig(host, port, user, pass, from, tls, "http://history.example.com/hook", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - assert.Contains(t, out, "builtin-history") - assert.Contains(t, out, "continue: true") } -func TestRenderConfig_SeverityOverride_DisablesWarning(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"global@example.com"}, - Severities: []models.SeverityOverride{ - { - Severity: "warning", - MailEnabled: boolPtr(false), - }, +func TestRenderConfig_PerSeverityFanout(t *testing.T) { + cfg := &models.AlertingConfigLayer{ + Enabled: models.ChannelToggles{Email: ptrTrue()}, + EmailRecipients: []models.EmailRecipient{ + {Address: "all@x.com", Severities: []string{}, Language: "en", Format: "html"}, + {Address: "crit@x.com", Severities: []string{"critical"}, Language: "en", Format: "html"}, }, } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - // Warning severity route must point to blackhole - lines := strings.Split(out, "\n") - inWarningBlock := false - for _, line := range lines { - if strings.Contains(line, `severity="warning"`) { - inWarningBlock = true - } - if inWarningBlock && strings.Contains(line, "receiver:") { - assert.Contains(t, line, "blackhole", "warning severity should route to blackhole") - break + out := renderForTest(t, cfg) + for _, sev := range []string{"critical", "warning", "info"} { + if !strings.Contains(out, "severity=\""+sev+"\"") { + t.Errorf("expected route matcher for severity=%s, got:\n%s", sev, out) } } - assert.Contains(t, out, `severity="warning"`) -} - -func TestRenderConfig_SeverityOverride_CustomAddresses(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := 
&models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"global@example.com"}, - Severities: []models.SeverityOverride{ - { - Severity: "critical", - MailEnabled: boolPtr(true), - MailAddresses: []string{"oncall@example.com"}, - }, - }, + // all@ goes into every severity bucket; crit@ only into critical. + // Locate the *receiver definition* (not the route match) by the + // `name: '…-receiver'` line — the first occurrence is inside `routes:` + // which only references the name, not the email_configs. + criticalIdx := strings.Index(out, "name: 'severity-critical-receiver'") + warningIdx := strings.Index(out, "name: 'severity-warning-receiver'") + infoIdx := strings.Index(out, "name: 'severity-info-receiver'") + if criticalIdx < 0 { + t.Fatalf("missing critical receiver definition; got:\n%s", out) } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - assert.Contains(t, out, "severity-critical-receiver") - assert.Contains(t, out, "oncall@example.com") - // Global address must not appear in the critical receiver - idx := strings.Index(out, "severity-critical-receiver") - afterCritical := out[idx:] - nextReceiver := strings.Index(afterCritical[1:], "- name:") - if nextReceiver > 0 { - criticalSection := afterCritical[:nextReceiver] - assert.NotContains(t, criticalSection, "global@example.com") + endCritical := len(out) + if warningIdx > criticalIdx { + endCritical = warningIdx + } else if infoIdx > criticalIdx { + endCritical = infoIdx } -} - -func TestRenderConfig_SeverityOverride_InheritsGlobalAddresses(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"global@example.com"}, - Severities: []models.SeverityOverride{ - { - Severity: "critical", - MailEnabled: boolPtr(true), - // No MailAddresses → should inherit global - }, - }, + critBlock := out[criticalIdx:endCritical] + if 
!strings.Contains(critBlock, "to: 'crit@x.com'") { + t.Errorf("crit@ should be on critical receiver, got:\n%s", critBlock) } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - assert.Contains(t, out, "severity-critical-receiver") - assert.Contains(t, out, "global@example.com") -} - -func TestRenderConfig_SystemOverride(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"global@example.com"}, - Systems: []models.SystemOverride{ - { - SystemKey: "ns8-prod", - MailEnabled: boolPtr(true), - MailAddresses: []string{"ops@example.com"}, - }, - }, + if !strings.Contains(critBlock, "to: 'all@x.com'") { + t.Errorf("all@ (severities=[]) should be on critical receiver, got:\n%s", critBlock) } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - assert.Contains(t, out, `system_key="ns8-prod"`) - assert.Contains(t, out, "system-ns8-prod-receiver") - assert.Contains(t, out, "ops@example.com") } -func TestRenderConfig_SystemOverride_Disabled(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"global@example.com"}, - Systems: []models.SystemOverride{ - { - SystemKey: "ns8-silent", - MailEnabled: boolPtr(false), - }, +func TestRenderConfig_FormatPlain_EmitsEmptyHTML(t *testing.T) { + cfg := &models.AlertingConfigLayer{ + Enabled: models.ChannelToggles{Email: ptrTrue()}, + EmailRecipients: []models.EmailRecipient{ + {Address: "p@x.com", Severities: []string{"critical"}, Language: "en", Format: "plain"}, }, } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - assert.Contains(t, out, `system_key="ns8-silent"`) - // System route must point to blackhole - lines := 
strings.Split(out, "\n") - inSystemBlock := false - for _, line := range lines { - if strings.Contains(line, `system_key="ns8-silent"`) { - inSystemBlock = true - } - if inSystemBlock && strings.Contains(line, "receiver:") { - assert.Contains(t, line, "blackhole") - break - } + out := renderForTest(t, cfg) + critIdx := strings.Index(out, "to: 'p@x.com'") + if critIdx < 0 { + t.Fatalf("recipient not found, got:\n%s", out) } -} - -func TestRenderConfig_InvalidSeverityKey(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"a@b.com"}, - Severities: []models.SeverityOverride{ - {Severity: "bad severity!"}, - }, + tail := out[critIdx : critIdx+400] + // Plain format must emit `html: ''` explicitly — Alertmanager's default + // HTML template overrides ours when html: is absent (see emailEntry doc). + if !strings.Contains(tail, "html: ''") { + t.Errorf("plain format must emit html: '' to suppress Alertmanager default, got:\n%s", tail) } - _, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.Error(t, err) - assert.Contains(t, err.Error(), "invalid severity key") -} - -func TestRenderConfig_SmtpCredentialsRedacted(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", nil) - require.NoError(t, err) - - // SMTP creds appear in raw output; RedactSensitiveConfig removes them - assert.Contains(t, out, "smtp_auth_username: 'user'") - assert.Contains(t, out, "smtp_auth_password: 'pass'") -} - -func TestRedactSensitiveConfig_BearerToken(t *testing.T) { - input := `global: - smtp_smarthost: 'smtp.example.com' - smtp_auth_username: 'testuser' - smtp_auth_password: 'testpass' - -receivers: - - name: 'builtin-history' - webhook_configs: - - url: 'http://example.com/webhook' - http_config: - authorization: - type: Bearer - credentials: 'secret-token-12345'` - - output := 
RedactSensitiveConfig(input) - - // Bearer token should be redacted - assert.NotContains(t, output, "secret-token-12345") - assert.Contains(t, output, "credentials: '[REDACTED]'") - - // SMTP credentials should also be redacted - assert.NotContains(t, output, "testpass") - assert.Contains(t, output, "smtp_auth_password: '[REDACTED]'") -} - -// --- ParseConfig tests --- - -func TestParseConfig_BlackholeOnly_ReturnsNil(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - yamlStr, err := RenderConfig(host, port, user, pass, from, tls, "", "", nil) - require.NoError(t, err) - - cfg, err := ParseConfig(yamlStr) - require.NoError(t, err) - assert.Nil(t, cfg, "blackhole-only config should parse to nil") -} - -func TestParseConfig_HistoryOnly_ReturnsNil(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - yamlStr, err := RenderConfig(host, port, user, pass, from, tls, "http://history.example.com/hook", "", nil) - require.NoError(t, err) - - cfg, err := ParseConfig(yamlStr) - require.NoError(t, err) - assert.Nil(t, cfg, "history-only config should parse to nil") -} - -func TestParseConfig_GlobalMail_Roundtrip(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - original := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"admin@example.com"}, + if !strings.Contains(tail, "text: '{{ template \"alert_en.txt\"") { + t.Errorf("plain format must still include text: dispatcher reference, got:\n%s", tail) } - yamlStr, err := RenderConfig(host, port, user, pass, from, tls, "", "", original) - require.NoError(t, err) - - parsed, err := ParseConfig(yamlStr) - require.NoError(t, err) - require.NotNil(t, parsed) - - assert.True(t, parsed.MailEnabled) - assert.False(t, parsed.WebhookEnabled) - assert.Equal(t, []string{"admin@example.com"}, parsed.MailAddresses) } -func TestParseConfig_GlobalWebhook_Roundtrip(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - original := &models.AlertingConfig{ - 
WebhookEnabled: true, - WebhookReceivers: []models.WebhookReceiver{ - {Name: "slack", URL: "https://hooks.slack.com/abc"}, +func TestRenderConfig_FormatHTMLDefault_IncludesBoth(t *testing.T) { + cfg := &models.AlertingConfigLayer{ + Enabled: models.ChannelToggles{Email: ptrTrue()}, + EmailRecipients: []models.EmailRecipient{ + {Address: "h@x.com", Severities: []string{"critical"}, Language: "it"}, }, } - yamlStr, err := RenderConfig(host, port, user, pass, from, tls, "", "", original) - require.NoError(t, err) - - parsed, err := ParseConfig(yamlStr) - require.NoError(t, err) - require.NotNil(t, parsed) - - assert.False(t, parsed.MailEnabled) - assert.True(t, parsed.WebhookEnabled) - require.Len(t, parsed.WebhookReceivers, 1) - assert.Equal(t, "https://hooks.slack.com/abc", parsed.WebhookReceivers[0].URL) -} - -func TestParseConfig_SeverityOverride_Roundtrip(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - original := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"global@example.com"}, - Severities: []models.SeverityOverride{ - { - Severity: "critical", - MailEnabled: boolPtr(true), - MailAddresses: []string{"oncall@example.com"}, - }, - }, + out := renderForTest(t, cfg) + if !strings.Contains(out, "html: '{{ template \"alert_it.html\"") { + t.Errorf("default format=html missing html: reference, got:\n%s", out) + } + if !strings.Contains(out, "text: '{{ template \"alert_it.txt\"") { + t.Errorf("default format=html missing text: fallback, got:\n%s", out) } - yamlStr, err := RenderConfig(host, port, user, pass, from, tls, "", "", original) - require.NoError(t, err) - - parsed, err := ParseConfig(yamlStr) - require.NoError(t, err) - require.NotNil(t, parsed) - - require.Len(t, parsed.Severities, 1) - assert.Equal(t, "critical", parsed.Severities[0].Severity) - require.NotNil(t, parsed.Severities[0].MailEnabled) - assert.True(t, *parsed.Severities[0].MailEnabled) - assert.Equal(t, []string{"oncall@example.com"}, 
parsed.Severities[0].MailAddresses) } -func TestParseConfig_SystemOverride_Roundtrip(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - original := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"global@example.com"}, - Systems: []models.SystemOverride{ - { - SystemKey: "ns8-prod", - MailEnabled: boolPtr(true), - MailAddresses: []string{"ops@example.com"}, - }, +func TestRenderConfig_EmailDisabledDropsEmailConfigs(t *testing.T) { + cfg := &models.AlertingConfigLayer{ + Enabled: models.ChannelToggles{Email: ptrFalse(), Webhook: ptrTrue()}, + EmailRecipients: []models.EmailRecipient{ + {Address: "a@x.com", Severities: []string{"critical"}}, + }, + WebhookRecipients: []models.WebhookRecipient{ + {Name: "w", URL: "https://hooks.example/x", Severities: []string{"critical"}}, }, } - yamlStr, err := RenderConfig(host, port, user, pass, from, tls, "", "", original) - require.NoError(t, err) - - parsed, err := ParseConfig(yamlStr) - require.NoError(t, err) - require.NotNil(t, parsed) - - require.Len(t, parsed.Systems, 1) - assert.Equal(t, "ns8-prod", parsed.Systems[0].SystemKey) - require.NotNil(t, parsed.Systems[0].MailEnabled) - assert.True(t, *parsed.Systems[0].MailEnabled) - assert.Equal(t, []string{"ops@example.com"}, parsed.Systems[0].MailAddresses) -} - -func TestParseConfig_MimirWrapperFormat(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - inner, err := RenderConfig(host, port, user, pass, from, tls, "", "", &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"a@b.com"}, - }) - require.NoError(t, err) - - // Simulate Mimir wrapper format - wrapped := "alertmanager_config: |\n" - for _, line := range strings.Split(inner, "\n") { - wrapped += " " + line + "\n" + out := renderForTest(t, cfg) + if strings.Contains(out, "to: 'a@x.com'") { + t.Errorf("channel toggle off must drop email_configs, got:\n%s", out) } - - cfg, err := ParseConfig(wrapped) - require.NoError(t, err) - require.NotNil(t, 
cfg) - assert.True(t, cfg.MailEnabled) - assert.Equal(t, []string{"a@b.com"}, cfg.MailAddresses) -} - -func TestParseConfig_GlobalDisabled_NotNil(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - MailEnabled: false, - WebhookEnabled: false, + if !strings.Contains(out, "url: 'https://hooks.example/x'") { + t.Errorf("webhook toggle on must still emit webhook_configs, got:\n%s", out) } - yamlStr, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - - parsed, err := ParseConfig(yamlStr) - require.NoError(t, err) - // Globally disabled but explicitly configured → non-nil - require.NotNil(t, parsed) - assert.False(t, parsed.MailEnabled) - assert.False(t, parsed.WebhookEnabled) } -func TestParseConfig_DisabledSeverity_BlackholeRoute(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"global@example.com"}, - Severities: []models.SeverityOverride{ - { - Severity: "warning", - MailEnabled: boolPtr(false), - }, +func TestRenderConfig_TelegramReferencesEnTemplate(t *testing.T) { + cfg := &models.AlertingConfigLayer{ + Enabled: models.ChannelToggles{Telegram: ptrTrue()}, + TelegramRecipients: []models.TelegramRecipient{ + {BotToken: "tok", ChatID: -100, Severities: []string{}}, }, } - yamlStr, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - - parsed, err := ParseConfig(yamlStr) - require.NoError(t, err) - require.NotNil(t, parsed) - - require.Len(t, parsed.Severities, 1) - assert.Equal(t, "warning", parsed.Severities[0].Severity) - require.NotNil(t, parsed.Severities[0].MailEnabled) - assert.False(t, *parsed.Severities[0].MailEnabled) -} - -func TestParseConfig_InvalidYAML(t *testing.T) { - _, err := ParseConfig("not: valid: yaml: ::::") - require.Error(t, err) -} - -// --- yamlEscape unit tests --- - -func TestYamlEscape_SingleQuote(t 
*testing.T) { - assert.Equal(t, "it''s", yamlEscape("it's")) -} - -func TestYamlEscape_Newlines(t *testing.T) { - assert.Equal(t, "noline", yamlEscape("no\nline")) - assert.Equal(t, "noline", yamlEscape("no\rline")) -} - -func TestYamlEscape_Empty(t *testing.T) { - assert.Equal(t, "", yamlEscape("")) -} - -// --- parseMatcherValue unit tests --- - -func TestParseMatcherValue_DoubleQuoted(t *testing.T) { - k, v := parseMatcherValue(`system_key="ns8-prod"`) - assert.Equal(t, "system_key", k) - assert.Equal(t, "ns8-prod", v) -} - -func TestParseMatcherValue_Unquoted(t *testing.T) { - k, v := parseMatcherValue("severity=critical") - assert.Equal(t, "severity", k) - assert.Equal(t, "critical", v) -} - -func TestParseMatcherValue_NoEquals(t *testing.T) { - k, v := parseMatcherValue("invalid") - assert.Empty(t, k) - assert.Empty(t, v) -} - -// --- EmailTemplateLang / BuildTemplateFiles tests --- - -func TestRenderConfig_DefaultsToEnglishTemplates(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"admin@example.com"}, - // EmailTemplateLang not set → should default to "en" - } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - assert.Contains(t, out, "firing_en.html") - assert.Contains(t, out, "resolved_en.html") - assert.Contains(t, out, "firing_en.txt") - assert.Contains(t, out, "resolved_en.txt") - assert.Contains(t, out, "_dispatcher.tmpl") - assert.Contains(t, out, `template "alert.html"`) - assert.Contains(t, out, `template "alert.txt"`) - assert.Contains(t, out, `template "alert.subject"`) -} - -func TestRenderConfig_ItalianTemplates(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"admin@example.com"}, - EmailTemplateLang: "it", + out := renderForTest(t, cfg) + if !strings.Contains(out, "{{ template 
\"telegram_en.message\"") { + t.Errorf("telegram message dispatcher should be english-only, got:\n%s", out) } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - assert.Contains(t, out, "firing_it.html") - assert.Contains(t, out, "resolved_it.html") - assert.Contains(t, out, "firing_it.txt") - assert.Contains(t, out, "resolved_it.txt") - assert.NotContains(t, out, "firing_en.html") } -func TestRenderConfig_NilCfg_NoTemplateSection(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", nil) - require.NoError(t, err) - - // nil cfg → blackhole-only, no email templates needed - assert.Contains(t, out, "templates: []") - assert.NotContains(t, out, "firing_en.html") -} - -func TestRenderConfig_WebhookOnly_NoHtmlField(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - WebhookEnabled: true, - WebhookReceivers: []models.WebhookReceiver{ - {Name: "slack", URL: "https://hooks.slack.com/abc"}, - }, +func TestRenderConfig_HistoryWebhook(t *testing.T) { + out, err := RenderConfig( + "smtp.example", 587, "u", "p", "from@example", true, + "https://history.example/sink", "secret-bearer", + nil, + ) + if err != nil { + t.Fatalf("err: %v", err) } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - // No email_configs → no html: field in output - assert.NotContains(t, out, `html:`) -} - -func TestBuildTemplateFiles_English(t *testing.T) { - files, err := BuildTemplateFiles("en", "https://my.nethesis.it") - require.NoError(t, err) - - expected := []string{ - "firing_en.html", - "resolved_en.html", - "firing_en.txt", - "resolved_en.txt", - "_dispatcher.tmpl", + if !strings.Contains(out, "name: 'builtin-history'") { + t.Errorf("history receiver should be emitted when URL set, got:\n%s", out) } - for _, name := 
range expected { - content, ok := files[name] - require.True(t, ok, "missing template file: %s", name) - assert.NotEmpty(t, content, "template file %s is empty", name) + if !strings.Contains(out, "credentials: 'secret-bearer'") { + t.Errorf("bearer token should propagate to webhook_configs, got:\n%s", out) } - - // Dispatcher must route to en templates - assert.Contains(t, files["_dispatcher.tmpl"], `firing_en.html`) - assert.Contains(t, files["_dispatcher.tmpl"], `resolved_en.html`) - assert.Contains(t, files["_dispatcher.tmpl"], `firing_en.txt`) - assert.Contains(t, files["_dispatcher.tmpl"], `resolved_en.txt`) - assert.Contains(t, files["_dispatcher.tmpl"], `firing_en.subject`) - assert.Contains(t, files["_dispatcher.tmpl"], `resolved_en.subject`) -} - -func TestBuildTemplateFiles_Italian(t *testing.T) { - files, err := BuildTemplateFiles("it", "https://my.nethesis.it") - require.NoError(t, err) - - assert.Contains(t, files, "firing_it.html") - assert.Contains(t, files, "resolved_it.html") - assert.Contains(t, files["_dispatcher.tmpl"], `firing_it.html`) -} - -func TestBuildTemplateFiles_EmptyLang_DefaultsToEnglish(t *testing.T) { - files, err := BuildTemplateFiles("", "https://my.nethesis.it") - require.NoError(t, err) - assert.Contains(t, files, "firing_en.html") } -func TestBuildTemplateFiles_InvalidLang_ReturnsError(t *testing.T) { - _, err := BuildTemplateFiles("zz", "https://my.nethesis.it") - require.Error(t, err) - assert.Contains(t, err.Error(), "zz") -} - -func TestTemplateFiles_DefineNamedTemplates(t *testing.T) { - files, err := BuildTemplateFiles("en", "https://my.nethesis.it") - require.NoError(t, err) - - assert.Contains(t, files["firing_en.html"], `define "firing_en.html"`) - assert.Contains(t, files["firing_en.html"], `define "firing_en.subject"`) - assert.Contains(t, files["resolved_en.html"], `define "resolved_en.html"`) - assert.Contains(t, files["resolved_en.html"], `define "resolved_en.subject"`) - assert.Contains(t, files["firing_en.txt"], 
`define "firing_en.txt"`) - assert.Contains(t, files["resolved_en.txt"], `define "resolved_en.txt"`) -} - -func TestParseConfig_EmailTemplateLang_English(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - original := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"a@b.com"}, - EmailTemplateLang: "en", +func TestBuildTemplateFiles_AllLanguagesPresent(t *testing.T) { + files, err := BuildTemplateFiles("https://app.example") + if err != nil { + t.Fatalf("BuildTemplateFiles error: %v", err) } - yamlStr, err := RenderConfig(host, port, user, pass, from, tls, "", "", original) - require.NoError(t, err) - - parsed, err := ParseConfig(yamlStr) - require.NoError(t, err) - require.NotNil(t, parsed) - assert.Equal(t, "en", parsed.EmailTemplateLang) -} - -func TestParseConfig_EmailTemplateLang_Italian(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - original := &models.AlertingConfig{ - MailEnabled: true, - MailAddresses: []string{"a@b.com"}, - EmailTemplateLang: "it", + want := []string{ + "firing_en.html", "resolved_en.html", "firing_en.txt", "resolved_en.txt", + "firing_it.html", "resolved_it.html", "firing_it.txt", "resolved_it.txt", + "telegram_en.tmpl", "telegram_it.tmpl", + "_dispatcher.tmpl", } - yamlStr, err := RenderConfig(host, port, user, pass, from, tls, "", "", original) - require.NoError(t, err) - - parsed, err := ParseConfig(yamlStr) - require.NoError(t, err) - require.NotNil(t, parsed) - assert.Equal(t, "it", parsed.EmailTemplateLang) -} - -func TestWrapForMimirWithTemplates(t *testing.T) { - templateFiles := map[string]string{ - "firing_en.html": "{{ define \"firing_en.html\" }}test{{ end }}", - "_dispatcher.tmpl": "{{ define \"alert.html\" }}x{{ end }}", + for _, name := range want { + if _, ok := files[name]; !ok { + t.Errorf("missing template file %s", name) + } } - out := wrapForMimir("route:\n receiver: blackhole", templateFiles) - - assert.Contains(t, out, "alertmanager_config: |") - 
assert.Contains(t, out, "template_files:") - assert.Contains(t, out, "firing_en.html:") - assert.Contains(t, out, "_dispatcher.tmpl:") } -func TestWrapForMimirWithoutTemplates(t *testing.T) { - out := wrapForMimir("route:\n receiver: blackhole", nil) - - assert.Contains(t, out, "alertmanager_config: |") - assert.NotContains(t, out, "template_files:") -} - -// --- Telegram tests --- - -func TestRenderConfig_GlobalTelegram_ParseModeHTML(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - cfg := &models.AlertingConfig{ - TelegramEnabled: true, - TelegramReceivers: []models.TelegramReceiver{ - {BotToken: "1234567890:AABBCCDDEEFFaabbccddeeff", ChatID: -100123456789}, - }, +func TestBuildTemplateFiles_DispatcherDefinesPerLanguage(t *testing.T) { + files, err := BuildTemplateFiles("https://app.example") + if err != nil { + t.Fatalf("err: %v", err) + } + disp := files["_dispatcher.tmpl"] + for _, name := range []string{ + `define "alert_en.html"`, `define "alert_en.txt"`, `define "alert_en.subject"`, + `define "alert_it.html"`, `define "alert_it.txt"`, `define "alert_it.subject"`, + } { + if !strings.Contains(disp, name) { + t.Errorf("dispatcher missing %q", name) + } } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", cfg) - require.NoError(t, err) - isValidYAML(t, out) - - assert.Contains(t, out, "telegram_configs") - assert.Contains(t, out, "bot_token: '1234567890:AABBCCDDEEFFaabbccddeeff'") - assert.Contains(t, out, "chat_id: -100123456789") - assert.Contains(t, out, "parse_mode: 'HTML'") - assert.Contains(t, out, `template "telegram.message"`) } -func TestRenderConfig_GlobalTelegram_Roundtrip(t *testing.T) { - host, port, user, pass, from, tls := smtpArgs() - original := &models.AlertingConfig{ - TelegramEnabled: true, - TelegramReceivers: []models.TelegramReceiver{ - {BotToken: "bot-token-abc", ChatID: 42}, - }, +func TestYamlEscape_SingleQuote(t *testing.T) { + if got := yamlEscape("a'b"); got != "a''b" { + t.Errorf("yamlEscape 
doubled single quote: got %q want %q", got, "a''b") } - out, err := RenderConfig(host, port, user, pass, from, tls, "", "", original) - require.NoError(t, err) - - parsed, err := ParseConfig(out) - require.NoError(t, err) - require.NotNil(t, parsed) - - assert.True(t, parsed.TelegramEnabled) - require.Len(t, parsed.TelegramReceivers, 1) - assert.Equal(t, "bot-token-abc", parsed.TelegramReceivers[0].BotToken) - assert.Equal(t, int64(42), parsed.TelegramReceivers[0].ChatID) } -func TestBuildTemplateFiles_IncludesTelegramTemplate(t *testing.T) { - firingWords := map[string]string{"en": "FIRING", "it": "ATTIV"} - resolvedWords := map[string]string{"en": "RESOLV", "it": "RISOLTO"} - - for _, lang := range []string{"en", "it"} { - files, err := BuildTemplateFiles(lang, "https://my.nethesis.it") - require.NoError(t, err, "lang=%s", lang) - - name := "telegram_" + lang + ".tmpl" - content, ok := files[name] - require.True(t, ok, "missing %s", name) - assert.Contains(t, content, `define "telegram.message"`, "lang=%s", lang) - assert.Contains(t, content, firingWords[lang], "lang=%s: firing state must be present", lang) - assert.Contains(t, content, resolvedWords[lang], "lang=%s: resolved state must be present", lang) +func TestYamlEscape_NewlineStripped(t *testing.T) { + if got := yamlEscape("a\nb\rc"); got != "abc" { + t.Errorf("yamlEscape stripped newlines: got %q want %q", got, "abc") } } diff --git a/backend/services/alerting/templates/telegram_en.tmpl b/backend/services/alerting/templates/telegram_en.tmpl index 2b669700..8b8192d7 100644 --- a/backend/services/alerting/templates/telegram_en.tmpl +++ b/backend/services/alerting/templates/telegram_en.tmpl @@ -1,4 +1,4 @@ -{{ define "telegram.message" -}} +{{ define "telegram_en.message" -}} {{ if .Alerts.Firing -}} {{ if eq (len .Alerts.Firing) 1 }}🔴 ALERT FIRING{{ else }}🚨 {{ len .Alerts.Firing }} ALERTS FIRING{{ end }} diff --git a/backend/services/alerting/templates/telegram_it.tmpl 
b/backend/services/alerting/templates/telegram_it.tmpl index cfe1cc43..a02497eb 100644 --- a/backend/services/alerting/templates/telegram_it.tmpl +++ b/backend/services/alerting/templates/telegram_it.tmpl @@ -1,4 +1,4 @@ -{{ define "telegram.message" -}} +{{ define "telegram_it.message" -}} {{ if .Alerts.Firing -}} {{ if eq (len .Alerts.Firing) 1 }}🔴 ALLARME ATTIVO{{ else }}🚨 {{ len .Alerts.Firing }} ALLARMI ATTIVI{{ end }} From b4addbdc4430d01f788a8079d6b69bb33baed388 Mon Sep 17 00:00:00 2001 From: Edoardo Spadoni Date: Tue, 12 May 2026 11:40:51 +0200 Subject: [PATCH 03/10] docs(alerts): update user docs and AGENTS.md for flat config model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The user-facing alerting docs and the AGENTS reference were stuck on the previous shape (global mail_addresses/webhook_receivers + per- severity + per-system overrides + per-tenant email_template_lang). Rewrite the 'Alerting Configuration' section in both en and it locales to describe the new layer model: - flat shape: enabled tri-state + email_recipients/webhook_recipients/ telegram_recipients with per-recipient severities[] - email recipients additionally carry language (en|it) and format (html|plain) - merge across the org hierarchy stays server-side; /alerts/config returns only the caller's own layer (no inherited / merged view) - additive-only contract; non-Owner explicit false on enabled.X is normalised to null at save time - RBAC: the Alerting Configuration tab is gated on read:alerts / manage:alerts (admin/super only); the alerts list stays on read:systems / manage:systems Refresh the Telegram step-3 example to use the new shape and update the email-notifications section to reflect per-recipient language and format. Realign AGENTS.md §3.5 with the same wording. 
--- AGENTS.md | 3 +- docs/docs/features/alerting.md | 202 +++++++++--------- .../current/features/alerting.md | 202 +++++++++--------- 3 files changed, 206 insertions(+), 201 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 00697b83..c46d6dd2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -147,9 +147,10 @@ Vue 3 + TypeScript, Vite, Tailwind, Pinia. Alerting UI present (`src/views/Alert Single-node Grafana Mimir with S3-compatible backend and multi-tenant Alertmanager. Containerfile, Makefile, docker-compose.yml + docker-compose.local.yml. `scripts/` contains Python helpers (`alert.py`, `alerting_config.py`) for manual testing. **Alerting integration**: -- Backend (`backend/services/alerting/`) renders Alertmanager YAML from `AlertingConfig` models and pushes via `POST /api/v1/alerts` per tenant. Email templates are Go `html/template`-embedded, en/it locales, firing + resolved variants. +- Backend (`backend/services/alerting/`) holds one `AlertingConfigLayer` per organization in `alert_config_layers` (flat recipient-based shape: `enabled`, `email_recipients[]`, `webhook_recipients[]`, `telegram_recipients[]`, each recipient carries its own `severities[]`; email also `language` + `format`). The effective per-tenant Mimir YAML is the server-side merge of every layer from Owner down to the tenant (union dedup, additive-only). `/alerts/config` only ever returns the caller's own layer — the merged view is internal and never leaves the backend. Templates are Go `html/template`-embedded, en/it locales, firing + resolved variants; both languages ship with every tenant push and the renderer picks per email recipient via per-language dispatchers (`alert_<lang>.html|txt|subject`). - Collect proxies systems to Alertmanager `alerts`/`silences` with `X-Scope-OrgID` from the authenticated system's org. 
- Alertmanager webhooks resolved alerts back to collect `/api/alert_history`, which persists them scoped by `organization_id` (column on `alert_history`, populated from the DB via `system_key` lookup — never trusted from the payload). +- RBAC: `/alerts/config*` is gated on a dedicated `alerts` resource (`read:alerts` for GET, `manage:alerts` for POST/DELETE) — admin/super only. The list/silence endpoints (`/alerts`, `/alerts/history`, `/systems/:id/alerts*`) stay on `read:systems`/`manage:systems`. ### 3.6 Proxy (`proxy/`) diff --git a/docs/docs/features/alerting.md b/docs/docs/features/alerting.md index bf405521..bc19e7a3 100644 --- a/docs/docs/features/alerting.md +++ b/docs/docs/features/alerting.md @@ -17,17 +17,21 @@ The Alerting feature provides a centralized view of all active alerts from your From the Alerting page you can: - View active alerts filtered by state, severity, or specific system -- Configure email, webhook, and Telegram notifications per organization -- Define per-severity and per-system notification overrides +- Configure email, webhook, and Telegram notifications for your own organization +- Tag each recipient with the severities it should receive (`critical`, `warning`, `info`, or all of them) +- Choose per-recipient language and body format for email notifications - Review alert history for each system ## Access -The Alerting page is accessible from the side menu at **Alerting**. Access requires the `read:systems` permission to view alerts and `manage:systems` permission to modify the alerting configuration. +The Alerting page is accessible from the side menu at **Alerting**. The two tabs are gated by different permissions: + +- **Alerts** tab — requires `read:systems` (admin, support, reader). `manage:systems` is required to create or remove silences. +- **Alerting Configuration** tab — requires `read:alerts` (admin/super only); `manage:alerts` to save or remove a configuration. 
## Organization selector -Since alerting is configured per-organization, the page includes an organization selector at the top. Choose the customer organization whose alerts and configuration you want to manage. The selector lists only non-owner organizations available within your hierarchy. +The organization selector at the top of the **Alerts** tab is used by the Owner role to filter the alerts list by tenant. The **Alerting Configuration** tab is always scoped to the caller's own organization — the page never displays another organization's configuration, regardless of the selector value. ## Active Alerts @@ -67,126 +71,122 @@ Click **Reset filters** to clear all active filters, or **Refresh** to manually ## Alerting Configuration -The **Configuration** tab lets you define how alerts are routed to recipients. The configuration is pushed to Alertmanager and persists until you change it again. +The **Alerting Configuration** tab lets you define who gets notified when an alert fires for your organization. The configuration you save here is your **layer** — the server merges it with the layers of every organization above you in the hierarchy (Owner → Distributor → Reseller → Customer) and pushes the resulting Alertmanager YAML to Mimir. + +### What you see vs. what Mimir sees + +What you see in this tab is always **your own layer** — nothing more. You never see the layers of organizations above you, and organizations below you never see yours. The merged effective configuration is computed server-side at render time and stays inside the backend; it never leaves your tenant boundary. -### Viewing the configuration +This isolation is deliberate: it keeps webhook URLs, Telegram tokens, and recipient email addresses confined to the organization that typed them. 
-The configuration is shown in two modes: +### Additive-only contract -- **Structured view**: organized sections showing current mail and webhook settings, per-severity overrides, and per-system overrides in a readable format -- **Raw YAML view**: the complete Alertmanager configuration in YAML, with sensitive fields (SMTP credentials, webhook tokens) automatically redacted. Click **Copy YAML** to copy the full configuration to the clipboard. +Layers are additive. A descendant can **add** recipients on top of what their ancestor configured, but cannot remove or disable channels that an ancestor enabled. For example: -If no configuration exists yet, the page shows a "No configuration found" message with an **Edit configuration** button to create the initial setup. +- Owner enables `email` globally and adds `noc@msp.example` → every tenant below inherits both. +- A Reseller can add `noc@reseller.example` on top — both addresses now receive matching alerts. +- The same Reseller cannot turn `email` off for their own subtree. Only the Owner can globally disable a channel; for non-Owner roles, an explicit `false` on a toggle is normalised to `null` ("no opinion") on save. 
-### Configuration fields +### Configuration shape + +The layer is a flat JSON object with three channel toggles and three recipient lists: + +```json +{ + "enabled": { "email": true, "webhook": null, "telegram": null }, + "email_recipients": [ + { "address": "noc@org.example", "severities": ["critical","warning"], "language": "it", "format": "html" } + ], + "webhook_recipients": [ + { "name": "ops-slack", "url": "https://hooks.slack.com/services/T000/B000/XXX", "severities": ["critical"] } + ], + "telegram_recipients": [ + { "bot_token": "123456789:ABCDEFabcdef...", "chat_id": -1001234567890, "severities": [] } + ] +} +``` -The configuration is edited as a JSON object with the following fields: +#### Channel toggles (`enabled`) -#### Global settings +Each channel is tri-state: + +| Value | Meaning | +|-------|---------| +| `true` | Channel enabled at this layer | +| `false` | Channel disabled at this layer (Owner only; non-Owner `false` is normalised to `null` on save) | +| `null` | No opinion at this layer; the effective state inherits from any ancestor that took a position. If no layer enables the channel, it stays off. 
| + +#### Email recipients (`email_recipients`) | Field | Type | Description | |-------|------|-------------| -| `mail_enabled` | boolean | Enable or disable email notifications globally | -| `mail_addresses` | string[] | List of email addresses that receive all alerts | -| `webhook_enabled` | boolean | Enable or disable webhook notifications globally | -| `webhook_receivers` | object[] | List of webhook endpoints, each with `name` and `url` | -| `telegram_enabled` | boolean | Enable or disable Telegram notifications globally | -| `telegram_receivers` | object[] | List of Telegram receivers, each with `bot_token` and `chat_id` | -| `email_template_lang` | string | Language for email templates: `en` or `it` (default: `en`) | +| `address` | string | Email address that receives the notification | +| `severities` | string[] | Subset of `["critical","warning","info"]`. Empty array means "all severities" | +| `language` | string | `en` or `it`. Controls subject + body language of the rendered template | +| `format` | string | `html` (default, multipart with HTML primary + text fallback) or `plain` (text-only body) | -#### Per-severity overrides +#### Webhook recipients (`webhook_recipients`) -The `severities` field lets you customize notification behavior for each severity level. This is useful when you want critical alerts to reach a different set of recipients than informational alerts. +| Field | Type | Description | +|-------|------|-------------| +| `name` | string | Descriptive label for the webhook target (shown in the UI) | +| `url` | string | HTTPS/HTTP endpoint. 
Validated server-side: loopback, RFC1918, RFC6598 CGNAT, link-local, multicast, and cloud-metadata destinations are rejected | +| `severities` | string[] | Same semantics as email | -Each severity override includes: +#### Telegram recipients (`telegram_recipients`) -- `severity`: one of `critical`, `warning`, `info` -- `mail_enabled` (optional): override the global email setting for this severity -- `webhook_enabled` (optional): override the global webhook setting -- `telegram_enabled` (optional): override the global Telegram setting -- `mail_addresses` (optional): list of email addresses for this severity -- `webhook_receivers` (optional): list of webhook receivers for this severity -- `telegram_receivers` (optional): list of Telegram receivers for this severity +| Field | Type | Description | +|-------|------|-------------| +| `bot_token` | string | Token obtained from `@BotFather` | +| `chat_id` | integer | Numeric chat id (positive for users, negative for groups/channels) | +| `severities` | string[] | Same semantics as email | -If an override's address list is empty, the global addresses are used as fallback. +### Severity scoping -#### Per-system overrides +The `severities` array on each recipient controls which severities it receives: -The `systems` field lets you customize notification behavior for specific systems. Useful when different systems should alert different teams. +- **Empty (`[]`)** — recipient receives **every** severity. This is the default for a "catch-all" address. +- **Subset (e.g. `["critical"]`)** — recipient receives **only** those severity levels. -Each system override includes: +Mimir Alertmanager fans out one receiver per severity (`severity-critical-receiver`, `severity-warning-receiver`, `severity-info-receiver`); a recipient with `severities=[]` lands on all three. 
-- `system_key`: the identifier of the target system -- `mail_enabled` (optional): override for this system -- `webhook_enabled` (optional): override for this system -- `telegram_enabled` (optional): override for this system -- `mail_addresses` (optional): additional recipients for alerts from this system -- `webhook_receivers` (optional): additional webhooks for alerts from this system -- `telegram_receivers` (optional): additional Telegram receivers for alerts from this system +### Merge across the hierarchy -### Override priority +When the server renders the effective configuration for a tenant, it walks the chain from Owner down to that tenant and unions the layers using these rules: -When routing an alert, the priority is: +- **Channel toggles** — logical OR: if any layer in the chain enables a channel, the channel is on for the tenant. +- **Recipients** — union with stable dedup. Dedup keys are `address` for email, `url` for webhook, `(bot_token, chat_id)` for Telegram. The first occurrence (closer to Owner) wins for `language` and `format` collisions. +- **Per-recipient severities** — union; if any contributing copy has `severities=[]` (all), the merged copy widens back to `[]` (the broader scope always wins). -1. **Per-system override** (most specific) -2. **Per-severity override** -3. **Global settings** (fallback) +### Example: customer-level layer that adds notifications -### Example configuration +Suppose the Owner already enables email with `noc@msp.example` for all severities. 
A Customer adds the following layer for their own organization: ```json { - "mail_enabled": true, - "webhook_enabled": false, - "telegram_enabled": true, - "mail_addresses": ["ops@example.com"], - "webhook_receivers": [], - "telegram_receivers": [ - { "bot_token": "123456789:ABCDEFabcdef...", "chat_id": -1001234567890 } + "enabled": { "email": null, "webhook": null, "telegram": null }, + "email_recipients": [ + { "address": "oncall@customer.example", "severities": ["critical"], "language": "en", "format": "plain" }, + { "address": "manager@customer.example", "severities": [], "language": "it", "format": "html" } ], - "email_template_lang": "it", - "severities": [ - { - "severity": "critical", - "mail_addresses": ["oncall@example.com", "ops@example.com"] - }, - { - "severity": "info", - "mail_enabled": false, - "telegram_enabled": false - } - ], - "systems": [ - { - "system_key": "NETH-ABCD-1234", - "mail_addresses": ["platform-team@example.com"] - } + "webhook_recipients": [], + "telegram_recipients": [ + { "bot_token": "123456789:ABCDEFabcdef...", "chat_id": -1001234567890, "severities": ["critical","warning"] } ] } ``` -In this example: - -- All warning alerts go to `ops@example.com` and the configured Telegram chat -- Critical alerts go to both `oncall@example.com` and `ops@example.com` -- Info alerts are suppressed (email and Telegram disabled) -- Alerts from system `NETH-ABCD-1234` also go to `platform-team@example.com` -- Email templates are rendered in Italian - -### Editing the configuration - -1. Click **Edit configuration** in the structured view -2. Modify the JSON in the editor -3. Click **Save configuration** — invalid JSON is rejected with a validation error -4. On success, the configuration view refreshes and a confirmation notification appears - -To cancel without saving, click **Cancel**. +What Mimir delivers for this customer: -### Disabling all alerts +- `oncall@customer.example` receives **only critical** alerts as plain-text English emails. 
+- `manager@customer.example` receives **all** alerts as HTML Italian emails. +- The Owner's `noc@msp.example` still receives all alerts (inherited). +- The Telegram chat is scoped to **critical and warning** alerts, but nothing is delivered to it yet: the Owner's `telegram` toggle is not on, so this Customer's `telegram_recipients` won't fire until an ancestor enables Telegram (only the Owner can enable a channel globally). -At the bottom of the configuration page you can find a **Disable all alerts** action. This replaces the current configuration with a "blackhole" routing that silences all notifications for the organization, without losing your previous configuration permanently — you can re-create it by editing the configuration again. +### Saving and removing a layer -When clicked, a confirmation step appears before the action is executed. +- **Save configuration** persists your layer and triggers a re-render + push to Mimir for every tenant in your hierarchy. The response reports `affected_tenants` and `propagated_to`; any per-tenant push failures appear as warnings without rolling back the save (Mimir can be reconciled by saving again). +- **Remove this configuration** deletes your layer entirely. Your contributions disappear from the merged config; ancestor layers (Owner / Distributor / Reseller) remain intact and continue to fire. To fully silence a tenant, every layer in its chain must drop its contribution. 
## System-level alerts @@ -222,11 +222,11 @@ When email notifications are enabled, alerts are delivered from Alertmanager usi - The alert name and severity - The system key and service label (if present) -- A localized summary and description (based on the configured `email_template_lang`) +- A localized summary and description (in the language picked per recipient) - The firing or resolution timestamp - A **View system** button linking directly to the system's detail page -Templates are available in **English** and **Italian**, selected via the `email_template_lang` configuration field. +Templates are available in **English** and **Italian**. The language is picked **per recipient** via the `language` field on each `email_recipients[]` entry — different recipients in the same organization can receive different language renderings. Likewise, each recipient picks its own `format`: `html` for a multipart body with HTML primary, `plain` for a text-only body (useful for ticketing systems or mail-to-chat bridges). ## Telegram notifications @@ -265,28 +265,30 @@ The `chat_id` is the numeric identifier of the destination (a private user, a gr 2. Send a message in the group so Alertmanager has something to read 3. Call `getUpdates` as above — the `chat_id` for groups and channels is a **negative** number (e.g. `-1001234567890`). Eventually, you could find the `chat_id` also in the URL of the conversation with the bot, in the format `https://web.telegram.org/z/#-` (note the negative sign for groups/channels) -### Step 3 — Configure the alerting JSON +### Step 3 — Add the Telegram recipient to your layer -Add `telegram_enabled` and `telegram_receivers` to your alerting configuration. Each entry in `telegram_receivers` requires: +Add an entry to `telegram_recipients`; turning the channel on with `enabled.telegram = true` is required only at the Owner level (the channel propagates additively downstream). 
| Field | Type | Description | |-------|------|-------------| | `bot_token` | string | The token provided by BotFather | | `chat_id` | integer | The numeric Telegram chat ID (positive for users, negative for groups/channels) | +| `severities` | string[] | Subset of `["critical","warning","info"]`. Empty array = all severities | -Example: +Example (Owner layer enabling Telegram for the whole tree): ```json { - "mail_enabled": false, - "telegram_enabled": true, - "telegram_receivers": [ - { "bot_token": "123456789:ABCDEFabcdef...", "chat_id": -1001234567890 } + "enabled": { "email": null, "webhook": null, "telegram": true }, + "email_recipients": [], + "webhook_recipients": [], + "telegram_recipients": [ + { "bot_token": "123456789:ABCDEFabcdef...", "chat_id": -1001234567890, "severities": [] } ] } ``` -You can define multiple receivers to send alerts to multiple bots or chats simultaneously. +You can define multiple recipients to send alerts to multiple bots or chats simultaneously. Telegram messages are currently always rendered in English. 
## Related topics diff --git a/docs/i18n/it/docusaurus-plugin-content-docs/current/features/alerting.md b/docs/i18n/it/docusaurus-plugin-content-docs/current/features/alerting.md index c3a81a8a..3c2f3612 100644 --- a/docs/i18n/it/docusaurus-plugin-content-docs/current/features/alerting.md +++ b/docs/i18n/it/docusaurus-plugin-content-docs/current/features/alerting.md @@ -17,17 +17,21 @@ La funzionalità di Alerting fornisce una vista centralizzata di tutti gli allar Dalla pagina Alerting puoi: - Visualizzare gli allarmi attivi filtrati per stato, severità o sistema specifico -- Configurare notifiche email, webhook e Telegram per ogni organizzazione -- Definire regole di notifica personalizzate per severità e per sistema +- Configurare notifiche email, webhook e Telegram per la tua organizzazione +- Marcare ogni destinatario con le severità da ricevere (`critical`, `warning`, `info`, oppure tutte) +- Scegliere lingua e formato del corpo email per ogni destinatario - Consultare lo storico degli allarmi di ciascun sistema ## Accesso -La pagina Alerting è accessibile dal menu laterale alla voce **Alerting**. L'accesso richiede il permesso `read:systems` per visualizzare gli allarmi e `manage:systems` per modificare la configurazione. +La pagina Alerting è accessibile dal menu laterale alla voce **Alerting**. I due tab hanno permessi diversi: + +- Tab **Allarmi** — richiede `read:systems` (admin, support, reader). Per creare o rimuovere silences serve `manage:systems`. +- Tab **Configurazione alerting** — richiede `read:alerts` (solo admin/super); `manage:alerts` per salvare o rimuovere una configurazione. ## Selezione organizzazione -Poiché l'alerting si configura per organizzazione, la pagina include un selettore in alto. Scegli l'organizzazione cliente di cui vuoi gestire allarmi e configurazione. Il selettore elenca solo le organizzazioni non-owner disponibili nella tua gerarchia. 
+Il selettore in cima al tab **Allarmi** è usato dall'Owner per filtrare la lista degli allarmi per tenant. Il tab **Configurazione alerting** invece opera **sempre** sulla propria organizzazione — non mostra mai la configurazione di un'altra organizzazione, indipendentemente dal valore del selettore. ## Allarmi attivi @@ -67,126 +71,122 @@ Clicca **Reset filtri** per rimuovere tutti i filtri attivi, oppure **Aggiorna** ## Configurazione alerting -Il tab **Configurazione** ti permette di definire come vengono instradati gli allarmi ai destinatari. La configurazione viene pushata ad Alertmanager e persiste finché non la cambi. +Il tab **Configurazione alerting** ti permette di definire chi viene notificato quando un allarme scatta nella tua organizzazione. Quello che salvi qui è il tuo **layer** — il server lo unisce ai layer di tutte le organizzazioni sopra di te nella gerarchia (Owner → Distributor → Reseller → Customer) e pusha la YAML Alertmanager risultante a Mimir. + +### Cosa vedi tu vs. cosa vede Mimir + +Quello che vedi in questo tab è sempre **e solo** il tuo layer. Non vedi mai i layer delle organizzazioni sopra di te, e le organizzazioni sotto di te non vedono mai il tuo. La configurazione effettiva mergiata viene calcolata server-side al momento del render e non lascia mai il backend; non oltrepassa mai il confine del tenant. -### Visualizzare la configurazione +Questo isolamento è intenzionale: confina URL dei webhook, token Telegram e indirizzi email destinatari all'organizzazione che li ha digitati. -La configurazione viene mostrata in due modalità: +### Contratto additivo -- **Vista strutturata**: sezioni organizzate che mostrano le impostazioni mail e webhook correnti, le override per severità e per sistema in formato leggibile -- **Vista YAML raw**: la configurazione Alertmanager completa in YAML, con i campi sensibili (credenziali SMTP, token webhook) automaticamente offuscati. 
Clicca **Copia YAML** per copiare l'intera configurazione negli appunti. +I layer sono additivi. Un discendente può **aggiungere** destinatari su quanto configurato dall'antenato, ma non può rimuovere o disabilitare canali che un antenato ha abilitato. Per esempio: -Se non esiste ancora alcuna configurazione, la pagina mostra un messaggio "Nessuna configurazione trovata" con un pulsante **Modifica configurazione** per creare l'impostazione iniziale. +- L'Owner abilita `email` globalmente e aggiunge `noc@msp.example` → ogni tenant sotto eredita entrambi. +- Un Reseller può aggiungere `noc@reseller.example` sopra — ora entrambi gli indirizzi ricevono gli allarmi che li riguardano. +- Lo stesso Reseller **non può** spegnere `email` per il proprio sottoalbero. Solo l'Owner può disabilitare globalmente un canale; per i ruoli non-Owner, un `false` esplicito su un toggle viene normalizzato a `null` ("nessuna opinione") al salvataggio. -### Campi di configurazione +### Forma della configurazione + +Il layer è un oggetto JSON flat con tre toggle di canale e tre liste di destinatari: + +```json +{ + "enabled": { "email": true, "webhook": null, "telegram": null }, + "email_recipients": [ + { "address": "noc@org.example", "severities": ["critical","warning"], "language": "it", "format": "html" } + ], + "webhook_recipients": [ + { "name": "ops-slack", "url": "https://hooks.slack.com/services/T000/B000/XXX", "severities": ["critical"] } + ], + "telegram_recipients": [ + { "bot_token": "123456789:ABCDEFabcdef...", "chat_id": -1001234567890, "severities": [] } + ] +} +``` -La configurazione viene modificata come oggetto JSON con i seguenti campi: +#### Toggle dei canali (`enabled`) -#### Impostazioni globali +Ogni canale è tri-stato: + +| Valore | Significato | +|--------|-------------| +| `true` | Canale abilitato a questo layer | +| `false` | Canale disabilitato a questo layer (solo Owner; per i non-Owner `false` viene normalizzato a `null` al salvataggio) | +| `null` | Nessuna 
opinione a questo layer; lo stato effettivo eredita da un eventuale antenato che ha preso posizione. Se nessun layer abilita il canale, resta off. | + +#### Destinatari email (`email_recipients`) | Campo | Tipo | Descrizione | |-------|------|-------------| -| `mail_enabled` | boolean | Abilita o disabilita le notifiche email globalmente | -| `mail_addresses` | string[] | Lista di indirizzi email che ricevono tutti gli allarmi | -| `webhook_enabled` | boolean | Abilita o disabilita le notifiche webhook globalmente | -| `webhook_receivers` | object[] | Lista di endpoint webhook, ciascuno con `name` e `url` | -| `telegram_enabled` | boolean | Abilita o disabilita le notifiche Telegram globalmente | -| `telegram_receivers` | object[] | Lista di receiver Telegram, ciascuno con `bot_token` e `chat_id` | -| `email_template_lang` | string | Lingua dei template email: `en` o `it` (default: `en`) | +| `address` | string | Indirizzo email che riceve la notifica | +| `severities` | string[] | Sottoinsieme di `["critical","warning","info"]`. Array vuoto = "tutte le severità" | +| `language` | string | `en` o `it`. Determina lingua del subject e del body del template renderizzato | +| `format` | string | `html` (default, multipart HTML primario + text fallback) o `plain` (body solo testo) | -#### Override per severità +#### Destinatari webhook (`webhook_recipients`) -Il campo `severities` ti permette di personalizzare il comportamento delle notifiche per ciascun livello di severità. Utile quando vuoi che gli allarmi critici raggiungano destinatari diversi rispetto a quelli informativi. +| Campo | Tipo | Descrizione | +|-------|------|-------------| +| `name` | string | Etichetta descrittiva del target (mostrata nella UI) | +| `url` | string | Endpoint HTTPS/HTTP. 
Validato lato server: loopback, RFC1918, RFC6598 CGNAT, link-local, multicast e metadata cloud sono rifiutati | +| `severities` | string[] | Stessa semantica delle email | -Ogni override include: +#### Destinatari Telegram (`telegram_recipients`) -- `severity`: uno tra `critical`, `warning`, `info` -- `mail_enabled` (opzionale): override dell'impostazione email globale per questa severità -- `webhook_enabled` (opzionale): override dell'impostazione webhook globale -- `telegram_enabled` (opzionale): override dell'impostazione Telegram globale -- `mail_addresses` (opzionale): lista di indirizzi email per questa severità -- `webhook_receivers` (opzionale): lista di webhook receiver per questa severità -- `telegram_receivers` (opzionale): lista di receiver Telegram per questa severità +| Campo | Tipo | Descrizione | +|-------|------|-------------| +| `bot_token` | string | Token ottenuto da `@BotFather` | +| `chat_id` | integer | Id numerico della chat (positivo per utenti, negativo per gruppi/canali) | +| `severities` | string[] | Stessa semantica delle email | -Se la lista degli indirizzi di un override è vuota, vengono usati gli indirizzi globali come fallback. +### Scope per severità -#### Override per sistema +L'array `severities` di ciascun destinatario controlla quali severità riceve: -Il campo `systems` ti permette di personalizzare il comportamento per sistemi specifici. Utile quando sistemi diversi devono notificare team diversi. +- **Vuoto (`[]`)** — il destinatario riceve **ogni** severità. È il default per un indirizzo "catch-all". +- **Sottoinsieme (es. `["critical"]`)** — il destinatario riceve **solo** quelle severità. -Ogni override include: +Mimir Alertmanager espande un receiver per severità (`severity-critical-receiver`, `severity-warning-receiver`, `severity-info-receiver`); un destinatario con `severities=[]` finisce in tutti e tre. 
-- `system_key`: l'identificatore del sistema target -- `mail_enabled` (opzionale): override per questo sistema -- `webhook_enabled` (opzionale): override per questo sistema -- `telegram_enabled` (opzionale): override per questo sistema -- `mail_addresses` (opzionale): destinatari aggiuntivi per gli allarmi di questo sistema -- `webhook_receivers` (opzionale): webhook aggiuntivi per gli allarmi di questo sistema -- `telegram_receivers` (opzionale): receiver Telegram aggiuntivi per gli allarmi di questo sistema +### Merge nella gerarchia -### Priorità delle override +Quando il server renderizza la configurazione effettiva per un tenant, percorre la catena da Owner fino al tenant e unisce i layer con queste regole: -Quando si instrada un allarme, la priorità è: +- **Toggle dei canali** — OR logico: se un qualunque layer della catena abilita un canale, il canale è on per il tenant. +- **Destinatari** — union con dedup stabile. Chiavi di dedup: `address` per email, `url` per webhook, `(bot_token, chat_id)` per Telegram. In caso di collisione di `language` o `format`, vince la prima occorrenza (più vicina all'Owner). +- **`severities` per destinatario** — union; se una qualunque copia che contribuisce ha `severities=[]` (tutte), la copia mergiata si allarga a `[]` (lo scope più largo vince sempre). -1. **Override per sistema** (la più specifica) -2. **Override per severità** -3. **Impostazioni globali** (fallback) +### Esempio: layer di livello customer che aggiunge notifiche -### Esempio di configurazione +Supponi che l'Owner abbia già abilitato l'email con `noc@msp.example` per tutte le severità. 
Un Customer aggiunge questo layer per la propria organizzazione: ```json { - "mail_enabled": true, - "webhook_enabled": false, - "telegram_enabled": true, - "mail_addresses": ["ops@example.com"], - "webhook_receivers": [], - "telegram_receivers": [ - { "bot_token": "123456789:ABCDEFabcdef...", "chat_id": -1001234567890 } + "enabled": { "email": null, "webhook": null, "telegram": null }, + "email_recipients": [ + { "address": "oncall@customer.example", "severities": ["critical"], "language": "en", "format": "plain" }, + { "address": "manager@customer.example", "severities": [], "language": "it", "format": "html" } ], - "email_template_lang": "it", - "severities": [ - { - "severity": "critical", - "mail_addresses": ["oncall@example.com", "ops@example.com"] - }, - { - "severity": "info", - "mail_enabled": false, - "telegram_enabled": false - } - ], - "systems": [ - { - "system_key": "NETH-ABCD-1234", - "mail_addresses": ["platform-team@example.com"] - } + "webhook_recipients": [], + "telegram_recipients": [ + { "bot_token": "123456789:ABCDEFabcdef...", "chat_id": -1001234567890, "severities": ["critical","warning"] } ] } ``` -In questo esempio: - -- Tutti gli allarmi warning vanno a `ops@example.com` e alla chat Telegram configurata -- Gli allarmi critical vanno sia a `oncall@example.com` che a `ops@example.com` -- Gli allarmi info sono soppressi (email e Telegram disabilitati) -- Gli allarmi dal sistema `NETH-ABCD-1234` vanno anche a `platform-team@example.com` -- I template email vengono renderizzati in italiano - -### Modificare la configurazione - -1. Clicca **Modifica configurazione** nella vista strutturata -2. Modifica il JSON nell'editor -3. Clicca **Salva configurazione** — il JSON non valido viene rifiutato con un errore di validazione -4. Al successo, la vista si aggiorna e appare una notifica di conferma - -Per annullare senza salvare, clicca **Annulla**. 
+Cosa consegna Mimir per questo customer: -### Disabilitare tutti gli allarmi +- `oncall@customer.example` riceve **solo** gli allarmi critical come email in inglese plain text. +- `manager@customer.example` riceve **tutti** gli allarmi come email HTML in italiano. +- `noc@msp.example` dell'Owner continua a ricevere tutti gli allarmi (ereditato). +- La chat Telegram è configurata per gli allarmi **critical e warning**, ma per ora non riceve nulla: il toggle `telegram` dell'Owner non è on, quindi questi `telegram_recipients` non scattano finché un antenato non abilita il canale Telegram (solo l'Owner può abilitare globalmente un canale). -In fondo alla pagina di configurazione trovi l'azione **Disabilita tutti gli allarmi**. Questa sostituisce la configurazione corrente con un routing "blackhole" che silenzia tutte le notifiche per l'organizzazione, senza perdere permanentemente la configurazione precedente — puoi ricrearla modificando di nuovo la configurazione. +### Salvare e rimuovere un layer -Cliccando appare un passaggio di conferma prima dell'esecuzione. +- **Salva configurazione** persiste il tuo layer e triggera un re-render + push a Mimir per ogni tenant della tua gerarchia. La response riporta `affected_tenants` e `propagated_to`; eventuali fallimenti per tenant compaiono come warning senza rollback del salvataggio (Mimir può essere riconciliato salvando di nuovo). +- **Rimuovi questa configurazione** cancella il tuo layer per intero. Il tuo contributo sparisce dalla config mergiata; i layer degli antenati (Owner / Distributor / Reseller) restano intatti e continuano a scattare. Per silenziare completamente un tenant, ogni layer della sua catena deve eliminare il proprio contributo. 
## Allarmi a livello di sistema @@ -212,11 +212,11 @@ Quando le notifiche email sono abilitate, gli allarmi vengono consegnati da Aler - Il nome e la severità dell'allarme - La system key e l'eventuale label service -- Riepilogo e descrizione localizzati (in base al `email_template_lang` configurato) +- Riepilogo e descrizione localizzati (nella lingua scelta per ciascun destinatario) - Il timestamp di firing o risoluzione - Un pulsante **Visualizza sistema** che linka direttamente alla pagina di dettaglio del sistema -I template sono disponibili in **inglese** e **italiano**, selezionati tramite il campo `email_template_lang`. +I template sono disponibili in **inglese** e **italiano**. La lingua viene scelta **per destinatario** tramite il campo `language` di ogni voce in `email_recipients[]` — destinatari diversi della stessa organizzazione possono ricevere rendering in lingue diverse. Allo stesso modo, ogni destinatario sceglie il proprio `format`: `html` per un corpo multipart con HTML primario, `plain` per un corpo solo testo (utile per sistemi di ticketing o bridge mail-to-chat). ## Notifiche Telegram @@ -255,28 +255,30 @@ Il `chat_id` è l'identificatore numerico della destinazione (un utente privato, 2. Invia un messaggio nel gruppo in modo che Alertmanager abbia qualcosa da leggere 3. Chiama `getUpdates` come sopra — il `chat_id` per gruppi e canali è un numero **negativo** (es. `-1001234567890`). Eventualmente, potresti trovare il `chat_id` anche nell'URL della conversazione con il bot, nel formato `https://web.telegram.org/z/#-` (nota il segno negativo per gruppi/canali) -### Passaggio 3 — Configura il JSON di alerting +### Passaggio 3 — Aggiungi il destinatario Telegram al tuo layer -Aggiungi `telegram_enabled` e `telegram_receivers` alla configurazione di alerting. 
Ogni voce in `telegram_receivers` richiede: +Aggiungi una voce a `telegram_recipients`; abilitare il canale con `enabled.telegram = true` è necessario solo a livello Owner (il canale si propaga additivamente verso il basso). | Campo | Tipo | Descrizione | |-------|------|-------------| | `bot_token` | string | Il token fornito da BotFather | | `chat_id` | integer | L'ID numerico della chat Telegram (positivo per utenti, negativo per gruppi/canali) | +| `severities` | string[] | Sottoinsieme di `["critical","warning","info"]`. Array vuoto = tutte le severità | -Esempio: +Esempio (layer Owner che abilita Telegram per tutto l'albero): ```json { - "mail_enabled": false, - "telegram_enabled": true, - "telegram_receivers": [ - { "bot_token": "123456789:ABCDEFabcdef...", "chat_id": -1001234567890 } + "enabled": { "email": null, "webhook": null, "telegram": true }, + "email_recipients": [], + "webhook_recipients": [], + "telegram_recipients": [ + { "bot_token": "123456789:ABCDEFabcdef...", "chat_id": -1001234567890, "severities": [] } ] } ``` -È possibile definire più receiver per inviare gli allarmi a più bot o chat contemporaneamente. +È possibile definire più destinatari per inviare gli allarmi a più bot o chat contemporaneamente. I messaggi Telegram sono attualmente sempre renderizzati in inglese. ## Argomenti correlati From 1147c0f573f44843c2db5be0069f48bbbc068fd4 Mon Sep 17 00:00:00 2001 From: Edoardo Spadoni Date: Tue, 12 May 2026 16:31:50 +0200 Subject: [PATCH 04/10] refactor(alerts): carry system_type as label, drop system block from GET /alerts Stamp system_type at ingest (collect) alongside the other system_* labels and drop the per-request DB lookup that enriched each alert with a separate system object. Saves a SELECT on every GET /alerts and removes a redundant field the frontend never read. 
--- backend/methods/alerting.go | 99 ++---------------------------- backend/openapi.yaml | 84 +++++++++---------------- collect/alerting/mimir.go | 6 ++ collect/alerting/mimir_test.go | 4 ++ collect/cron/linkfailed_monitor.go | 4 ++ 5 files changed, 50 insertions(+), 147 deletions(-) diff --git a/backend/methods/alerting.go b/backend/methods/alerting.go index e7fa0dde..9b0d44cd 100644 --- a/backend/methods/alerting.go +++ b/backend/methods/alerting.go @@ -7,7 +7,6 @@ package methods import ( "context" - "database/sql" "encoding/json" "errors" "fmt" @@ -23,9 +22,7 @@ import ( "time" "github.com/gin-gonic/gin" - "github.com/lib/pq" - "github.com/nethesis/my/backend/database" "github.com/nethesis/my/backend/entities" "github.com/nethesis/my/backend/helpers" "github.com/nethesis/my/backend/logger" @@ -644,11 +641,9 @@ func GetAlertActivity(c *gin.Context) { // failures (timeout, 5xx, parse error) are collected as warnings; the rest of // the result is returned. // -// Each alert is enriched with a top-level `system` object containing the -// owning system's `name` and `type` (product, e.g. "nsec") looked up in the -// local `systems` table by (org_id, system_key). This saves the frontend a -// per-row round-trip to /systems just to render the table cell. If the lookup -// fails or the alert has no system_key, the field is simply omitted. +// System identity (id, key, name, type) is carried as labels on the alert +// itself (system_id, system_key, system_name, system_type), stamped at ingest +// time by collect. No per-request DB lookup is needed here. func fanOutMimirAlerts(parent context.Context, orgIDs []string) ([]map[string]interface{}, []string) { var ( all []map[string]interface{} @@ -693,8 +688,6 @@ func fanOutMimirAlerts(parent context.Context, orgIDs []string) ([]map[string]in return } - enrichAlertsWithSystemInfo(orgID, alerts) - mu.Lock() all = append(all, alerts...) 
mu.Unlock() @@ -704,77 +697,6 @@ func fanOutMimirAlerts(parent context.Context, orgIDs []string) ([]map[string]in return all, warnings } -// enrichAlertsWithSystemInfo decorates each alert with a `system` object -// (id, name, type, system_key) by issuing a single SELECT against the local -// systems table for the distinct system_key values it sees. Best-effort: -// a DB hiccup or an unmatched key just leaves the system field unset on -// that alert. The `id` is the local DB UUID — what the frontend uses to -// build the system-detail link (/systems/:id). -func enrichAlertsWithSystemInfo(orgID string, alerts []map[string]interface{}) { - if len(alerts) == 0 { - return - } - keys := make(map[string]struct{}, len(alerts)) - for _, a := range alerts { - labels, _ := a["labels"].(map[string]interface{}) - if k, ok := labels["system_key"].(string); ok && k != "" { - keys[k] = struct{}{} - } - } - if len(keys) == 0 { - return - } - keyList := make([]string, 0, len(keys)) - for k := range keys { - keyList = append(keyList, k) - } - rows, err := database.DB.Query( - `SELECT id, system_key, name, type FROM systems WHERE organization_id = $1 AND system_key = ANY($2)`, - orgID, pq.Array(keyList), - ) - if err != nil { - logger.Warn().Err(err).Str("org_id", orgID).Msg("failed to lookup system info for alert enrichment") - return - } - defer func() { _ = rows.Close() }() - - type sysInfo struct { - ID string - Name string - Type sql.NullString - } - infoBy := make(map[string]sysInfo, len(keyList)) - for rows.Next() { - var id, k, n string - var t sql.NullString - if err := rows.Scan(&id, &k, &n, &t); err != nil { - continue - } - infoBy[k] = sysInfo{ID: id, Name: n, Type: t} - } - - for _, a := range alerts { - labels, _ := a["labels"].(map[string]interface{}) - k, _ := labels["system_key"].(string) - if k == "" { - continue - } - info, ok := infoBy[k] - if !ok { - continue - } - s := map[string]interface{}{ - "id": info.ID, - "system_key": k, - "name": info.Name, - } - if 
info.Type.Valid { - s["type"] = info.Type.String - } - a["system"] = s - } -} - // GetAlertingConfig handles GET /api/alerts/config — returns the CALLER's // own alerting layer. Returns an empty layer (with audit metadata absent) // when the caller has never saved one; the frontend renders the empty-state @@ -1201,22 +1123,13 @@ func GetSystemAlerts(c *gin.Context) { return } - // Filter alerts by this system's key and decorate each with the same - // `system` enrichment shape as /api/alerts. The system was already loaded - // at the start of the handler so no extra query is needed. - sysInfo := map[string]interface{}{ - "id": system.ID, - "system_key": system.SystemKey, - "name": system.Name, - } - if system.Type != nil { - sysInfo["type"] = *system.Type - } + // Filter alerts by this system's key. System identity is already carried + // as labels on the alert (system_id, system_key, system_name, system_type) + // stamped by collect at ingest time. filtered := make([]map[string]interface{}, 0, len(alerts)) for _, alert := range alerts { labels, _ := alert["labels"].(map[string]interface{}) if sk, ok := labels["system_key"].(string); ok && sk == system.SystemKey { - alert["system"] = sysInfo filtered = append(filtered, alert) } } diff --git a/backend/openapi.yaml b/backend/openapi.yaml index 91bcb48b..90f58de3 100644 --- a/backend/openapi.yaml +++ b/backend/openapi.yaml @@ -375,10 +375,11 @@ components: ActiveAlert: type: object description: | - Active alert returned by Mimir Alertmanager, enriched with local-DB - system info before being sent to the client. The fan-out includes + Active alert returned by Mimir Alertmanager. The fan-out includes silenced and inhibited alerts (not just `active` ones), so callers can - render the muted/suppressed state. + render the muted/suppressed state. System identity (id, key, name, + type) is carried as labels (`system_id`, `system_key`, `system_name`, + `system_type`), stamped server-side at ingest time. 
properties: fingerprint: type: string @@ -388,11 +389,20 @@ components: type: object additionalProperties: type: string - description: Alert labels (alertname, severity, system_key, ...) + description: | + Alert labels. Always includes server-stamped identity labels + (`system_id`, `system_key`, `system_name`, `system_type`, + `system_fqdn`, `system_ipv4`, `organization_name`, + `organization_vat`, `organization_type`) plus the alert's own + labels (`alertname`, `severity`, ...). Use `system_id` to link + to the system detail page (/systems/:id). example: alertname: "DiskFilling" severity: "warning" + system_id: "35aa0d84-08c1-4013-b1fd-d5f6ef3e0541" system_key: "NETH-FBB2-1A6E-7CAD-44A4-A772-B3EE-F0F6-F371" + system_name: "cust1-sys-A" + system_type: "ns8" annotations: type: object additionalProperties: @@ -421,34 +431,6 @@ components: generatorURL: type: string description: Source URL of the alert (set by the pushing agent), if any - system: - type: object - nullable: true - description: | - Local-DB enrichment: the owning system's id, name and product - type, resolved by (organization_id, labels.system_key). Omitted - when the system_key has no matching row in the local systems - table (e.g., system unregistered or alert pushed against a - stale key). The frontend uses `id` (local DB UUID) to link to - the system detail page (/systems/:id); `system_key` is the - Mimir tenant label and is shown in the UI for reference. - properties: - id: - type: string - format: uuid - description: Local DB UUID. Use this to build links to /systems/:id. - example: "35aa0d84-08c1-4013-b1fd-d5f6ef3e0541" - system_key: - type: string - description: System key as used in alert labels (Mimir tenant identifier on this system). - example: "NETH-FBB2-1A6E-7CAD-44A4-A772-B3EE-F0F6-F371" - name: - type: string - example: "cust1-sys-A" - type: - type: string - nullable: true - description: Product type (e.g. "nsec"). Null until the agent reports it on first inventory. 
AlertActivityEntry: type: object @@ -8919,8 +8901,8 @@ paths: description: | Returns current alerts from Mimir for a specific system, filtered by the system's key. Suppressed alerts remain visible so silenced alerts can still be inspected in the system detail view. - Each alert is enriched with a `system` object (same shape as `/api/alerts`) — for this - endpoint it always points to the system identified by the path `id`. + System identity is carried as labels on each alert (`system_id`, `system_key`, + `system_name`, `system_type`), stamped at ingest time. Requires `read:systems` permission. security: - BearerAuth: [] @@ -8970,7 +8952,10 @@ paths: labels: alertname: "DiskFilling" severity: "warning" + system_id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + system_name: "test-sys" + system_type: "ns8" annotations: summary: "/var is 92% full" description: "Disk usage exceeded warning threshold." @@ -8981,16 +8966,14 @@ paths: inhibitedBy: [] startsAt: "2026-05-12T08:14:00Z" endsAt: "2026-05-12T08:44:00Z" - system: - id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" - system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - name: "test-sys" - type: "ns8" - fingerprint: "11a9302b0fa6526e" labels: alertname: "HighCPU" severity: "critical" + system_id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + system_name: "test-sys" + system_type: "ns8" annotations: summary: "CPU usage 98%" description: "Sustained high CPU." 
@@ -9000,11 +8983,6 @@ paths: inhibitedBy: [] startsAt: "2026-05-12T08:20:00Z" endsAt: "2026-05-12T08:50:00Z" - system: - id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" - system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - name: "test-sys" - type: "ns8" NoActiveAlerts: summary: System has no firing alerts value: @@ -12506,13 +12484,13 @@ paths: items: type: string examples: - ActiveAlertWithEnrichment: + ActiveAlertExample: summary: One active warning across the caller's hierarchy description: | - A single warning alert returned with local-DB system enrichment. - `state="active"` means Mimir has not been told to silence it; an - actively-muted alert would have `state="suppressed"` and a - non-empty `silencedBy`. + A single warning alert. System identity (id, key, name, type) is + carried as labels, stamped at ingest time. `state="active"` means + Mimir has not been told to silence it; an actively-muted alert + would have `state="suppressed"` and a non-empty `silencedBy`. value: code: 200 message: alerts retrieved successfully @@ -12522,7 +12500,10 @@ paths: labels: alertname: "DiskFilling" severity: "warning" + system_id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + system_name: "test-sys" + system_type: "ns8" annotations: summary: "/var is 92% full on test-sys" description: "Disk usage exceeded the warning threshold." 
@@ -12532,11 +12513,6 @@ paths: inhibitedBy: [] startsAt: "2026-05-12T08:14:00Z" endsAt: "2026-05-12T08:44:00Z" - system: - id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" - system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - name: "test-sys" - type: "ns8" pagination: page: 1 page_size: 50 diff --git a/collect/alerting/mimir.go b/collect/alerting/mimir.go index d2d2e37b..172f5f8a 100644 --- a/collect/alerting/mimir.go +++ b/collect/alerting/mimir.go @@ -37,6 +37,7 @@ type SystemAlertMetadata struct { OrganizationID string SystemKey string SystemName string + SystemType string SystemFQDN string SystemIPv4 string OrganizationName string @@ -56,6 +57,7 @@ type SystemAlertContext struct { func LookupSystemAlertContext(ctx context.Context, db *sql.DB, systemID string) (*SystemAlertContext, error) { var ( metadata SystemAlertMetadata + systemType sql.NullString systemFQDN sql.NullString systemIPv4 sql.NullString organizationName sql.NullString @@ -68,6 +70,7 @@ func LookupSystemAlertContext(ctx context.Context, db *sql.DB, systemID string) s.organization_id, s.system_key, s.name, + s.type, s.fqdn, s.ipv4_address::text, COALESCE(d.name, r.name, c.name), @@ -88,6 +91,7 @@ func LookupSystemAlertContext(ctx context.Context, db *sql.DB, systemID string) &metadata.OrganizationID, &metadata.SystemKey, &metadata.SystemName, + &systemType, &systemFQDN, &systemIPv4, &organizationName, @@ -98,6 +102,7 @@ func LookupSystemAlertContext(ctx context.Context, db *sql.DB, systemID string) return nil, err } + metadata.SystemType = nullStringValue(systemType) metadata.SystemFQDN = nullStringValue(systemFQDN) metadata.SystemIPv4 = nullStringValue(systemIPv4) metadata.OrganizationName = nullStringValue(organizationName) @@ -116,6 +121,7 @@ func BuildSystemAlertContext(metadata SystemAlertMetadata) *SystemAlertContext { "system_id": metadata.SystemID, "system_key": metadata.SystemKey, "system_name": metadata.SystemName, + "system_type": metadata.SystemType, "system_fqdn": 
metadata.SystemFQDN, "system_ipv4": metadata.SystemIPv4, "organization_name": metadata.OrganizationName, diff --git a/collect/alerting/mimir_test.go b/collect/alerting/mimir_test.go index c3968b82..9c62de9f 100644 --- a/collect/alerting/mimir_test.go +++ b/collect/alerting/mimir_test.go @@ -21,6 +21,7 @@ func TestBuildSystemAlertContext(t *testing.T) { OrganizationID: "org-1", SystemKey: "SYS-001", SystemName: "web-01", + SystemType: "ns8", SystemFQDN: "web-01.example.com", SystemIPv4: "192.0.2.10", OrganizationName: "Acme Corp", @@ -36,6 +37,7 @@ func TestBuildSystemAlertContext(t *testing.T) { "system_id": "system-1", "system_key": "SYS-001", "system_name": "web-01", + "system_type": "ns8", "system_fqdn": "web-01.example.com", "system_ipv4": "192.0.2.10", "organization_name": "Acme Corp", @@ -55,8 +57,10 @@ func TestBuildSystemAlertContext_EmptyFields(t *testing.T) { require.NotNil(t, ctx) // All identity labels must be present so InjectLabels can strip // client-supplied spoofed values for keys the server has no value for. 
+ assert.Contains(t, ctx.Labels, "system_type") assert.Contains(t, ctx.Labels, "system_fqdn") assert.Contains(t, ctx.Labels, "organization_name") + assert.Equal(t, "", ctx.Labels["system_type"]) assert.Equal(t, "", ctx.Labels["system_fqdn"]) assert.Equal(t, "", ctx.Labels["organization_name"]) } diff --git a/collect/cron/linkfailed_monitor.go b/collect/cron/linkfailed_monitor.go index d4adc365..9cafa010 100644 --- a/collect/cron/linkfailed_monitor.go +++ b/collect/cron/linkfailed_monitor.go @@ -94,6 +94,7 @@ func (m *LinkFailedMonitor) loadInactiveSystems(ctx context.Context) (map[string s.organization_id, s.system_key, s.name, + s.type, s.fqdn, s.ipv4_address::text, COALESCE(d.name, r.name, c.name), @@ -124,6 +125,7 @@ func (m *LinkFailedMonitor) loadInactiveSystems(ctx context.Context) (map[string for rows.Next() { var ( metadata collectalerting.SystemAlertMetadata + systemType sql.NullString systemFQDN sql.NullString systemIPv4 sql.NullString organizationName sql.NullString @@ -137,6 +139,7 @@ func (m *LinkFailedMonitor) loadInactiveSystems(ctx context.Context) (map[string &metadata.OrganizationID, &metadata.SystemKey, &metadata.SystemName, + &systemType, &systemFQDN, &systemIPv4, &organizationName, @@ -147,6 +150,7 @@ func (m *LinkFailedMonitor) loadInactiveSystems(ctx context.Context) (map[string return nil, fmt.Errorf("scan inactive system: %w", err) } + metadata.SystemType = nullStringValue(systemType) metadata.SystemFQDN = nullStringValue(systemFQDN) metadata.SystemIPv4 = nullStringValue(systemIPv4) metadata.OrganizationName = nullStringValue(organizationName) From 08a07dd7d062266a0a302372bfac784bcb3e6685 Mon Sep 17 00:00:00 2001 From: Edoardo Spadoni Date: Wed, 13 May 2026 15:35:39 +0200 Subject: [PATCH 05/10] feat(alerts): cross-system silences + filter/sort parity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add /alerts/silences* mirroring /systems/:id/alerts/silences* so the cross- system list page can mute 
without routing through a system path. Silences are interoperable between the two routes (same buildSystemAlertSilenceRequest underneath) and share the alert_activity timeline. Bring /systems/:id/alerts to parity with /alerts: severity/alertname/status multi-value filters, pagination wrapper, sort_by allowlist with default starts_at desc and page_size 50. Add multi-value system_key filter to /alerts/history; override default sort_direction to desc on both history endpoints so the pagination object reflects the actual repo ordering. Add status to /alerts allowed sort columns; rename internal alertFilter.states to .statuses and stateOf to statusOf for naming coherence with the public status query param. Rename /alerts/:fingerprint/activity to /alerts/activity/:fingerprint to resolve OpenAPI path ambiguity with /alerts/silences/:silence_id. Drop the legacy ?state= alias and the deprecated OpenAPI parameters — this is pre-release, no consumers exist. --- AGENTS.md | 6 +- .../entities/local_alertmanager_history.go | 22 +- .../local_alertmanager_history_test.go | 6 +- backend/main.go | 17 +- backend/methods/alerting.go | 542 ++++++++++++++++- backend/methods/alerting_unit_test.go | 158 ++++- backend/openapi.yaml | 556 +++++++++++++++++- 7 files changed, 1246 insertions(+), 61 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index c46d6dd2..06244b19 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -150,7 +150,7 @@ Single-node Grafana Mimir with S3-compatible backend and multi-tenant Alertmanag - Backend (`backend/services/alerting/`) holds one `AlertingConfigLayer` per organization in `alert_config_layers` (flat recipient-based shape: `enabled`, `email_recipients[]`, `webhook_recipients[]`, `telegram_recipients[]`, each recipient carries its own `severities[]`; email also `language` + `format`). The effective per-tenant Mimir YAML is the server-side merge of every layer from Owner down to the tenant (union dedup, additive-only). 
`/alerts/config` only ever returns the caller's own layer — the merged view is internal and never leaves the backend. Templates are Go `html/template`-embedded, en/it locales, firing + resolved variants; both languages ship with every tenant push and the renderer picks per email recipient via per-language dispatchers (`alert_.html|txt|subject`). - Collect proxies systems to Alertmanager `alerts`/`silences` with `X-Scope-OrgID` from the authenticated system's org. - Alertmanager webhooks resolved alerts back to collect `/api/alert_history`, which persists them scoped by `organization_id` (column on `alert_history`, populated from the DB via `system_key` lookup — never trusted from the payload). -- RBAC: `/alerts/config*` is gated on a dedicated `alerts` resource (`read:alerts` for GET, `manage:alerts` for POST/DELETE) — admin/super only. The list/silence endpoints (`/alerts`, `/alerts/history`, `/systems/:id/alerts*`) stay on `read:systems`/`manage:systems`. +- RBAC: `/alerts/config*` is gated on a dedicated `alerts` resource (`read:alerts` for GET, `manage:alerts` for POST/DELETE) — admin/super only. The list/silence endpoints (`/alerts`, `/alerts/history`, `/alerts/silences*`, `/alerts/activity/:fingerprint`, `/systems/:id/alerts*`) stay on `read:systems`/`manage:systems`. The cross-system `/alerts/silences*` set mirrors `/systems/:id/alerts/silences*` 1:1 — same backend `buildSystemAlertSilenceRequest` builds the Mimir payload, so a silence created via either route is interoperable with the other. ### 3.6 Proxy (`proxy/`) @@ -371,7 +371,9 @@ Authoritative: `backend/openapi.yaml` (also `make docs` / redocly). 
High-level r /api/users/* CRUD + avatar + import/export + password reset + suspend/reactivate /api/systems/* CRUD + inventory + alerts + regenerate-secret + reachability + export /api/applications/* CRUD + assign/unassign org + totals/summary/trend -/api/alerts, /api/alerts/{totals,trend,config} active alerts + config + aggregates +/api/alerts, /api/alerts/{totals,trend,stats,history,config} active alerts + config + aggregates + history +/api/alerts/silences/* cross-system silences (mute/unmute) — parallel to /systems/:id/alerts/silences +/api/alerts/activity/:fingerprint per-alert audit timeline (silence created/updated/removed) /api/filters/{systems,applications,users} UI filter aggregation /api/rebranding/* per-org per-product asset management /api/organizations, /api/roles, /api/organization-roles metadata diff --git a/backend/entities/local_alertmanager_history.go b/backend/entities/local_alertmanager_history.go index 9987f89f..e96ab48d 100644 --- a/backend/entities/local_alertmanager_history.go +++ b/backend/entities/local_alertmanager_history.go @@ -29,12 +29,14 @@ func NewLocalAlertHistoryRepository() *LocalAlertHistoryRepository { // AlertHistoryQuery captures all the optional filters and pagination params // for QueryAlertHistory. OrgIDs is the only required field (callers resolve -// scope upstream via resolveOrgScope or equivalent). SystemKey limits to one -// system (used by /api/systems/:id/alerts/history); when empty the query is -// org-level (used by /api/alerts/history). +// scope upstream via resolveOrgScope or equivalent). SystemKeys narrows the +// result to one or more systems: a single-element slice is the per-system +// case (/api/systems/:id/alerts/history); a multi-value slice is the +// cross-system filter exposed on /api/alerts/history; an empty slice means +// "all systems in scope". 
type AlertHistoryQuery struct { OrgIDs []string - SystemKey string + SystemKeys []string Alertnames []string Severities []string Statuses []string @@ -88,10 +90,14 @@ func (r *LocalAlertHistoryRepository) QueryAlertHistory(q AlertHistoryQuery) ([] } conds = append(conds, fmt.Sprintf("organization_id IN (%s)", strings.Join(ph, ","))) } - if q.SystemKey != "" { - conds = append(conds, fmt.Sprintf("system_key = $%d", idx)) - args = append(args, q.SystemKey) - idx++ + if len(q.SystemKeys) > 0 { + ph := make([]string, len(q.SystemKeys)) + for i, v := range q.SystemKeys { + ph[i] = fmt.Sprintf("$%d", idx) + args = append(args, v) + idx++ + } + conds = append(conds, fmt.Sprintf("system_key IN (%s)", strings.Join(ph, ","))) } if len(q.Alertnames) > 0 { ph := make([]string, len(q.Alertnames)) diff --git a/backend/entities/local_alertmanager_history_test.go b/backend/entities/local_alertmanager_history_test.go index 94f41861..93a4f624 100644 --- a/backend/entities/local_alertmanager_history_test.go +++ b/backend/entities/local_alertmanager_history_test.go @@ -36,7 +36,7 @@ func TestQueryAlertHistory_PerSystem(t *testing.T) { now := time.Now().UTC() - mock.ExpectQuery(`SELECT COUNT\(\*\) FROM alert_history WHERE organization_id IN \(\$1\) AND system_key = \$2`). + mock.ExpectQuery(`SELECT COUNT\(\*\) FROM alert_history WHERE organization_id IN \(\$1\) AND system_key IN \(\$2\)`). WithArgs("org-1", "SYS-001"). WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(2)) @@ -54,7 +54,7 @@ func TestQueryAlertHistory_PerSystem(t *testing.T) { WillReturnRows(rows) records, totalCount, err := repo.QueryAlertHistory(AlertHistoryQuery{ - OrgIDs: []string{"org-1"}, SystemKey: "SYS-001", + OrgIDs: []string{"org-1"}, SystemKeys: []string{"SYS-001"}, Page: 1, PageSize: 20, SortBy: "created_at", SortDirection: "desc", }) @@ -133,7 +133,7 @@ func TestQueryAlertHistory_SortValidation(t *testing.T) { // Invalid sort_by/sort_direction must fall back to created_at desc. 
_, _, err := repo.QueryAlertHistory(AlertHistoryQuery{ - OrgIDs: []string{"org-1"}, SystemKey: "SYS-001", + OrgIDs: []string{"org-1"}, SystemKeys: []string{"SYS-001"}, Page: 1, PageSize: 10, SortBy: "invalid_column", SortDirection: "INVALID", }) assert.NoError(t, err) diff --git a/backend/main.go b/backend/main.go index 58a75f28..7fe55262 100644 --- a/backend/main.go +++ b/backend/main.go @@ -286,8 +286,21 @@ func main() { alertsGroup.GET("/trend", methods.GetAlertsTrend) // Alert history trend with daily data points alertsGroup.GET("/stats", methods.GetAlertsStats) // Aggregate stats: severity buckets, top-N alertname/system_key, MTTR/MTBF - // Per-alert audit timeline (silence created/updated/removed events for the alert detail drawer) - alertsGroup.GET("/:fingerprint/activity", methods.GetAlertActivity) + // Per-alert audit timeline (silence created/updated/removed events for the alert detail drawer). + // The "activity" literal segment comes BEFORE the param so this path + // doesn't collide with /alerts/silences/{silence_id} (3-segment param-second pattern). + alertsGroup.GET("/activity/:fingerprint", methods.GetAlertActivity) + + // Silences (cross-system mute). Mirrors /systems/:id/alerts/silences* + // but takes ?organization_id= for the per-id ops and resolves the + // system_key from the alert labels (POST) or the silence matchers + // (PUT/DELETE). RBAC stays on `systems`: read:systems for GET, + // manage:systems for POST/PUT/DELETE. + alertsGroup.GET("/silences", methods.GetAlertSilences) // List active+pending silences across the caller's hierarchy + alertsGroup.POST("/silences", methods.CreateAlertSilence) // Mute an alert (body: { fingerprint, end_at, comment, duration_minutes? 
}) + alertsGroup.GET("/silences/:silence_id", methods.GetAlertSilence) // Get a single silence (requires ?organization_id=) + alertsGroup.PUT("/silences/:silence_id", methods.UpdateAlertSilence) // Update a silence's end time / comment (requires ?organization_id=) + alertsGroup.DELETE("/silences/:silence_id", methods.DeleteAlertSilence) // Unmute (requires ?organization_id=) // Configuration management (per-org layered model) — gated on the // dedicated `alerts` resource. GET → read:alerts, POST/DELETE → manage:alerts. diff --git a/backend/methods/alerting.go b/backend/methods/alerting.go index 9b0d44cd..7cebd24a 100644 --- a/backend/methods/alerting.go +++ b/backend/methods/alerting.go @@ -444,10 +444,16 @@ const alertsListDefaultPageSize = 50 // active /api/alerts list. Default is starts_at desc (most recent first). // severity is sorted by criticality rank (critical > warning > info > other), // not lexicographically. Anything outside this set falls back to starts_at. +// +// The intersection with /api/alerts/history (starts_at, severity, alertname, +// status) is intentional: the UI can offer a single "Sort by" dropdown that +// works on both tabs without sending column names that one side would +// silently fall back to its default. 
var alertsListAllowedSortBy = map[string]bool{ "starts_at": true, "severity": true, "alertname": true, + "status": true, } // severityRank maps severity labels to a comparable integer (higher = more @@ -500,7 +506,7 @@ func GetAlerts(c *gin.Context) { all, warnings := fanOutMimirAlerts(c.Request.Context(), orgIDs) all = filterAlerts(all, alertFilter{ - states: c.QueryArray("state"), + statuses: c.QueryArray("status"), severities: c.QueryArray("severity"), systemKeys: c.QueryArray("system_key"), alertnames: c.QueryArray("alertname"), @@ -554,6 +560,11 @@ func sortAlertsList(alerts []map[string]interface{}, sortBy, sortDirection strin aj := alertnameOf(alerts[j]) primaryEqual = ai == aj primaryLess = ai < aj + case "status": + ai := statusOf(alerts[i]) + aj := statusOf(alerts[j]) + primaryEqual = ai == aj + primaryLess = ai < aj default: // starts_at si, _ := alerts[i]["startsAt"].(string) sj, _ := alerts[j]["startsAt"].(string) @@ -585,17 +596,29 @@ func alertnameOf(alert map[string]interface{}) string { return s } +// statusOf returns the Alertmanager status.state of an active alert +// ("active", "suppressed", "unprocessed"). Named after the public query +// param (`?status=`) and the sort column (`sort_by=status`) rather than +// the underlying JSON field; the alert payload still nests it under +// `.status.state` because that's the upstream Alertmanager shape. +func statusOf(alert map[string]interface{}) string { + status, _ := alert["status"].(map[string]interface{}) + s, _ := status["state"].(string) + return s +} + // alertFingerprintPattern restricts the fingerprint path param to safe chars. // Alertmanager fingerprints are 16-char lowercase hex but we allow a slightly // looser charset to accommodate test fixtures and any future format change. 
var alertFingerprintPattern = regexp.MustCompile(`^[A-Za-z0-9._:-]{1,128}$`) -// GetAlertActivity handles GET /api/alerts/:fingerprint/activity +// GetAlertActivity handles GET /api/alerts/activity/:fingerprint // Returns the per-alert audit timeline (silence created/updated/removed) for // the alert identified by fingerprint within the resolved tenant. Most recent // first. Operator notes are stored as the comment of the silence the action // produced, so the timeline is the source of truth for "what happened, when, -// by whom". +// by whom". Literal `activity` precedes `:fingerprint` so the route does not +// collide with /alerts/silences/{silence_id}. func GetAlertActivity(c *gin.Context) { user, ok := helpers.GetUserFromContext(c) if !ok { @@ -915,7 +938,7 @@ func GetAlertsTrend(c *gin.Context) { // it's a binding detail of one handler. Multiple values within a single field // are matched as OR; different fields AND together. type alertFilter struct { - states []string + statuses []string severities []string systemKeys []string alertnames []string @@ -926,19 +949,19 @@ type alertFilter struct { // or does not match any of the requested values; this prevents silent leakage // of unrelated alerts when the caller narrows the query. 
func filterAlerts(alerts []map[string]interface{}, f alertFilter) []map[string]interface{} { - if len(f.states) == 0 && len(f.severities) == 0 && len(f.systemKeys) == 0 && len(f.alertnames) == 0 { + if len(f.statuses) == 0 && len(f.severities) == 0 && len(f.systemKeys) == 0 && len(f.alertnames) == 0 { return alerts } filtered := make([]map[string]interface{}, 0, len(alerts)) for _, alert := range alerts { - if len(f.states) > 0 { + if len(f.statuses) > 0 { status, ok := alert["status"].(map[string]interface{}) if !ok { continue } state, ok := status["state"].(string) - if !ok || !slices.Contains(f.states, state) { + if !ok || !slices.Contains(f.statuses, state) { continue } } @@ -1089,7 +1112,17 @@ func silenceBelongsToSystem(silence *models.AlertmanagerSilence, systemKey strin } // GetSystemAlerts handles GET /api/systems/:id/alerts -// Returns active alerts from Mimir for a specific system, filtered by system_key. +// Returns active alerts from Mimir scoped to a single system, with the same +// filters, pagination, and sorting surface as the cross-system /api/alerts +// list. The system_key in the URL acts as a hard scope: the multi-value +// `system_key` query filter that /api/alerts accepts is not exposed here +// since it would either be redundant (same value) or rejected. 
+// +// Accepted query params (all optional): +// - severity, alertname, status (multi-value, OR within, AND across) +// - page, page_size (default 50, cap 100) +// - sort_by (starts_at | severity | alertname | status), default starts_at +// - sort_direction (asc | desc), default desc func GetSystemAlerts(c *gin.Context) { systemID := c.Param("id") if systemID == "" { @@ -1109,6 +1142,19 @@ func GetSystemAlerts(c *gin.Context) { return } + page, pageSize := helpers.GetPaginationFromQuery(c) + if c.Query("page_size") == "" { + pageSize = alertsListDefaultPageSize + } + + sortBy, sortDirection := helpers.GetSortingFromQuery(c) + if !alertsListAllowedSortBy[sortBy] { + sortBy = "starts_at" + } + if c.Query("sort_direction") == "" { + sortDirection = "desc" + } + body, err := alerting.GetAlerts(getSystemAlertOrgID(system)) if err != nil { c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to fetch alerts from mimir: "+err.Error(), nil)) @@ -1118,24 +1164,49 @@ func GetSystemAlerts(c *gin.Context) { var alerts []map[string]interface{} if err := json.Unmarshal(body, &alerts); err != nil { c.JSON(http.StatusOK, response.OK("alerts retrieved successfully", gin.H{ - "alerts": []interface{}{}, + "alerts": []map[string]interface{}{}, + "pagination": helpers.BuildPaginationInfoWithSorting(page, pageSize, 0, sortBy, sortDirection), })) return } - // Filter alerts by this system's key. System identity is already carried - // as labels on the alert (system_id, system_key, system_name, system_type) - // stamped by collect at ingest time. - filtered := make([]map[string]interface{}, 0, len(alerts)) + // Hard scope by system_key. System identity is carried as labels on the + // alert (system_id, system_key, system_name, system_type) stamped at + // ingest time by collect, so no DB join is needed. 
+ scoped := make([]map[string]interface{}, 0, len(alerts)) for _, alert := range alerts { labels, _ := alert["labels"].(map[string]interface{}) if sk, ok := labels["system_key"].(string); ok && sk == system.SystemKey { - filtered = append(filtered, alert) + scoped = append(scoped, alert) } } + filtered := filterAlerts(scoped, alertFilter{ + statuses: c.QueryArray("status"), + severities: c.QueryArray("severity"), + alertnames: c.QueryArray("alertname"), + // systemKeys intentionally omitted: the URL path is the source of truth. + }) + + sortAlertsList(filtered, sortBy, sortDirection) + + totalCount := len(filtered) + start := (page - 1) * pageSize + end := start + pageSize + if start > totalCount { + start = totalCount + } + if end > totalCount { + end = totalCount + } + pageAlerts := filtered[start:end] + if pageAlerts == nil { + pageAlerts = []map[string]interface{}{} + } + c.JSON(http.StatusOK, response.OK("alerts retrieved successfully", gin.H{ - "alerts": filtered, + "alerts": pageAlerts, + "pagination": helpers.BuildPaginationInfoWithSorting(page, pageSize, totalCount, sortBy, sortDirection), })) } @@ -1499,6 +1570,425 @@ func UpdateSystemAlertSilence(c *gin.Context) { })) } +// systemKeyFromSilence returns the `system_key` matcher value of a silence, +// or "" if the silence has no exact (non-regex) system_key matcher. The +// cross-system silence endpoints use it to scope operations to silences that +// were created against a specific system (the only ones our UI ever creates) +// and to ignore generic Alertmanager silences that may exist for the tenant. 
+func systemKeyFromSilence(s *models.AlertmanagerSilence) string { + if s == nil { + return "" + } + for _, m := range s.Matchers { + if m.Name == "system_key" && !m.IsRegex { + return m.Value + } + } + return "" +} + +// resolveAlertSilenceContext looks up an active alert by fingerprint inside +// a single tenant's Mimir and returns the alert plus the `system_key` label +// the cross-system silence handler needs to attach as a matcher. Used by +// POST /api/alerts/silences; not used by the per-system silence handlers +// (those already know the system_key from the URL path). +// +// Returns (alert, systemKey, true) on success. On any failure the response +// is already written and the caller must just return. +func resolveAlertSilenceContext(c *gin.Context, orgID, fingerprint string) (*models.ActiveAlert, string, bool) { + body, err := alerting.GetAlerts(orgID) + if err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to fetch alerts from mimir: "+err.Error(), nil)) + return nil, "", false + } + var alerts []models.ActiveAlert + if err := json.Unmarshal(body, &alerts); err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to parse alerts from mimir: "+err.Error(), nil)) + return nil, "", false + } + for i := range alerts { + if alerts[i].Fingerprint != fingerprint { + continue + } + systemKey := alerts[i].Labels["system_key"] + if systemKey == "" { + c.JSON(http.StatusBadRequest, response.BadRequest("alert is not system-scoped (missing system_key label)", nil)) + return nil, "", false + } + return &alerts[i], systemKey, true + } + c.JSON(http.StatusNotFound, response.NotFound("alert not found", nil)) + return nil, "", false +} + +// CreateAlertSilence handles POST /api/alerts/silences +// Cross-system mute: body is { fingerprint, end_at, comment, duration_minutes? }. 
+// The tenant comes from ?organization_id= (mandatory for non-Owner) and the +// system_key matcher is resolved from the alert's labels in Mimir. The actual +// silence creation reuses the same buildSystemAlertSilenceRequest + +// alerting.CreateSilence path used by the per-system endpoint, so the silence +// object stored in Mimir is byte-identical regardless of which route created it. +func CreateAlertSilence(c *gin.Context) { + user, ok := helpers.GetUserFromContext(c) + if !ok { + return + } + orgID, ok := resolveOrgID(c, user) + if !ok { + return + } + if !requireOrgID(c, orgID) { + return + } + + var req models.CreateSystemAlertSilenceRequest + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, response.BadRequest("invalid request body: "+err.Error(), nil)) + return + } + if !alertFingerprintPattern.MatchString(req.Fingerprint) { + c.JSON(http.StatusBadRequest, response.BadRequest("invalid fingerprint", nil)) + return + } + + alert, systemKey, ok := resolveAlertSilenceContext(c, orgID, req.Fingerprint) + if !ok { + return + } + if len(alert.Status.SilencedBy) > 0 { + c.JSON(http.StatusBadRequest, response.BadRequest("alert is already silenced", nil)) + return + } + + now := time.Now().UTC() + var endsAt time.Time + if req.EndAt != "" { + parsed, err := time.Parse(time.RFC3339, req.EndAt) + if err != nil { + c.JSON(http.StatusBadRequest, response.BadRequest("invalid end_at: must be RFC3339 datetime", nil)) + return + } + if !parsed.After(now) { + c.JSON(http.StatusBadRequest, response.BadRequest("invalid end_at: must be in the future", nil)) + return + } + endsAt = parsed.UTC() + } + + silenceReq := buildSystemAlertSilenceRequest( + alert, + systemKey, + getAlertSilenceCreatedBy(user), + req.Comment, + req.DurationMinutes, + now, + endsAt, + ) + + silenceResp, err := alerting.CreateSilence(orgID, silenceReq) + if err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to create silence in mimir: 
"+err.Error(), nil)) + return + } + + logAlertActivity(c, orgID, req.Fingerprint, entities.AlertActivitySilenced, user, silenceResp.SilenceID, map[string]interface{}{ + "comment": normalizeAlertSilenceComment(req.Comment), + "duration_minutes": req.DurationMinutes, + "end_at": req.EndAt, + }) + + c.JSON(http.StatusOK, response.OK("alert silenced successfully", gin.H{ + "silence_id": silenceResp.SilenceID, + })) +} + +// alertSilenceWithOrg is the per-row payload returned by GET /api/alerts/silences. +// We extend AlertmanagerSilence with the originating organization_id (Mimir +// stores silences per-tenant, so this isn't on the silence object itself) and +// with the system_key extracted from the matchers, so the FE can render the +// "muted on system X" pill without re-parsing matchers. +type alertSilenceWithOrg struct { + models.AlertmanagerSilence + OrganizationID string `json:"organization_id"` + SystemKey string `json:"system_key"` +} + +// GetAlertSilences handles GET /api/alerts/silences +// Cross-hierarchy list of active and pending silences. Scope follows the same +// three modes as /api/alerts/totals (no organization_id / single tenant / +// descendants). Only system-scoped silences (those with a `system_key` matcher) +// are returned; generic Alertmanager silences are filtered out because they +// don't belong to our domain model. Expired silences are also excluded. +// +// Per-tenant fan-out failures are non-fatal and surface in `warnings`. +// +// Optional filters: `system_key` (multi-value, exact match on matcher value). 
+func GetAlertSilences(c *gin.Context) { + user, ok := helpers.GetUserFromContext(c) + if !ok { + return + } + orgIDs, ok := resolveOrgScope(c, user) + if !ok { + return + } + + systemKeyFilter := c.QueryArray("system_key") + + var ( + out []alertSilenceWithOrg + warnings []string + mu sync.Mutex + wg sync.WaitGroup + ) + ctx, cancel := context.WithTimeout(c.Request.Context(), alertsTotalsFanoutTimeout) + defer cancel() + sem := make(chan struct{}, alertsTotalsFanoutConcurrency) + + for _, orgID := range orgIDs { + wg.Add(1) + go func(orgID string) { + defer wg.Done() + select { + case sem <- struct{}{}: + defer func() { <-sem }() + case <-ctx.Done(): + mu.Lock() + warnings = append(warnings, fmt.Sprintf("org %s: timed out waiting for slot", orgID)) + mu.Unlock() + return + } + + silences, err := alerting.GetSilences(orgID) + if err != nil { + logger.Warn().Err(err).Str("org_id", orgID).Msg("failed to fetch silences from mimir for cross-system list") + mu.Lock() + warnings = append(warnings, fmt.Sprintf("org %s: %s", orgID, err.Error())) + mu.Unlock() + return + } + + local := make([]alertSilenceWithOrg, 0, len(silences)) + for i := range silences { + if silences[i].Status != nil && silences[i].Status.State == "expired" { + continue + } + sk := systemKeyFromSilence(&silences[i]) + if sk == "" { + continue + } + if len(systemKeyFilter) > 0 && !slices.Contains(systemKeyFilter, sk) { + continue + } + local = append(local, alertSilenceWithOrg{ + AlertmanagerSilence: silences[i], + OrganizationID: orgID, + SystemKey: sk, + }) + } + + if len(local) == 0 { + return + } + mu.Lock() + out = append(out, local...) 
+ mu.Unlock() + }(orgID) + } + wg.Wait() + + if out == nil { + out = []alertSilenceWithOrg{} + } + if warnings == nil { + warnings = []string{} + } + + c.JSON(http.StatusOK, response.OK("silences retrieved successfully", gin.H{ + "silences": out, + "warnings": warnings, + })) +} + +// GetAlertSilence handles GET /api/alerts/silences/:silence_id +// Single-silence read across the caller's scope. The tenant is resolved from +// ?organization_id= (mandatory for non-Owner). Refuses to return silences +// that aren't system-scoped (no `system_key` matcher) — those don't belong to +// our domain and a generic 404 keeps the surface tight. +func GetAlertSilence(c *gin.Context) { + user, ok := helpers.GetUserFromContext(c) + if !ok { + return + } + orgID, ok := resolveOrgID(c, user) + if !ok { + return + } + if !requireOrgID(c, orgID) { + return + } + silenceID := c.Param("silence_id") + if silenceID == "" { + c.JSON(http.StatusBadRequest, response.BadRequest("silence id required", nil)) + return + } + + silence, err := alerting.GetSilence(orgID, silenceID) + if errors.Is(err, alerting.ErrSilenceNotFound) { + c.JSON(http.StatusNotFound, response.NotFound("silence not found", nil)) + return + } + if err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to fetch silence from mimir: "+err.Error(), nil)) + return + } + systemKey := systemKeyFromSilence(silence) + if systemKey == "" { + c.JSON(http.StatusNotFound, response.NotFound("silence not found", nil)) + return + } + + c.JSON(http.StatusOK, response.OK("silence retrieved successfully", gin.H{ + "silence": alertSilenceWithOrg{ + AlertmanagerSilence: *silence, + OrganizationID: orgID, + SystemKey: systemKey, + }, + })) +} + +// UpdateAlertSilence handles PUT /api/alerts/silences/:silence_id +// Cross-system silence edit (change end time / comment). Mirrors +// UpdateSystemAlertSilence but discovers system_key from the silence matchers +// instead of taking it from the URL. 
+func UpdateAlertSilence(c *gin.Context) { + user, ok := helpers.GetUserFromContext(c) + if !ok { + return + } + orgID, ok := resolveOrgID(c, user) + if !ok { + return + } + if !requireOrgID(c, orgID) { + return + } + silenceID := c.Param("silence_id") + if silenceID == "" { + c.JSON(http.StatusBadRequest, response.BadRequest("silence id required", nil)) + return + } + + var req models.UpdateSystemAlertSilenceRequest + if err := c.ShouldBindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, response.BadRequest("invalid request body: "+err.Error(), nil)) + return + } + + now := time.Now().UTC() + endsAt, err := time.Parse(time.RFC3339, req.EndAt) + if err != nil { + c.JSON(http.StatusBadRequest, response.BadRequest("invalid end_at: must be RFC3339 datetime", nil)) + return + } + if !endsAt.After(now) { + c.JSON(http.StatusBadRequest, response.BadRequest("invalid end_at: must be in the future", nil)) + return + } + + existing, err := alerting.GetSilence(orgID, silenceID) + if errors.Is(err, alerting.ErrSilenceNotFound) { + c.JSON(http.StatusNotFound, response.NotFound("silence not found", nil)) + return + } + if err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to fetch silence from mimir: "+err.Error(), nil)) + return + } + if systemKeyFromSilence(existing) == "" { + c.JSON(http.StatusNotFound, response.NotFound("silence not found", nil)) + return + } + + updateReq := &models.AlertmanagerSilenceRequest{ + ID: existing.ID, + Matchers: existing.Matchers, + StartsAt: existing.StartsAt, + EndsAt: endsAt.UTC().Format(time.RFC3339), + Comment: normalizeAlertSilenceComment(req.Comment), + CreatedBy: existing.CreatedBy, + } + + silenceResp, err := alerting.CreateSilence(orgID, updateReq) + if err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to update silence in mimir: "+err.Error(), nil)) + return + } + + activityRepo := entities.NewLocalAlertActivityRepository() + fingerprint, _ := 
activityRepo.FindFingerprintBySilenceID(orgID, silenceID) + logAlertActivity(c, orgID, fingerprint, entities.AlertActivitySilenceUpdated, user, silenceResp.SilenceID, map[string]interface{}{ + "comment": normalizeAlertSilenceComment(req.Comment), + "end_at": req.EndAt, + }) + + c.JSON(http.StatusOK, response.OK("silence updated successfully", gin.H{ + "silence_id": silenceResp.SilenceID, + })) +} + +// DeleteAlertSilence handles DELETE /api/alerts/silences/:silence_id +// Cross-system unmute. Same ownership rule as UpdateAlertSilence: only +// silences carrying a `system_key` matcher are addressable through this +// endpoint, so generic Alertmanager silences cannot be removed via the +// public API. +func DeleteAlertSilence(c *gin.Context) { + user, ok := helpers.GetUserFromContext(c) + if !ok { + return + } + orgID, ok := resolveOrgID(c, user) + if !ok { + return + } + if !requireOrgID(c, orgID) { + return + } + silenceID := c.Param("silence_id") + if silenceID == "" { + c.JSON(http.StatusBadRequest, response.BadRequest("silence id required", nil)) + return + } + + silence, err := alerting.GetSilence(orgID, silenceID) + if errors.Is(err, alerting.ErrSilenceNotFound) { + c.JSON(http.StatusNotFound, response.NotFound("silence not found", nil)) + return + } + if err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to fetch silence from mimir: "+err.Error(), nil)) + return + } + if systemKeyFromSilence(silence) == "" { + c.JSON(http.StatusNotFound, response.NotFound("silence not found", nil)) + return + } + + if err := alerting.DeleteSilence(orgID, silenceID); errors.Is(err, alerting.ErrSilenceNotFound) { + c.JSON(http.StatusNotFound, response.NotFound("silence not found", nil)) + return + } else if err != nil { + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to delete silence in mimir: "+err.Error(), nil)) + return + } + + activityRepo := entities.NewLocalAlertActivityRepository() + fingerprint, _ := 
activityRepo.FindFingerprintBySilenceID(orgID, silenceID) + logAlertActivity(c, orgID, fingerprint, entities.AlertActivityUnsilenced, user, silenceID, nil) + + c.JSON(http.StatusOK, response.OK("silence disabled successfully", nil)) +} + // GetSystemAlertHistory handles GET /api/systems/:id/alerts/history // Returns paginated resolved/inactive alert history for a system, with // optional date range (?from_date=, ?to_date=, RFC3339) and multi-value @@ -1524,6 +2014,16 @@ func GetSystemAlertHistory(c *gin.Context) { } page, pageSize, sortBy, sortDirection := helpers.GetPaginationAndSortingFromQuery(c) + // Natural defaults for history are "by ingest time, most recent first". + // The shared helpers default sort_by to "" and sort_direction to asc, + // which (a) hide what the repo actually sorts on from the pagination + // response and (b) would surface the oldest events first. + if c.Query("sort_by") == "" { + sortBy = "created_at" + } + if c.Query("sort_direction") == "" { + sortDirection = "desc" + } from, to, perr := parseDateRange(c) if perr != nil { @@ -1534,7 +2034,7 @@ func GetSystemAlertHistory(c *gin.Context) { repo := entities.NewLocalAlertHistoryRepository() records, totalCount, err := repo.QueryAlertHistory(entities.AlertHistoryQuery{ OrgIDs: []string{system.Organization.LogtoID}, - SystemKey: system.SystemKey, + SystemKeys: []string{system.SystemKey}, Alertnames: c.QueryArray("alertname"), Severities: c.QueryArray("severity"), Statuses: c.QueryArray("status"), @@ -1578,6 +2078,15 @@ func GetAlertsHistory(c *gin.Context) { } page, pageSize, sortBy, sortDirection := helpers.GetPaginationAndSortingFromQuery(c) + // See GetSystemAlertHistory for the rationale: default sort_by to + // created_at so the pagination response reflects what the repo actually + // sorts on, and flip the direction default to desc ("most recent first"). 
+ if c.Query("sort_by") == "" { + sortBy = "created_at" + } + if c.Query("sort_direction") == "" { + sortDirection = "desc" + } from, to, perr := parseDateRange(c) if perr != nil { @@ -1588,6 +2097,7 @@ func GetAlertsHistory(c *gin.Context) { repo := entities.NewLocalAlertHistoryRepository() records, totalCount, err := repo.QueryAlertHistory(entities.AlertHistoryQuery{ OrgIDs: orgIDs, + SystemKeys: c.QueryArray("system_key"), Alertnames: c.QueryArray("alertname"), Severities: c.QueryArray("severity"), Statuses: c.QueryArray("status"), diff --git a/backend/methods/alerting_unit_test.go b/backend/methods/alerting_unit_test.go index 5939e58d..c20a7305 100644 --- a/backend/methods/alerting_unit_test.go +++ b/backend/methods/alerting_unit_test.go @@ -59,13 +59,13 @@ func TestFilterAlerts(t *testing.T) { expected: 3, }, { - name: "filter by state active", - params: alertFilter{states: []string{"active"}}, + name: "filter by status active", + params: alertFilter{statuses: []string{"active"}}, expected: 2, }, { - name: "filter by state suppressed", - params: alertFilter{states: []string{"suppressed"}}, + name: "filter by status suppressed", + params: alertFilter{statuses: []string{"suppressed"}}, expected: 1, }, { @@ -95,17 +95,17 @@ func TestFilterAlerts(t *testing.T) { }, { name: "combined filters: active + critical", - params: alertFilter{states: []string{"active"}, severities: []string{"critical"}}, + params: alertFilter{statuses: []string{"active"}, severities: []string{"critical"}}, expected: 2, }, { name: "combined filters: active + warning", - params: alertFilter{states: []string{"active"}, severities: []string{"warning"}}, + params: alertFilter{statuses: []string{"active"}, severities: []string{"warning"}}, expected: 0, }, { name: "combined filters: active + SYS-001 + critical", - params: alertFilter{states: []string{"active"}, systemKeys: []string{"SYS-001"}, severities: []string{"critical"}}, + params: alertFilter{statuses: []string{"active"}, systemKeys: 
[]string{"SYS-001"}, severities: []string{"critical"}}, expected: 2, }, { @@ -119,18 +119,18 @@ func TestFilterAlerts(t *testing.T) { expected: 2, }, { - name: "multi-value state (active OR suppressed)", - params: alertFilter{states: []string{"active", "suppressed"}}, + name: "multi-value status (active OR suppressed)", + params: alertFilter{statuses: []string{"active", "suppressed"}}, expected: 3, }, { name: "multi-value AND single-value combo", - params: alertFilter{severities: []string{"critical", "warning"}, states: []string{"active"}}, + params: alertFilter{severities: []string{"critical", "warning"}, statuses: []string{"active"}}, expected: 2, }, { - name: "non-existent state", - params: alertFilter{states: []string{"unknown"}}, + name: "non-existent status", + params: alertFilter{statuses: []string{"unknown"}}, expected: 0, }, { @@ -185,13 +185,13 @@ func TestFilterAlerts_MissingLabels(t *testing.T) { assert.Equal(t, 1, len(result)) // Filter by state — both have "active" state, so both match. 
- result = filterAlerts(alerts, alertFilter{states: []string{"active"}}) + result = filterAlerts(alerts, alertFilter{statuses: []string{"active"}}) assert.Equal(t, 2, len(result)) } func TestFilterAlerts_EmptyInput(t *testing.T) { var empty []map[string]interface{} - result := filterAlerts(empty, alertFilter{states: []string{"active"}}) + result := filterAlerts(empty, alertFilter{statuses: []string{"active"}}) assert.Equal(t, 0, len(result)) result = filterAlerts(nil, alertFilter{}) @@ -334,6 +334,136 @@ func TestSilenceBelongsToSystem(t *testing.T) { } } +func TestSystemKeyFromSilence(t *testing.T) { + tests := []struct { + name string + silence *models.AlertmanagerSilence + expected string + }{ + { + name: "nil silence returns empty", + silence: nil, + expected: "", + }, + { + name: "exact system_key matcher returns value", + silence: &models.AlertmanagerSilence{ + Matchers: []models.AlertmanagerMatcher{ + {Name: "alertname", Value: "DiskFull", IsRegex: false}, + {Name: "system_key", Value: "SYS-001", IsRegex: false}, + }, + }, + expected: "SYS-001", + }, + { + name: "regex system_key matcher is ignored", + silence: &models.AlertmanagerSilence{ + Matchers: []models.AlertmanagerMatcher{ + {Name: "system_key", Value: "SYS-.+", IsRegex: true}, + }, + }, + expected: "", + }, + { + name: "no system_key matcher returns empty", + silence: &models.AlertmanagerSilence{ + Matchers: []models.AlertmanagerMatcher{ + {Name: "alertname", Value: "DiskFull", IsRegex: false}, + }, + }, + expected: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, systemKeyFromSilence(tt.silence)) + }) + } +} + +func TestStatusOf(t *testing.T) { + tests := []struct { + name string + alert map[string]interface{} + expected string + }{ + { + name: "active state", + alert: map[string]interface{}{ + "status": map[string]interface{}{"state": "active"}, + }, + expected: "active", + }, + { + name: "suppressed state", + alert: 
map[string]interface{}{ + "status": map[string]interface{}{"state": "suppressed"}, + }, + expected: "suppressed", + }, + { + name: "missing status returns empty", + alert: map[string]interface{}{"labels": map[string]interface{}{"alertname": "X"}}, + expected: "", + }, + { + name: "missing state field returns empty", + alert: map[string]interface{}{ + "status": map[string]interface{}{"silencedBy": []interface{}{}}, + }, + expected: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, statusOf(tt.alert)) + }) + } +} + +func TestSortAlertsListByStatus(t *testing.T) { + alerts := []map[string]interface{}{ + { + "fingerprint": "fp-z", + "status": map[string]interface{}{"state": "suppressed"}, + }, + { + "fingerprint": "fp-a", + "status": map[string]interface{}{"state": "active"}, + }, + { + "fingerprint": "fp-b", + "status": map[string]interface{}{"state": "active"}, + }, + } + + sortAlertsList(alerts, "status", "asc") + + // asc: active before suppressed (alphabetical on state string). + // Tiebreaker is fingerprint asc and is stable regardless of direction, + // so the two "active" rows stay in fp-a, fp-b order. + assert.Equal(t, "fp-a", alerts[0]["fingerprint"]) + assert.Equal(t, "fp-b", alerts[1]["fingerprint"]) + assert.Equal(t, "fp-z", alerts[2]["fingerprint"]) + + sortAlertsList(alerts, "status", "desc") + assert.Equal(t, "fp-z", alerts[0]["fingerprint"]) + // Tiebreaker fingerprint asc stays in place even on desc primary. + assert.Equal(t, "fp-a", alerts[1]["fingerprint"]) + assert.Equal(t, "fp-b", alerts[2]["fingerprint"]) +} + +func TestAlertsListAllowedSortByIncludesStatus(t *testing.T) { + // The active-alerts list and the history list must share at least + // severity/alertname/starts_at/status so the UI can offer a single + // "Sort by" dropdown that works on both tabs. 
+ for _, col := range []string{"starts_at", "severity", "alertname", "status"} { + assert.Truef(t, alertsListAllowedSortBy[col], "expected %q in alertsListAllowedSortBy", col) + } +} + func TestValidateWebhookURL(t *testing.T) { tests := []struct { name string diff --git a/backend/openapi.yaml b/backend/openapi.yaml index 90f58de3..8e3c7c5f 100644 --- a/backend/openapi.yaml +++ b/backend/openapi.yaml @@ -8899,10 +8899,18 @@ paths: - Backend - Alerts summary: /systems/{id}/alerts - Get active alerts for a system description: | - Returns current alerts from Mimir for a specific system, filtered by the system's key. - Suppressed alerts remain visible so silenced alerts can still be inspected in the system detail view. - System identity is carried as labels on each alert (`system_id`, `system_key`, - `system_name`, `system_type`), stamped at ingest time. + Returns current alerts from Mimir scoped to a single system. Mirrors + the filter, pagination, and sort surface of `GET /alerts`: the only + difference is that `system_key` is pinned to the URL path (the + multi-value `system_key` query filter is therefore not exposed). + Suppressed alerts remain visible so silenced alerts can still be + inspected in the system detail view. + + System identity is carried as labels on each alert (`system_id`, + `system_key`, `system_name`, `system_type`), stamped at ingest time. + + Multi-value filters: OR within the same filter, AND across filters. + Requires `read:systems` permission. security: - BearerAuth: [] @@ -8914,6 +8922,63 @@ paths: schema: type: string example: "sys_123456789" + - name: page + in: query + description: 1-based page number. + schema: + type: integer + minimum: 1 + default: 1 + - name: page_size + in: query + description: Page size. Default 50, max 100. + schema: + type: integer + minimum: 1 + maximum: 100 + default: 50 + - name: sort_by + in: query + description: Sort column (allowlist matches `/alerts`). 
+ schema: + type: string + enum: [starts_at, severity, alertname, status] + default: starts_at + - name: sort_direction + in: query + schema: + type: string + enum: [asc, desc] + default: desc + - name: status + in: query + description: Filter by Alertmanager state. Supports multiple values. + schema: + type: array + items: + type: string + enum: [active, suppressed, unprocessed] + style: form + explode: true + - name: severity + in: query + description: Filter by severity. Supports multiple values. + schema: + type: array + items: + type: string + enum: [critical, warning, info] + style: form + explode: true + - name: alertname + in: query + description: Filter by alertname. Supports multiple values. + schema: + type: array + items: + type: string + style: form + explode: true responses: '200': description: Active alerts retrieved successfully @@ -8935,6 +9000,8 @@ paths: type: array items: $ref: '#/components/schemas/ActiveAlert' + pagination: + $ref: '#/components/schemas/Pagination' examples: ActiveAndSuppressedOnSystem: summary: Two firing alerts on the same system, one silenced @@ -8983,6 +9050,15 @@ paths: inhibitedBy: [] startsAt: "2026-05-12T08:20:00Z" endsAt: "2026-05-12T08:50:00Z" + pagination: + page: 1 + page_size: 50 + total_count: 2 + total_pages: 1 + has_next: false + has_prev: false + sort_by: "starts_at" + sort_direction: "desc" NoActiveAlerts: summary: System has no firing alerts value: @@ -8990,6 +9066,15 @@ paths: message: "alerts retrieved successfully" data: alerts: [] + pagination: + page: 1 + page_size: 50 + total_count: 0 + total_pages: 0 + has_next: false + has_prev: false + sort_by: "starts_at" + sort_direction: "desc" '401': $ref: '#/components/responses/Unauthorized' '403': @@ -9181,7 +9266,7 @@ paths: `silence_id` is the Alertmanager-assigned UUID; use it to look up, update, or delete the silence later. The corresponding `silenced` event is appended to the alert's - activity timeline (`GET /alerts/{fingerprint}/activity`). 
+ activity timeline (`GET /alerts/activity/{fingerprint}`). value: code: 200 message: "alert silenced successfully" @@ -9457,7 +9542,17 @@ paths: type: string enum: [id, alertname, severity, status, starts_at, ends_at, created_at] default: created_at - - $ref: '#/components/parameters/SortDirectionParam' + - name: sort_direction + in: query + required: false + description: | + Sort direction. Unlike the shared default of `asc`, this endpoint + defaults to `desc` so the natural "most recent first" ordering is + applied when the caller omits the param. + schema: + type: string + enum: [asc, desc] + default: desc - name: from_date in: query required: false @@ -11725,8 +11820,8 @@ paths: `/alerts/totals` and `/alerts/trend`. Supports date range (`from_date`/`to_date`, RFC3339) and multi-value label - filters (`alertname`, `severity`, `status`). All multi-value filters: OR within - the same filter, AND across filters. + filters (`system_key`, `alertname`, `severity`, `status`). All multi-value + filters: OR within the same filter, AND across filters. Customer callers are always pinned to their own organization regardless of params. security: @@ -11758,7 +11853,17 @@ paths: type: string enum: [id, alertname, severity, status, starts_at, ends_at, created_at] default: created_at - - $ref: '#/components/parameters/SortDirectionParam' + - name: sort_direction + in: query + required: false + description: | + Sort direction. Unlike the shared default of `asc`, this endpoint + defaults to `desc` so the natural "most recent first" ordering is + applied when the caller omits the param. + schema: + type: string + enum: [asc, desc] + default: desc - name: from_date in: query description: Lower bound on `created_at` (inclusive). RFC3339 timestamp. @@ -11773,6 +11878,17 @@ paths: type: string format: date-time example: "2026-05-08T00:00:00Z" + - name: system_key + in: query + description: | + Filter by one or more system keys. 
Repeat the param to pass multiple + values; results are matched as `system_key IN (...)`. + schema: + type: array + items: + type: string + style: form + explode: true - name: alertname in: query description: Filter by alertname. Supports multiple values. @@ -11908,7 +12024,7 @@ paths: # ALERTING ENDPOINTS (Per-alert audit timeline) # =========================================== - /alerts/{fingerprint}/activity: + /alerts/activity/{fingerprint}: parameters: - name: fingerprint in: path @@ -11933,7 +12049,7 @@ paths: description: | Returns the audit timeline for the alert identified by `fingerprint`, most recent first. Events are written transparently as silences are created, updated, or - removed via the `/api/systems/:id/alerts/silences` endpoints. + removed via the `/api/alerts/silences` and `/api/systems/:id/alerts/silences` endpoints. Operator notes are stored as the silence `comment` (Alertmanager native), so a note edit appears here as a `silence_updated` event whose `details` payload @@ -12024,6 +12140,414 @@ paths: '403': $ref: '#/components/responses/Forbidden' + # =========================================== + # ALERTING ENDPOINTS (Backend - Cross-system silences) + # =========================================== + + /alerts/silences: + get: + operationId: getAlertSilences + tags: + - Backend - Alerts + summary: List active+pending silences across the caller's hierarchy + description: | + Cross-system parallel of `GET /systems/{id}/alerts/silences`. Returns + every active or pending Alertmanager silence in the caller's scope, + enriched with `organization_id` (the tenant that owns the silence) and + `system_key` (extracted from the silence matchers). Expired silences + and silences without a `system_key` matcher are excluded — only + silences our UI ever creates are addressable. + + Scope follows the same three modes as `/alerts/totals`: + - `organization_id` omitted → caller's full hierarchy (cross-tenant fan-out). 
+ - `organization_id=X` → single tenant `X`. + - `organization_id=X&include=descendants` → `X` plus its sub-tree. + + Requires `read:systems` permission. + security: + - BearerAuth: [] + parameters: + - name: organization_id + in: query + description: | + Target organization ID(s). Repeat the param for multiple values. + Optional for every role; ignored for Customer callers. + schema: + type: array + items: + type: string + style: form + explode: true + - name: include + in: query + description: Set to `descendants` together with `organization_id` to expand each value to its sub-tree. + schema: + type: string + enum: [descendants] + - name: system_key + in: query + description: | + Filter silences by one or more system keys (exact match on the + `system_key` matcher). Repeat the param for multiple values. + schema: + type: array + items: + type: string + style: form + explode: true + responses: + '200': + description: List of system-scoped silences + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: silences retrieved successfully + data: + type: object + properties: + silences: + type: array + items: + allOf: + - $ref: '#/components/schemas/AlertmanagerSilence' + - type: object + properties: + organization_id: + type: string + description: Tenant that owns the silence (Mimir stores silences per-tenant). + example: "m4m3mdjdiizs" + system_key: + type: string + description: System key extracted from the silence matchers. + example: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + warnings: + type: array + description: | + Per-tenant fan-out errors. Always present (empty when + every tenant responded OK). Each entry is a string + `org <organization_id>: <error>`.
+ items: + type: string + examples: + OneActive: + summary: One active silence on a customer system + value: + code: 200 + message: silences retrieved successfully + data: + silences: + - id: "d9f91c6e-1b33-484e-befa-bfb41020e178" + organization_id: "m4m3mdjdiizs" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + matchers: + - name: "system_key" + value: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + isRegex: false + - name: "alertname" + value: "HighCPUUsage" + isRegex: false + - name: "severity" + value: "warning" + isRegex: false + startsAt: "2026-05-12T08:16:36Z" + endsAt: "2026-05-12T09:16:36Z" + updatedAt: "2026-05-12T08:16:36Z" + createdBy: "amelia.foster" + comment: "muted during maintenance window" + status: + state: "active" + warnings: [] + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '500': + $ref: '#/components/responses/InternalServerError' + post: + operationId: createAlertSilence + tags: + - Backend - Alerts + summary: Mute an alert across systems + description: | + Cross-system parallel of `POST /systems/{id}/alerts/silences`. Mutes + an active alert identified by fingerprint inside a single tenant + (`?organization_id=`). The backend looks up the alert in Mimir, + extracts `system_key` from its labels, builds the matchers + server-side, and delegates to the same silence-creation path used by + the per-system endpoint — so the silence object stored in Mimir is + byte-identical regardless of which route created it. + + If `end_at` is set it takes precedence over `duration_minutes`. + + Requires `manage:systems` permission. + security: + - BearerAuth: [] + parameters: + - name: organization_id + in: query + required: true + description: | + Tenant that owns the alert. Mandatory for every role except + Customer (where it is ignored — they're always pinned to their + own organization). Owners can address any tenant in the system. 
+ schema: + type: string + example: "m4m3mdjdiizs" + requestBody: + required: true + content: + application/json: + schema: + type: object + required: + - fingerprint + properties: + fingerprint: + type: string + description: Fingerprint of the active alert to silence. + example: "0a9d04bb6eed523f" + comment: + type: string + description: Optional silence comment. Defaults to a system-generated value when empty. + example: "silenced during maintenance" + duration_minutes: + type: integer + minimum: 1 + maximum: 10080 + description: Optional duration in minutes. Defaults to 60 when omitted. Ignored when end_at is set. + example: 60 + end_at: + type: string + format: date-time + description: Optional explicit end time (RFC3339). Takes precedence over duration_minutes. + example: "2026-05-12T09:16:36Z" + examples: + ExplicitEndAt: + summary: Silence until a specific date/time + value: + fingerprint: "0a9d04bb6eed523f" + comment: "silenced during maintenance window" + end_at: "2026-05-12T09:16:36Z" + responses: + '200': + description: Alert silenced successfully + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: "alert silenced successfully" + data: + type: object + properties: + silence_id: + type: string + example: "d9f91c6e-1b33-484e-befa-bfb41020e178" + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' + + /alerts/silences/{silence_id}: + get: + operationId: getAlertSilence + tags: + - Backend - Alerts + summary: Read a single silence + description: | + Cross-system parallel of `GET /systems/{id}/alerts/silences/{silence_id}`. + Looks up a silence inside a single tenant (`?organization_id=`) and + returns it enriched with `organization_id` and `system_key`. 
Silences + without a `system_key` matcher are reported as 404 — they don't belong + to our domain. + + Requires `read:systems` permission. + security: + - BearerAuth: [] + parameters: + - name: silence_id + in: path + required: true + description: Alertmanager silence ID. + schema: + type: string + example: "d9f91c6e-1b33-484e-befa-bfb41020e178" + - name: organization_id + in: query + required: true + description: | + Tenant that owns the silence. Mandatory for every role except + Customer (where it is ignored). + schema: + type: string + example: "m4m3mdjdiizs" + responses: + '200': + description: Silence retrieved successfully + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: "silence retrieved successfully" + data: + type: object + properties: + silence: + allOf: + - $ref: '#/components/schemas/AlertmanagerSilence' + - type: object + properties: + organization_id: + type: string + example: "m4m3mdjdiizs" + system_key: + type: string + example: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' + put: + operationId: updateAlertSilence + tags: + - Backend - Alerts + summary: Update a silence's end time / comment + description: | + Cross-system parallel of `PUT /systems/{id}/alerts/silences/{silence_id}`. + Preserves the original matchers and start time; only `end_at` and + `comment` change. Refuses to operate on silences without a + `system_key` matcher (404). Requires `manage:systems` permission. 
+ security: + - BearerAuth: [] + parameters: + - name: silence_id + in: path + required: true + schema: + type: string + example: "d9f91c6e-1b33-484e-befa-bfb41020e178" + - name: organization_id + in: query + required: true + schema: + type: string + example: "m4m3mdjdiizs" + requestBody: + required: true + content: + application/json: + schema: + type: object + required: + - end_at + properties: + comment: + type: string + example: "extended for maintenance window" + end_at: + type: string + format: date-time + example: "2026-05-12T12:16:36Z" + responses: + '200': + description: Silence updated successfully + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: "silence updated successfully" + data: + type: object + properties: + silence_id: + type: string + example: "f1e1c2a4-7e57-4b1a-aaa0-2b96c8b5a3aa" + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' + delete: + operationId: deleteAlertSilence + tags: + - Backend - Alerts + summary: Unmute an alert + description: | + Cross-system parallel of `DELETE /systems/{id}/alerts/silences/{silence_id}`. + Removes a system-scoped silence; generic Alertmanager silences (no + `system_key` matcher) are not addressable through this endpoint and + return 404. Requires `manage:systems` permission. 
+ security: + - BearerAuth: [] + parameters: + - name: silence_id + in: path + required: true + schema: + type: string + example: "d9f91c6e-1b33-484e-befa-bfb41020e178" + - name: organization_id + in: query + required: true + schema: + type: string + example: "m4m3mdjdiizs" + responses: + '200': + description: Silence disabled successfully + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: "silence disabled successfully" + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' + # =========================================== # ALERTING ENDPOINTS (Backend - Configuration) # =========================================== @@ -12349,8 +12873,8 @@ paths: extra round-trip per row. Sortable by `starts_at` (default desc), `severity` (criticality rank: critical - > warning > info), or `alertname`. `fingerprint` is used as a stable tiebreaker - so pagination doesn't shift between requests. + > warning > info), `alertname`, or `status` (Alertmanager state). `fingerprint` + is used as a stable tiebreaker so pagination doesn't shift between requests. Scope follows the same three modes as `/alerts/totals`: - `organization_id` omitted → caller's full hierarchy (cross-tenant fan-out). @@ -12404,7 +12928,7 @@ paths: description: Sort column (allowlist). schema: type: string - enum: [starts_at, severity, alertname] + enum: [starts_at, severity, alertname, status] default: starts_at - name: sort_direction in: query @@ -12412,9 +12936,9 @@ paths: type: string enum: [asc, desc] default: desc - - name: state + - name: status in: query - description: Filter alerts by state. Supports multiple values. + description: Filter alerts by Alertmanager state. Supports multiple values. 
schema: type: array items: From 47b95e67b67545c3059a1e467d95c52ab9642702 Mon Sep 17 00:00:00 2001 From: Edoardo Spadoni Date: Mon, 18 May 2026 08:58:07 +0200 Subject: [PATCH 06/10] feat(apitool): surface system_secret, document collect-based alert push create-system now prints system_secret (returned only at creation) so systems can authenticate to collect. Rewrite the alert-push docs to go through collect with Basic Auth + the required register step instead of hitting Mimir directly. Document the organization_id alert label in openapi. --- backend/cmd/apitool/README.md | 53 +++++++++++++++++++++++++++++------ backend/cmd/apitool/client.go | 21 ++++++++------ backend/cmd/apitool/main.go | 10 +++++-- backend/openapi.yaml | 13 ++++++--- 4 files changed, 73 insertions(+), 24 deletions(-) diff --git a/backend/cmd/apitool/README.md b/backend/cmd/apitool/README.md index c97ca5f0..29281791 100644 --- a/backend/cmd/apitool/README.md +++ b/backend/cmd/apitool/README.md @@ -145,24 +145,61 @@ working user whose token reflects the real hierarchy. ## Push test alerts -`apitool` doesn't push alerts itself — they go straight to Mimir Alertmanager, -which is per-tenant by `X-Scope-OrgID`. The org's `logto_id` (visible via -`apitool list`) is the tenant ID: +`apitool` doesn't push alerts itself. Alerts **must go through the `collect` +service**, not directly to Mimir Alertmanager. Hitting Mimir on `:9009` directly +bypasses authentication, skips the server-side label enrichment +(`system_key`, `system_id`, `organization_*`, …) and annotation templating, and +is not how real appliances behave — the GET `/api/alerts` aggregation will not +show such alerts as expected. + +`collect` authenticates the pushing **system** via HTTP Basic Auth +(`system_key:system_secret`) and injects `X-Scope-OrgID` itself from that +system's organization. So you push as a system, never as an org, and you never +send `X-Scope-OrgID` yourself. 
+ +`create-system` now prints both the `system_key` and the full `system_secret` +token (`my_<public>.<secret>`). The secret is only ever returned at creation +time — save it. Use `apitool create-system ... ` from the hierarchy example +above to get a system. + +**The system must be registered before `collect` will accept it.** `collect` +rejects appliances that have a valid secret on file but never completed +`POST /api/systems/register` (you'd get `401 invalid system credentials` on the +push otherwise). Register once with the `system_secret`: + +```bash +SYSTEM_KEY='NETH-...' +SYSTEM_SECRET='my_....' + +# Public, unauthenticated endpoint — the secret is the credential. +curl -s -X POST "http://localhost:8080/api/systems/register" \ + -H "Content-Type: application/json" \ + -d "{\"system_secret\":\"$SYSTEM_SECRET\"}" +``` + +Then push the alert through `collect`: ```bash -ORG=$(./apitool list | awk '/TestCust1/ {print $3; exit}') NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") END=$(date -u -v+1H +"%Y-%m-%dT%H:%M:%SZ") -curl -s -X POST "http://localhost:9009/alertmanager/api/v2/alerts" \ - -H "X-Scope-OrgID: $ORG" -H "Content-Type: application/json" \ +# collect: localhost:18081 in the docker-compose full stack +curl -s -X POST \ + "http://localhost:18081/api/services/mimir/alertmanager/api/v2/alerts" \ + -u "$SYSTEM_KEY:$SYSTEM_SECRET" \ + -H "Content-Type: application/json" \ -d "[{\"startsAt\":\"$NOW\",\"endsAt\":\"$END\", \"labels\":{\"alertname\":\"HighCPU\",\"severity\":\"critical\", - \"system_key\":\"NETH-...\",\"instance\":\"test\"}, + \"instance\":\"test\"}, \"annotations\":{\"summary\":\"Test alert\"}}]" ``` -Then verify aggregation through `/api/alerts/totals` with each role's token. +`collect` enriches the payload with the authoritative `system_key`, +`system_id`, `system_name`, `organization_*` labels itself — don't set them in +the request; any client-supplied values are overridden.
+ +Then verify aggregation through `/api/alerts/totals` (and `/api/alerts`) with +each role's token. ## Known quirks diff --git a/backend/cmd/apitool/client.go b/backend/cmd/apitool/client.go index 8715aec7..14cace01 100644 --- a/backend/cmd/apitool/client.go +++ b/backend/cmd/apitool/client.go @@ -499,32 +499,35 @@ func (c *Client) ListUsersInOrg(orgID string) ([]struct{ LogtoID, Email string } return out, nil } -// CreateSystem creates a system under an org. Returns the system_key. -func (c *Client) CreateSystem(name, orgID string) (string, error) { +// CreateSystem creates a system under an org. Returns the system_key and the +// full system_secret token (my_.), the latter only ever +// returned by the API at creation time. +func (c *Client) CreateSystem(name, orgID string) (key, secret string, err error) { payload := map[string]interface{}{ "name": name, "organization_id": orgID, } r, err := c.api("POST", "/systems", payload) if err != nil { - return "", err + return "", "", err } if r.status >= 400 { - return "", fmt.Errorf("create system failed (%d): %s", r.status, r.body) + return "", "", fmt.Errorf("create system failed (%d): %s", r.status, r.body) } var resp struct { Data struct { - SystemKey string `json:"system_key"` - ID string `json:"id"` + SystemKey string `json:"system_key"` + SystemSecret string `json:"system_secret"` + ID string `json:"id"` } `json:"data"` } if err := json.Unmarshal(r.body, &resp); err != nil { - return "", err + return "", "", err } if resp.Data.SystemKey == "" { - return "", fmt.Errorf("no system_key in response: %s", r.body) + return "", "", fmt.Errorf("no system_key in response: %s", r.body) } - return resp.Data.SystemKey, nil + return resp.Data.SystemKey, resp.Data.SystemSecret, nil } func (c *Client) ResetPassword(userID, password string) error { diff --git a/backend/cmd/apitool/main.go b/backend/cmd/apitool/main.go index 97c2a6cb..ac4855ff 100644 --- a/backend/cmd/apitool/main.go +++ b/backend/cmd/apitool/main.go @@ -101,7 
+101,7 @@ Usage: still has child orgs/users; clean those out first. apitool create-system --org= - Create a system under a customer org. Prints the system_key. + Create a system under a customer org. Prints system_key + system_secret. apitool cleanup-orphans --org= Soft-delete every user listed in whose email is NOT in registry. @@ -482,11 +482,15 @@ func cmdCreateSystem(args []string) error { if err != nil { return err } - systemKey, err := client.CreateSystem(systemName, org.LogtoID) + systemKey, systemSecret, err := client.CreateSystem(systemName, org.LogtoID) if err != nil { return err } - fmt.Printf("Created system %q in org %q (system_key=%s)\n", systemName, orgKey, systemKey) + fmt.Printf("Created system %q in org %q\n", systemName, orgKey) + fmt.Printf(" system_key=%s\n", systemKey) + fmt.Printf(" system_secret=%s\n", systemSecret) + fmt.Printf("\nPush alerts as this system (Basic Auth) through collect:\n") + fmt.Printf(" curl -u '%s:%s' http://localhost:18081/api/services/mimir/alertmanager/api/v2/alerts ...\n", systemKey, systemSecret) return nil } diff --git a/backend/openapi.yaml b/backend/openapi.yaml index 8e3c7c5f..12736135 100644 --- a/backend/openapi.yaml +++ b/backend/openapi.yaml @@ -392,10 +392,11 @@ components: description: | Alert labels. Always includes server-stamped identity labels (`system_id`, `system_key`, `system_name`, `system_type`, - `system_fqdn`, `system_ipv4`, `organization_name`, - `organization_vat`, `organization_type`) plus the alert's own - labels (`alertname`, `severity`, ...). Use `system_id` to link - to the system detail page (/systems/:id). + `system_fqdn`, `system_ipv4`, `organization_id`, + `organization_name`, `organization_vat`, `organization_type`) + plus the alert's own labels (`alertname`, `severity`, ...). Use + `system_id` to link to the system detail page (/systems/:id) and + `organization_id` to link to the organization. 
example: alertname: "DiskFilling" severity: "warning" @@ -403,6 +404,10 @@ components: system_key: "NETH-FBB2-1A6E-7CAD-44A4-A772-B3EE-F0F6-F371" system_name: "cust1-sys-A" system_type: "ns8" + organization_id: "org-1a2b3c4d" + organization_name: "Test Customer" + organization_type: "customer" + organization_vat: "123456789012" annotations: type: object additionalProperties: From 9cfab9054d8e22af98f38870c47069008f7b0be5 Mon Sep 17 00:00:00 2001 From: Edoardo Spadoni Date: Mon, 18 May 2026 08:58:13 +0200 Subject: [PATCH 07/10] feat(alerts): inject organization_id label on enriched alerts organization_id was resolved for X-Scope-OrgID but never added to the authoritative label set, so the frontend had no ID to make the Organization column clickable. Add it alongside organization_name/vat/type. --- collect/alerting/mimir.go | 1 + collect/alerting/mimir_test.go | 1 + 2 files changed, 2 insertions(+) diff --git a/collect/alerting/mimir.go b/collect/alerting/mimir.go index 172f5f8a..4827964d 100644 --- a/collect/alerting/mimir.go +++ b/collect/alerting/mimir.go @@ -124,6 +124,7 @@ func BuildSystemAlertContext(metadata SystemAlertMetadata) *SystemAlertContext { "system_type": metadata.SystemType, "system_fqdn": metadata.SystemFQDN, "system_ipv4": metadata.SystemIPv4, + "organization_id": metadata.OrganizationID, "organization_name": metadata.OrganizationName, "organization_vat": metadata.OrganizationVAT, "organization_type": metadata.OrganizationType, diff --git a/collect/alerting/mimir_test.go b/collect/alerting/mimir_test.go index 9c62de9f..3979c91d 100644 --- a/collect/alerting/mimir_test.go +++ b/collect/alerting/mimir_test.go @@ -40,6 +40,7 @@ func TestBuildSystemAlertContext(t *testing.T) { "system_type": "ns8", "system_fqdn": "web-01.example.com", "system_ipv4": "192.0.2.10", + "organization_id": "org-1", "organization_name": "Acme Corp", "organization_vat": "IT00000000001", "organization_type": "customer", From 2e80f07188480d6c8baf8d8eed9e17b24d5700af Mon Sep 17 
00:00:00 2001 From: Edoardo Spadoni Date: Mon, 18 May 2026 13:56:48 +0200 Subject: [PATCH 08/10] feat(alerts): privileged effective-config inspection endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GET /alerts/config/effective returns a tenant's per-layer chain (Owner→tenant), merged effective layer, and rendered Mimir YAML. Gated by the new config:alerts permission (owner-only super role). Redaction extended to mask telegram bot_token in YAML list form and webhook url path/query, matching the layer redaction. --- backend/main.go | 35 ++----- backend/methods/alerting.go | 25 +++++ backend/openapi.yaml | 96 +++++++++++++++++ backend/services/alerting/client.go | 20 +++- backend/services/alerting/effective.go | 110 ++++++++++++++++++-- backend/services/alerting/redaction.go | 19 +++- backend/services/alerting/redaction_test.go | 55 ++++++++++ 7 files changed, 319 insertions(+), 41 deletions(-) create mode 100644 backend/services/alerting/redaction_test.go diff --git a/backend/main.go b/backend/main.go index 7fe55262..d0b4360e 100644 --- a/backend/main.go +++ b/backend/main.go @@ -269,50 +269,30 @@ func main() { } // =========================================== - // ALERTS - operations stay on `systems` (read:systems / manage:systems); - // configuration policy is gated by the dedicated `alerts` resource so - // only Admin/Super can rewrite the layered MSP policy. + // ALERTS - operations on `systems` RBAC; config policy on `alerts` resource // =========================================== alertsGroup := customAuthWithAudit.Group("/alerts", middleware.RequireResourcePermission("systems")) { - // Lists: active (Mimir live) and resolved (DB history). Both honor - // the same scope rules (hierarchy / single / descendants) and the - // same multi-value label filters. 
+ // Lists: active (Mimir live) + resolved (DB history); same scope rules + label filters alertsGroup.GET("", methods.GetAlerts) // List active alerts (cross-hierarchy paginated) alertsGroup.GET("/history", methods.GetAlertsHistory) // List resolved alerts from DB (paginated, date range + filters) - // Aggregations: counts, trend over time, top-N + MTTR/MTBF. + // Aggregations: counts, trend over time, top-N + MTTR/MTBF alertsGroup.GET("/totals", methods.GetAlertsTotals) // Alert counts by severity + history total alertsGroup.GET("/trend", methods.GetAlertsTrend) // Alert history trend with daily data points alertsGroup.GET("/stats", methods.GetAlertsStats) // Aggregate stats: severity buckets, top-N alertname/system_key, MTTR/MTBF - // Per-alert audit timeline (silence created/updated/removed events for the alert detail drawer). - // The "activity" literal segment comes BEFORE the param so this path - // doesn't collide with /alerts/silences/{silence_id} (3-segment param-second pattern). + // Per-alert audit timeline; "activity" literal before the param to avoid colliding with /silences/{silence_id} alertsGroup.GET("/activity/:fingerprint", methods.GetAlertActivity) - // Silences (cross-system mute). Mirrors /systems/:id/alerts/silences* - // but takes ?organization_id= for the per-id ops and resolves the - // system_key from the alert labels (POST) or the silence matchers - // (PUT/DELETE). RBAC stays on `systems`: read:systems for GET, - // manage:systems for POST/PUT/DELETE. + // Cross-system silences; mirrors /systems/:id/alerts/silences*, ?organization_id= for per-id ops, RBAC on `systems` alertsGroup.GET("/silences", methods.GetAlertSilences) // List active+pending silences across the caller's hierarchy alertsGroup.POST("/silences", methods.CreateAlertSilence) // Mute an alert (body: { fingerprint, end_at, comment, duration_minutes? 
}) alertsGroup.GET("/silences/:silence_id", methods.GetAlertSilence) // Get a single silence (requires ?organization_id=) alertsGroup.PUT("/silences/:silence_id", methods.UpdateAlertSilence) // Update a silence's end time / comment (requires ?organization_id=) alertsGroup.DELETE("/silences/:silence_id", methods.DeleteAlertSilence) // Unmute (requires ?organization_id=) - // Configuration management (per-org layered model) — gated on the - // dedicated `alerts` resource. GET → read:alerts, POST/DELETE → manage:alerts. - // The handler always operates on the caller's own organization layer; merged - // effective views never leave the backend (only the local server-side render - // to Mimir consumes them). - // - // MaxBodySize(1 MiB) caps the JSON payload before binding to prevent - // memory-exhaustion DoS via crafted oversized layers. With the - // per-field `max=N` constraints in models.AlertingConfigLayer the - // realistic worst case is well under 64 KB; 1 MiB leaves comfortable - // headroom for legitimate use of the full recipient lists. 
+ // Caller's own layer (read/manage:alerts); body capped at 1 MiB configGroup := alertsGroup.Group("/config", middleware.RequireResourcePermission("alerts"), middleware.MaxBodySize(1<<20), @@ -322,6 +302,9 @@ func main() { configGroup.POST("", methods.ConfigureAlerts) // Save caller's layer + propagate to descendants (manage:alerts required) configGroup.DELETE("", methods.DisableAlerts) // Remove caller's layer + propagate to descendants (manage:alerts required) } + + // Merged effective config + Mimir YAML for any tenant; config:alerts (owner-only super), secrets redacted + alertsGroup.GET("/config/effective", middleware.RequirePermission("config:alerts"), methods.GetEffectiveAlertingConfig) } // =========================================== diff --git a/backend/methods/alerting.go b/backend/methods/alerting.go index 7cebd24a..7afa5f28 100644 --- a/backend/methods/alerting.go +++ b/backend/methods/alerting.go @@ -768,6 +768,31 @@ func GetAlertingConfig(c *gin.Context) { })) } +// GetEffectiveAlertingConfig handles GET /api/alerts/config/effective: privileged per-layer + merged config + Mimir YAML for any tenant (organization_id required; nonexistent id → empty config), secrets redacted. 
+func GetEffectiveAlertingConfig(c *gin.Context) { + if _, ok := helpers.GetUserFromContext(c); !ok { + return + } + + orgID := c.Query("organization_id") + if !requireOrgID(c, orgID) { + return + } + + report, err := alerting.BuildEffectiveConfigReport(orgID) + if err != nil { + if errors.Is(err, alerting.ErrChainTooDeep) { + c.JSON(http.StatusUnprocessableEntity, response.UnprocessableEntity("organization ancestor chain is too deep or contains a cycle", nil)) + return + } + logger.Error().Err(err).Str("org_id", orgID).Msg("failed to build effective alerting config report") + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to build effective alerting config", nil)) + return + } + + c.JSON(http.StatusOK, response.OK("effective alerting configuration retrieved successfully", alerting.RedactEffectiveConfigReport(report))) +} + // alertsTotalsFanoutTimeout caps how long /totals will wait for Mimir to answer // across the caller's whole hierarchy. Per-tenant calls that don't return in // time are reported as warnings; their counts simply don't contribute. Tuned diff --git a/backend/openapi.yaml b/backend/openapi.yaml index 12736135..130316a4 100644 --- a/backend/openapi.yaml +++ b/backend/openapi.yaml @@ -12865,6 +12865,102 @@ paths: '403': $ref: '#/components/responses/Forbidden' + /alerts/config/effective: + get: + operationId: getEffectiveAlertingConfig + tags: + - Backend - Alerts + summary: Inspect a tenant's effective (merged) alerting config + description: | + Privileged troubleshooting view. Returns the configuration a tenant + ACTUALLY receives: the per-layer contribution of every organization + in its ancestor chain (Owner → tenant), the merged effective layer, + and the rendered Alertmanager YAML pushed to Mimir for that tenant. + + Unlike `GET /alerts/config` (which returns only the caller's own + layer), this exposes the full inherited + merged view, so it is + gated by the dedicated `config:alerts` permission. 
That permission + lives solely on the `super` user role, which is owner-assignable + only — so in practice only an Owner-org Super Admin can reach this. + It is not reachable by Distributor/Reseller admins. + + Secrets are redacted in the response: telegram `bot_token` and + webhook URL path/query in every layer and in the effective layer, + and SMTP credentials / bearer / bot tokens in the rendered YAML. + + `organization_id` is required and may target ANY tenant. A + nonexistent id returns an empty effective config (no error) — the + honest answer for a diagnostic tool. Read-only: no Mimir push, no + DB writes. + security: + - BearerAuth: [] + parameters: + - name: organization_id + in: query + required: true + schema: + type: string + description: Logto organization id of the tenant to inspect. + responses: + '200': + description: Effective configuration report + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: effective alerting configuration retrieved successfully + data: + type: object + properties: + organization_id: + type: string + chain: + type: array + description: | + Contributing layers ordered Owner first → tenant + last. Orgs with no saved layer are listed with + `has_layer: false` and an empty layer. + items: + type: object + properties: + organization_id: + type: string + organization_name: + type: string + organization_role: + type: string + enum: [owner, distributor, reseller, customer] + has_layer: + type: boolean + layer: + $ref: '#/components/schemas/AlertingConfigLayer' + updated_by_name: + type: string + nullable: true + updated_at: + type: string + format: date-time + nullable: true + effective: + $ref: '#/components/schemas/AlertingConfigLayer' + yaml: + type: string + description: Rendered Alertmanager YAML (secrets redacted). 
+ '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '422': + $ref: '#/components/responses/UnprocessableEntity' + /alerts: get: operationId: getAlerts diff --git a/backend/services/alerting/client.go b/backend/services/alerting/client.go index 6faaaec7..673848ca 100644 --- a/backend/services/alerting/client.go +++ b/backend/services/alerting/client.go @@ -33,14 +33,28 @@ const maxMimirResponseSize = 10 << 20 // 10 MB var smtpSensitiveFields = regexp.MustCompile(`(?m)^(\s*smtp_(?:smarthost|auth_username|auth_password):\s*).*$`) var bearerTokenField = regexp.MustCompile(`(?m)^(\s*credentials:\s*).*$`) -var telegramTokenField = regexp.MustCompile(`(?m)^(\s*bot_token:\s*).*$`) -// RedactSensitiveConfig replaces sensitive SMTP and bearer token fields in an alertmanager config -// YAML string with a redaction placeholder before returning to clients. +// bot_token is the first key of each telegram_configs list item, so the +// rendered YAML emits it as `- bot_token: ...`; match the optional list +// dash so the token is scrubbed in that form too. +var telegramTokenField = regexp.MustCompile(`(?m)^(\s*(?:-\s+)?bot_token:\s*).*$`) + +// webhook_configs `url:` carries bearer-equivalent secrets in its path/query +// (e.g. Slack incoming-webhook IDs); mask it like the layer redaction does +// (scheme+host kept, path/query stripped) for parity with RedactLayerForAudit. +var webhookURLField = regexp.MustCompile(`(?m)^(\s*-?\s*url:\s*)'?([^'\n]+)'?\s*$`) + +// RedactSensitiveConfig scrubs SMTP credentials, bearer tokens, telegram bot +// tokens, and webhook URL secrets from an alertmanager config YAML before it +// is returned to clients. 
func RedactSensitiveConfig(config string) string { config = smtpSensitiveFields.ReplaceAllString(config, "${1}'[REDACTED]'") config = bearerTokenField.ReplaceAllString(config, "${1}'[REDACTED]'") config = telegramTokenField.ReplaceAllString(config, "${1}'[REDACTED]'") + config = webhookURLField.ReplaceAllStringFunc(config, func(line string) string { + m := webhookURLField.FindStringSubmatch(line) + return m[1] + "'" + maskWebhookURL(strings.TrimSpace(m[2])) + "'" + }) return config } diff --git a/backend/services/alerting/effective.go b/backend/services/alerting/effective.go index cc8b36af..2f2bf3ff 100644 --- a/backend/services/alerting/effective.go +++ b/backend/services/alerting/effective.go @@ -8,6 +8,7 @@ package alerting import ( "context" "fmt" + "time" "github.com/nethesis/my/backend/configuration" "github.com/nethesis/my/backend/database" @@ -97,13 +98,7 @@ func lookupCreatedBy(orgID string) (string, error) { return *parent, nil } -// computeEffectiveLayer is the package-private entry point that walks the -// tenant's ancestor chain, fetches every layer in a single round-trip, and -// merges them in order from Owner to tenant. Empty layers (orgs with no row -// in alert_config_layers) contribute nothing but don't break the chain. -// -// Package-private intentionally: the merged view never leaves the backend. -// Only RenderAndPushEffective uses it, to drive the Mimir YAML push. +// computeEffectiveLayer walks the tenant's ancestor chain and merges every layer Owner→tenant (orgs with no row contribute nothing); used by RenderAndPushEffective for the Mimir push. 
func computeEffectiveLayer(tenantOrgID string) (models.AlertingConfigLayer, error) { chain, err := ResolveAncestorChain(tenantOrgID) if err != nil { @@ -155,3 +150,104 @@ func RenderAndPushEffective(ctx context.Context, tenantOrgID string) error { logger.Debug().Str("tenant", tenantOrgID).Msg("effective alerting config pushed to mimir") return nil } + +// EffectiveLayerContribution is one org's stored layer in a tenant's chain; HasLayer false = no row (no contribution), Layer is unredacted until RedactEffectiveConfigReport. +type EffectiveLayerContribution struct { + OrganizationID string `json:"organization_id"` + OrganizationName string `json:"organization_name"` + OrganizationRole string `json:"organization_role"` + HasLayer bool `json:"has_layer"` + Layer models.AlertingConfigLayer `json:"layer"` + UpdatedByName *string `json:"updated_by_name"` + UpdatedAt *time.Time `json:"updated_at"` +} + +// EffectiveConfigReport is a tenant's per-layer chain + merged layer + rendered Mimir YAML; unredacted until RedactEffectiveConfigReport. +type EffectiveConfigReport struct { + OrganizationID string `json:"organization_id"` + Chain []EffectiveLayerContribution `json:"chain"` + Effective models.AlertingConfigLayer `json:"effective"` + YAML string `json:"yaml"` +} + +// BuildEffectiveConfigReport resolves the chain, merges layers Owner→tenant, and renders the YAML; read-only, no Mimir push. 
+func BuildEffectiveConfigReport(tenantOrgID string) (EffectiveConfigReport, error) { + chain, err := ResolveAncestorChain(tenantOrgID) + if err != nil { + return EffectiveConfigReport{}, err + } + repo := entities.NewLocalAlertConfigLayersRepository() + layersByOrg, err := repo.GetByOrgIDs(chain) + if err != nil { + return EffectiveConfigReport{}, err + } + + contributions := make([]EffectiveLayerContribution, 0, len(chain)) + ordered := make([]models.AlertingConfigLayer, 0, len(chain)) + for _, oid := range chain { + ident, err := lookupOrgIdentity(oid) + if err != nil { + return EffectiveConfigReport{}, fmt.Errorf("resolve org identity for %s: %w", oid, err) + } + contribution := EffectiveLayerContribution{ + OrganizationID: oid, + OrganizationName: ident.Name, + OrganizationRole: ident.Role, + } + if rec, ok := layersByOrg[oid]; ok { + updatedAt := rec.UpdatedAt + contribution.HasLayer = true + contribution.Layer = rec.Config + contribution.UpdatedByName = rec.UpdatedByName + contribution.UpdatedAt = &updatedAt + ordered = append(ordered, rec.Config) + } + contributions = append(contributions, contribution) + } + + effective := MergeLayers(ordered) + + cfg := configuration.Config + yamlConfig, err := RenderConfig( + cfg.SMTPHost, cfg.SMTPPort, cfg.SMTPUsername, cfg.SMTPPassword, cfg.SMTPFrom, cfg.SMTPTLS, + cfg.AlertingHistoryWebhookURL, cfg.AlertingHistoryWebhookSecret, + &effective, + ) + if err != nil { + return EffectiveConfigReport{}, fmt.Errorf("render YAML for %s: %w", tenantOrgID, err) + } + + return EffectiveConfigReport{ + OrganizationID: tenantOrgID, + Chain: contributions, + Effective: effective, + YAML: yamlConfig, + }, nil +} + +// orgIdentity is best-effort display metadata for an org in a chain. +type orgIdentity struct { + Name string + Role string +} + +// lookupOrgIdentity returns orgID's name and role from the org tables; absent from all three = Owner. 
+func lookupOrgIdentity(orgID string) (orgIdentity, error) { + row := database.DB.QueryRow( + `SELECT name, 'distributor' AS role FROM distributors WHERE logto_id = $1 AND deleted_at IS NULL + UNION ALL + SELECT name, 'reseller' AS role FROM resellers WHERE logto_id = $1 AND deleted_at IS NULL + UNION ALL + SELECT name, 'customer' AS role FROM customers WHERE logto_id = $1 AND deleted_at IS NULL + LIMIT 1`, + orgID, + ) + var ident orgIdentity + if err := row.Scan(&ident.Name, &ident.Role); err != nil { + if err.Error() == "sql: no rows in result set" { + return orgIdentity{Role: "owner"}, nil + } + return orgIdentity{}, err + } + return ident, nil +} diff --git a/backend/services/alerting/redaction.go b/backend/services/alerting/redaction.go index d8297935..461cb71a 100644 --- a/backend/services/alerting/redaction.go +++ b/backend/services/alerting/redaction.go @@ -17,11 +17,7 @@ import ( // values live in alert_config_layers and are read only by the renderer. const RedactedSecretPlaceholder = "[REDACTED]" -// RedactLayerForAudit returns a copy of `layer` with secrets scrubbed for -// safe inclusion in audit log details. Used exclusively by the audit -// snapshot helpers — the API never exposes layers other than the caller's -// own and there is no /effective endpoint, so this is the only path on -// which a layer's bytes leave their owning context. +// RedactLayerForAudit returns a copy of `layer` with secrets scrubbed, used by audit snapshots and the effective-config inspection response. // // Specifically: // - telegram_recipients[].bot_token → "[REDACTED]" @@ -54,6 +50,19 @@ func RedactLayerForAudit(layer models.AlertingConfigLayer) models.AlertingConfig return out } +// RedactEffectiveConfigReport returns an API-safe copy: layers via RedactLayerForAudit, YAML via RedactSensitiveConfig; original untouched. 
+func RedactEffectiveConfigReport(r EffectiveConfigReport) EffectiveConfigReport { + out := r + out.Chain = make([]EffectiveLayerContribution, len(r.Chain)) + for i, c := range r.Chain { + c.Layer = RedactLayerForAudit(c.Layer) + out.Chain[i] = c + } + out.Effective = RedactLayerForAudit(r.Effective) + out.YAML = RedactSensitiveConfig(r.YAML) + return out +} + // maskWebhookURL keeps scheme + host + port (so the audit log records where // the webhook went) but strips path, query, and fragment which routinely // carry bearer-equivalent secrets (e.g. Slack incoming webhook IDs). diff --git a/backend/services/alerting/redaction_test.go b/backend/services/alerting/redaction_test.go new file mode 100644 index 00000000..96862d26 --- /dev/null +++ b/backend/services/alerting/redaction_test.go @@ -0,0 +1,55 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. +SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package alerting + +import ( + "strings" + "testing" + + "github.com/nethesis/my/backend/models" +) + +func TestRedactEffectiveConfigReport(t *testing.T) { + secretLayer := models.AlertingConfigLayer{ + WebhookRecipients: []models.WebhookRecipient{ + {Name: "slack", URL: "https://hooks.slack.com/services/T00/B00/XXXSECRET"}, + }, + TelegramRecipients: []models.TelegramRecipient{ + {BotToken: "123456:super-secret-token", ChatID: 42}, + }, + } + report := EffectiveConfigReport{ + OrganizationID: "org-tenant", + Chain: []EffectiveLayerContribution{ + {OrganizationID: "org-owner", OrganizationRole: "owner", HasLayer: true, Layer: secretLayer}, + }, + Effective: secretLayer, + YAML: "global:\n smtp_auth_password: 'hunter2'\nreceivers:\n - name: wh\n webhook_configs:\n - url: 'https://hooks.slack.com/services/T00/B00/XXXSECRET'\n - name: tg\n telegram_configs:\n - bot_token: 123456:super-secret-token\n", + } + + out := RedactEffectiveConfigReport(report) + + if got := out.Chain[0].Layer.TelegramRecipients[0].BotToken; got != RedactedSecretPlaceholder { + t.Errorf("chain telegram 
token not redacted: %q", got) + } + if got := out.Chain[0].Layer.WebhookRecipients[0].URL; strings.Contains(got, "XXXSECRET") { + t.Errorf("chain webhook url leaked secret: %q", got) + } + if got := out.Effective.TelegramRecipients[0].BotToken; got != RedactedSecretPlaceholder { + t.Errorf("effective telegram token not redacted: %q", got) + } + if strings.Contains(out.YAML, "hunter2") || strings.Contains(out.YAML, "super-secret-token") || strings.Contains(out.YAML, "XXXSECRET") { + t.Errorf("yaml leaked secrets: %q", out.YAML) + } + if !strings.Contains(out.YAML, "https://hooks.slack.com/"+RedactedSecretPlaceholder) { + t.Errorf("yaml webhook url not masked to host: %q", out.YAML) + } + + // Original report must be left untouched (defensive copy). + if report.Effective.TelegramRecipients[0].BotToken != "123456:super-secret-token" { + t.Errorf("source report mutated: %q", report.Effective.TelegramRecipients[0].BotToken) + } +} From 9fa14a58ce192055ab73d2c8323f5fb77dbcdbeb Mon Sep 17 00:00:00 2001 From: Edoardo Spadoni Date: Mon, 18 May 2026 16:05:59 +0200 Subject: [PATCH 09/10] feat(alerts): static catalog filters endpoint + OpenAPI alerts regroup Add GET /api/filters/alerts: static alert catalog (ns8-metrics#71, nethsecurity#1633, collect heartbeat monitor) plus data-driven systems/severities/organizations scoped to the caller. Split the monolithic Backend - Alerts OpenAPI tag into four semantic groups (Alerts, Configuration, Silences, Per-System) with all summaries path-prefixed. Pure doc reorganization, routes unchanged. 
--- AGENTS.md | 2 +- backend/main.go | 6 + backend/methods/alert_catalog.go | 72 + backend/methods/alerts_filters.go | 195 + backend/openapi.yaml | 6105 +++++++++++++++-------------- 5 files changed, 3407 insertions(+), 2973 deletions(-) create mode 100644 backend/methods/alert_catalog.go create mode 100644 backend/methods/alerts_filters.go diff --git a/AGENTS.md b/AGENTS.md index 06244b19..83cb0fd3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -374,7 +374,7 @@ Authoritative: `backend/openapi.yaml` (also `make docs` / redocly). High-level r /api/alerts, /api/alerts/{totals,trend,stats,history,config} active alerts + config + aggregates + history /api/alerts/silences/* cross-system silences (mute/unmute) — parallel to /systems/:id/alerts/silences /api/alerts/activity/:fingerprint per-alert audit timeline (silence created/updated/removed) -/api/filters/{systems,applications,users} UI filter aggregation +/api/filters/{systems,applications,users,alerts} UI filter aggregation (alerts: static catalog + data-driven systems/severities/orgs) /api/rebranding/* per-org per-product asset management /api/organizations, /api/roles, /api/organization-roles metadata /api/validators/vat/:entity_type VAT validation diff --git a/backend/main.go b/backend/main.go index d0b4360e..e06acb43 100644 --- a/backend/main.go +++ b/backend/main.go @@ -329,6 +329,12 @@ func main() { { usersFiltersGroup.GET("", methods.GetUserFilters) // Aggregated filters: roles, organizations } + + // Alerts filters (read:systems required, mirrors the alerts views) + alertsFiltersGroup := filtersGroup.Group("/alerts", middleware.RequireResourcePermission("systems")) + { + alertsFiltersGroup.GET("", methods.GetAlertFilters) // Aggregated filters: systems, alerts, severities, organizations + } } // =========================================== diff --git a/backend/methods/alert_catalog.go b/backend/methods/alert_catalog.go new file mode 100644 index 00000000..b3c661ae --- /dev/null +++ 
b/backend/methods/alert_catalog.go @@ -0,0 +1,72 @@ +/* +Copyright (C) 2025 Nethesis S.r.l. +SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package methods + +import "sort" + +// AlertCatalogEntry describes a single known alert type produced by NethServer +// (NS8) and NethSecurity systems. +type AlertCatalogEntry struct { + Name string `json:"name"` + Severity string `json:"severity"` // critical | warning | info + Service string `json:"service,omitempty"` // sub-service, when applicable +} + +// alertCatalog is the static, authoritative list of alert names the platform +// can receive. It is intentionally NOT derived from alert_history: the filters +// dropdown must offer every alert a system can raise, not only the ones already +// seen. Severities/organizations/systems remain data-driven; alert names do not. +// +// Source of truth — kept in sync with the vmalert rules shipped by: +// - NethServer / NS8 metrics: NethServer/ns8-metrics#71 +// - NethSecurity: NethServer/nethsecurity#1633 +// +// Plus alerts synthesised by the collect service itself (not raised by a +// system): LinkFailed, emitted by the heartbeat monitor cron when a system +// stops communicating (see collect/cron/linkfailed_monitor.go). +// +// See also the unified Alert Catalog in services/mimir/README.md. The +// placeholder `MyAlert` template rule from nethsecurity is intentionally +// excluded. 
+var alertCatalog = []AlertCatalogEntry{ + // --- Synthesised by collect (heartbeat monitor) --- + {Name: "LinkFailed", Severity: "critical"}, + + // --- NethServer / NS8 (ns8-metrics#71) --- + {Name: "BackupFailed", Severity: "critical", Service: "backup"}, + {Name: "CertExpired", Severity: "critical"}, + {Name: "CertExpiringCritical", Severity: "critical"}, + {Name: "CertExpiringSoon", Severity: "warning"}, + {Name: "DiskSpaceCritical", Severity: "critical", Service: "storage"}, + {Name: "DiskSpaceLow", Severity: "warning", Service: "storage"}, + {Name: "LokiOffline", Severity: "warning", Service: "loki"}, + {Name: "NodeOffline", Severity: "critical"}, + {Name: "RaidDiskFailed", Severity: "critical", Service: "storage"}, + {Name: "RaidDriveMissing", Severity: "critical", Service: "storage"}, + {Name: "SwapFull", Severity: "warning"}, + {Name: "SwapNotPresent", Severity: "critical"}, + + // --- NethSecurity (nethsecurity#1633) --- + {Name: "BackupEncryptionDisabled", Severity: "warning", Service: "backup"}, + {Name: "CriticalCpuUsage", Severity: "warning", Service: "host"}, + {Name: "CriticalMemoryUsage", Severity: "warning", Service: "host"}, + {Name: "DiskSpaceWarning", Severity: "warning", Service: "storage"}, + {Name: "HighCpuUsage", Severity: "info", Service: "host"}, + {Name: "HighMemoryUsage", Severity: "info", Service: "host"}, + {Name: "HighSystemLoad", Severity: "warning", Service: "host"}, + {Name: "ServiceDown", Severity: "critical"}, + {Name: "StorageStatus", Severity: "critical", Service: "storage"}, + {Name: "WanDown", Severity: "critical", Service: "network"}, +} + +// AlertCatalog returns the static alert catalog sorted by alert name. The +// returned slice is a fresh copy so callers cannot mutate the package state. 
+func AlertCatalog() []AlertCatalogEntry { + out := make([]AlertCatalogEntry, len(alertCatalog)) + copy(out, alertCatalog) + sort.Slice(out, func(i, j int) bool { return out[i].Name < out[j].Name }) + return out +} diff --git a/backend/methods/alerts_filters.go b/backend/methods/alerts_filters.go new file mode 100644 index 00000000..25c1f169 --- /dev/null +++ b/backend/methods/alerts_filters.go @@ -0,0 +1,195 @@ +/* +Copyright (C) 2025 Nethesis S.r.l. +SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package methods + +import ( + "fmt" + "net/http" + "strings" + "sync" + + "github.com/gin-gonic/gin" + + "github.com/nethesis/my/backend/database" + "github.com/nethesis/my/backend/helpers" + "github.com/nethesis/my/backend/logger" + "github.com/nethesis/my/backend/response" +) + +// GetAlertFilters handles GET /api/filters/alerts - aggregated filters endpoint +// for the alerts views. Returns the systems, alert names, severities and +// organizations that actually appear in alert_history within the caller's +// scope, so the UI dropdowns only offer values that yield results. +// +// Scope follows the same rules as the other alerts endpoints (resolveOrgScope): +// organization_id omitted = caller's full hierarchy; one/more organization_id = +// those tenants (validated, Owner exempt); customer pinned to own org. +// Single auth + scope resolution, parallel data fetching. +func GetAlertFilters(c *gin.Context) { + user, ok := helpers.GetUserFromContext(c) + if !ok { + return + } + + orgIDs, ok := resolveOrgScope(c, user) + if !ok { + return + } + + type SystemFilter struct { + ID string `json:"id"` + Name string `json:"name"` + Type string `json:"type"` + Key string `json:"key"` + } + type OrganizationFilter struct { + LogtoID string `json:"logto_id"` + Name string `json:"name"` + Type string `json:"type"` + } + + // `alerts` is a static catalog: every alert a system can raise, regardless + // of what has been received so far. It is NOT scoped to the caller's data. 
+ alerts := AlertCatalog() + + // Empty scope → no data-driven rows anywhere; still return the static + // alert catalog so the dropdown is usable. + if len(orgIDs) == 0 { + c.JSON(http.StatusOK, response.OK("alert filters retrieved successfully", gin.H{ + "systems": []SystemFilter{}, + "alerts": alerts, + "severities": []string{}, + "organizations": []OrganizationFilter{}, + })) + return + } + + // Shared IN (...) placeholder list + args for organization_id scoping. + placeholders := make([]string, len(orgIDs)) + args := make([]interface{}, len(orgIDs)) + for i, id := range orgIDs { + placeholders[i] = fmt.Sprintf("$%d", i+1) + args[i] = id + } + inClause := strings.Join(placeholders, ",") + + var ( + systems []SystemFilter + severities []string + organizations []OrganizationFilter + + errSystems, errSeverities, errOrgs error + wg sync.WaitGroup + ) + + wg.Add(3) + + // Systems with at least one alert in scope. + go func() { + defer wg.Done() + + query := fmt.Sprintf(` + SELECT DISTINCT s.id, s.name, COALESCE(s.type, '') AS type, s.system_key + FROM alert_history ah + INNER JOIN systems s ON s.system_key = ah.system_key + WHERE s.deleted_at IS NULL + AND ah.organization_id IN (%s) + ORDER BY s.name ASC + `, inClause) + + rows, err := database.DB.Query(query, args...) + if err != nil { + errSystems = fmt.Errorf("failed to retrieve system filters: %w", err) + return + } + defer func() { _ = rows.Close() }() + + systems = make([]SystemFilter, 0) + for rows.Next() { + var s SystemFilter + if err := rows.Scan(&s.ID, &s.Name, &s.Type, &s.Key); err != nil { + continue + } + systems = append(systems, s) + } + }() + + // Distinct severities. + go func() { + defer wg.Done() + + query := fmt.Sprintf(` + SELECT DISTINCT severity + FROM alert_history + WHERE severity IS NOT NULL + AND severity != '' + AND organization_id IN (%s) + ORDER BY severity ASC + `, inClause) + + rows, err := database.DB.Query(query, args...) 
+ if err != nil { + errSeverities = fmt.Errorf("failed to retrieve severity filters: %w", err) + return + } + defer func() { _ = rows.Close() }() + + severities = make([]string, 0) + for rows.Next() { + var sev string + if err := rows.Scan(&sev); err != nil { + continue + } + severities = append(severities, sev) + } + }() + + // Organizations with at least one alert in scope. + go func() { + defer wg.Done() + + query := fmt.Sprintf(` + SELECT DISTINCT uo.logto_id, uo.name, uo.org_type + FROM alert_history ah + INNER JOIN unified_organizations uo ON uo.logto_id = ah.organization_id + WHERE ah.organization_id IN (%s) + ORDER BY uo.name ASC + `, inClause) + + rows, err := database.DB.Query(query, args...) + if err != nil { + errOrgs = fmt.Errorf("failed to retrieve organization filters: %w", err) + return + } + defer func() { _ = rows.Close() }() + + organizations = make([]OrganizationFilter, 0) + for rows.Next() { + var o OrganizationFilter + if err := rows.Scan(&o.LogtoID, &o.Name, &o.Type); err != nil { + continue + } + organizations = append(organizations, o) + } + }() + + wg.Wait() + + for _, e := range []error{errSystems, errSeverities, errOrgs} { + if e != nil { + logger.Error().Err(e).Str("user_id", user.ID).Msg("Failed in alert filters") + c.JSON(http.StatusInternalServerError, response.InternalServerError("failed to retrieve alert filters", nil)) + return + } + } + + c.JSON(http.StatusOK, response.OK("alert filters retrieved successfully", gin.H{ + "systems": systems, + "alerts": alerts, + "severities": severities, + "organizations": organizations, + })) +} diff --git a/backend/openapi.yaml b/backend/openapi.yaml index 130316a4..94283ecc 100644 --- a/backend/openapi.yaml +++ b/backend/openapi.yaml @@ -86,7 +86,13 @@ tags: description: Backend service health check - name: Backend - Alerts - description: Alert routing configuration and active alert monitoring via Mimir + description: Active alert monitoring and aggregates (list, totals, trend, stats, history, 
activity) + - name: Backend - Alerts Configuration + description: Per-organization alerting layer configuration and effective merged config + - name: Backend - Alerts Silences + description: Cross-hierarchy alert silences (mute/unmute across systems) + - name: Backend - Alerts (Per-System) + description: Alerts and silences scoped to a single system (/systems/{id}/alerts) - name: Collect - Health description: Collect service health and monitoring @@ -8897,36 +8903,54 @@ paths: '404': $ref: '#/components/responses/NotFound' - /systems/{id}/alerts: + /alerts: get: - operationId: getSystemAlerts + operationId: getAlerts tags: - Backend - Alerts - summary: /systems/{id}/alerts - Get active alerts for a system + summary: "/alerts - List active alerts (cross-hierarchy)" description: | - Returns current alerts from Mimir scoped to a single system. Mirrors - the filter, pagination, and sort surface of `GET /alerts`: the only - difference is that `system_key` is pinned to the URL path (the - multi-value `system_key` query filter is therefore not exposed). - Suppressed alerts remain visible so silenced alerts can still be - inspected in the system detail view. + Retrieves active alerts from Mimir for the caller's scope, paginated. Each + alert is enriched with a `system` object (`name`, `type`) looked up from the + local `systems` table, so the UI can render the system column without an + extra round-trip per row. - System identity is carried as labels on each alert (`system_id`, - `system_key`, `system_name`, `system_type`), stamped at ingest time. + Sortable by `starts_at` (default desc), `severity` (criticality rank: critical + > warning > info), `alertname`, or `status` (Alertmanager state). `fingerprint` + is used as a stable tiebreaker so pagination doesn't shift between requests. - Multi-value filters: OR within the same filter, AND across filters. 
+ Scope follows the same three modes as `/alerts/totals`: + - `organization_id` omitted → caller's full hierarchy (cross-tenant fan-out). + - `organization_id=X` → single tenant `X`. + - `organization_id=X&include=descendants` → `X` plus its sub-tree. - Requires `read:systems` permission. + All filter params support **multiple values** (repeat the param): values within + the same filter are matched as OR; different filters AND together. Example: + `?severity=critical&severity=warning&alertname=CVE-2024-1234` returns + CVE-2024-1234 alerts that are critical or warning. + + Per-tenant failures during fan-out (timeout, 5xx) are non-fatal: the rest of the + result is returned and the failure is reported in the `warnings` array. security: - BearerAuth: [] parameters: - - name: id - in: path - required: true - description: System ID (database UUID) + - name: organization_id + in: query + description: | + Target organization ID(s). Repeat the param to pass multiple values. + Optional for all roles except Customer (where it is ignored). + schema: + type: array + items: + type: string + style: form + explode: true + - name: include + in: query + description: Set to `descendants` together with `organization_id` to expand each value to its sub-tree. schema: type: string - example: "sys_123456789" + enum: [descendants] - name: page in: query description: 1-based page number. @@ -8944,7 +8968,7 @@ paths: default: 50 - name: sort_by in: query - description: Sort column (allowlist matches `/alerts`). + description: Sort column (allowlist). schema: type: string enum: [starts_at, severity, alertname, status] @@ -8957,7 +8981,7 @@ paths: default: desc - name: status in: query - description: Filter by Alertmanager state. Supports multiple values. + description: Filter alerts by Alertmanager state. Supports multiple values. schema: type: array items: @@ -8967,7 +8991,7 @@ paths: explode: true - name: severity in: query - description: Filter by severity. Supports multiple values. 
+ description: Filter alerts by severity label. Supports multiple values. schema: type: array items: @@ -8975,9 +8999,20 @@ paths: enum: [critical, warning, info] style: form explode: true + - name: system_key + in: query + description: Filter alerts by system_key label. Supports multiple values. + schema: + type: array + items: + type: string + style: form + explode: true - name: alertname in: query - description: Filter by alertname. Supports multiple values. + description: | + Filter alerts by alertname label (the alert "type" — e.g. `HighCPU`, + `DiskFull`, `CVE-2024-1234`). Supports multiple values. schema: type: array items: @@ -8986,7 +9021,7 @@ paths: explode: true responses: '200': - description: Active alerts retrieved successfully + description: Paginated list of active alerts content: application/json: schema: @@ -8997,7 +9032,7 @@ paths: example: 200 message: type: string - example: "alerts retrieved successfully" + example: alerts retrieved successfully data: type: object properties: @@ -9007,17 +9042,25 @@ paths: $ref: '#/components/schemas/ActiveAlert' pagination: $ref: '#/components/schemas/Pagination' + warnings: + type: array + description: | + Per-tenant errors encountered during fan-out. Always present + (empty array when every tenant responded OK). Each entry is + a string `org : `. + items: + type: string examples: - ActiveAndSuppressedOnSystem: - summary: Two firing alerts on the same system, one silenced + ActiveAlertExample: + summary: One active warning across the caller's hierarchy description: | - The endpoint always includes silenced alerts so the UI can - show the muted state in the system detail view. `state` is - `"suppressed"` when at least one active silence matches the - alert; the matching silence IDs are listed in `silencedBy`. + A single warning alert. System identity (id, key, name, type) is + carried as labels, stamped at ingest time. 
`state="active"` means + Mimir has not been told to silence it; an actively-muted alert + would have `state="suppressed"` and a non-empty `silencedBy`. value: code: 200 - message: "alerts retrieved successfully" + message: alerts retrieved successfully data: alerts: - fingerprint: "0a9d04bb6eed523f" @@ -9029,46 +9072,32 @@ paths: system_name: "test-sys" system_type: "ns8" annotations: - summary: "/var is 92% full" - description: "Disk usage exceeded warning threshold." - status: - state: "suppressed" - silencedBy: - - "d9f91c6e-1b33-484e-befa-bfb41020e178" - inhibitedBy: [] - startsAt: "2026-05-12T08:14:00Z" - endsAt: "2026-05-12T08:44:00Z" - - fingerprint: "11a9302b0fa6526e" - labels: - alertname: "HighCPU" - severity: "critical" - system_id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" - system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - system_name: "test-sys" - system_type: "ns8" - annotations: - summary: "CPU usage 98%" - description: "Sustained high CPU." + summary: "/var is 92% full on test-sys" + description: "Disk usage exceeded the warning threshold." status: state: "active" silencedBy: [] inhibitedBy: [] - startsAt: "2026-05-12T08:20:00Z" - endsAt: "2026-05-12T08:50:00Z" + startsAt: "2026-05-12T08:14:00Z" + endsAt: "2026-05-12T08:44:00Z" pagination: page: 1 page_size: 50 - total_count: 2 + total_count: 1 total_pages: 1 has_next: false has_prev: false - sort_by: "starts_at" - sort_direction: "desc" - NoActiveAlerts: - summary: System has no firing alerts + warnings: [] + PartialFanoutWarning: + summary: One tenant timed out during fan-out + description: | + When `organization_id` is omitted (or `include=descendants` is + used), the request fans out to every tenant in scope. A single + slow Mimir does not fail the whole request — the rest of the + results are returned and the failing tenant lands in `warnings`. 
value: code: 200 - message: "alerts retrieved successfully" + message: alerts retrieved successfully data: alerts: [] pagination: @@ -9078,39 +9107,77 @@ paths: total_pages: 0 has_next: false has_prev: false - sort_by: "starts_at" - sort_direction: "desc" + warnings: + - "org pt8gqs6y5wpr: context deadline exceeded" + '400': + $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' + '500': + $ref: '#/components/responses/InternalServerError' - /systems/{id}/alerts/silences: + # =========================================== + # ALERTING ENDPOINTS (Collect - Mimir Proxy) + # =========================================== + + /alerts/totals: get: - operationId: getSystemAlertSilences + operationId: getAlertsTotals tags: - Backend - Alerts - summary: /systems/{id}/alerts/silences - List active silences for a system + summary: "/alerts/totals - Get alert totals by severity" description: | - Returns all active and pending Alertmanager silences scoped to the target system. - Expired silences are excluded. Results are filtered server-side to silences that carry - an exact `system_key` matcher matching the system's key. - Requires `read:systems` permission. + Returns active alert counts by severity (from Mimir, per-tenant) and total resolved + alert history count (from DB). Requires `read:systems` permission. + + **Scope modes** (selected by query params): + + | `organization_id` | `include` | Result | + |---|---|---| + | omitted | — | Caller's full hierarchy (recursive). For Customer it's just self. | + | `X` | omitted | Single tenant `X` only. Resellers/Distributors hold no alerts on their own tenant — those live on their customer tenants — so single-tenant queries on a non-leaf org typically return zero. | + | `X` (repeated for multi) | omitted | Union of all `organization_id` values passed. 
Each must be in the caller's hierarchy (Owner exempt). | + | `X` (single or multi) | `descendants` | Each org_id is expanded to itself + its sub-tree (deduplicated). Use this to drill into one or more sub-trees. | + + Active counts are aggregated across the resolved scope by fanning out to Mimir, + one request per tenant, with bounded concurrency and a global timeout. Per-tenant + failures (timeout, 5xx, parse error) are non-fatal: their counts simply don't + contribute, and the failure is reported in the `warnings` array. The `history` + total comes from a single SQL query scoped to the same set of organization IDs. + + Customer callers are always pinned to their own organization regardless of + `organization_id`/`include` (Mimir tenant is fixed to `user.organization_id`). security: - BearerAuth: [] parameters: - - name: id - in: path - required: true - description: System ID (database UUID) + - name: organization_id + in: query + description: | + Target organization ID(s). Repeat the param to pass multiple values + (`?organization_id=A&organization_id=B`). Optional for all roles except + Customer (where it is ignored). Distributors/Resellers receive `403` if any + value is not in their hierarchy. + schema: + type: array + items: + type: string + style: form + explode: true + - name: include + in: query + description: | + Set to `descendants` together with `organization_id` to expand each value + to its full sub-tree (results deduplicated). Ignored when `organization_id` + is omitted (the caller's own hierarchy is already used) and when the caller + is a Customer. 
schema: type: string - example: "sys_123456789" + enum: [descendants] responses: '200': - description: Silences retrieved successfully + description: Alert totals retrieved content: application/json: schema: @@ -9121,132 +9188,89 @@ paths: example: 200 message: type: string - example: "silences retrieved successfully" + example: alert totals retrieved successfully data: type: object properties: - silences: + active: + type: integer + description: Total active alerts in scope + critical: + type: integer + description: Active critical alerts in scope + warning: + type: integer + description: Active warning alerts in scope + info: + type: integer + description: Active info alerts in scope + muted: + type: integer + description: Active alerts currently silenced (Alertmanager `silencedBy` non-empty) + history: + type: integer + description: Total resolved alerts in history (DB) in scope + warnings: type: array + description: | + Per-tenant errors encountered during fan-out. Always present + (empty array when every tenant responded OK). Each entry is a + string in the form `org <org_id>: <error>` or + `history: <error>` for the DB lookup. items: - $ref: '#/components/schemas/AlertmanagerSilence' - examples: - ActiveSilence: - summary: One active silence for the system - description: | - The silence matches on `system_key` (server-injected) plus the - labels that uniquely identified the alert at silence creation - time. `silencedBy` on the active alert references this same - `id`.
- value: - code: 200 - message: "silences retrieved successfully" - data: - silences: - - id: "d9f91c6e-1b33-484e-befa-bfb41020e178" - matchers: - - name: "system_key" - value: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - isRegex: false - - name: "alertname" - value: "DiskFilling" - isRegex: false - - name: "severity" - value: "warning" - isRegex: false - startsAt: "2026-05-12T08:16:36Z" - endsAt: "2026-05-12T09:16:36Z" - updatedAt: "2026-05-12T08:16:36Z" - createdBy: "R1C1 Admin " - comment: "silenced during maintenance window" - status: - state: "active" - NoSilences: - summary: No silences for the system - value: - code: 200 - message: "silences retrieved successfully" - data: - silences: [] + type: string '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - post: - operationId: createSystemAlertSilence + + /alerts/trend: + get: + operationId: getAlertsTrend tags: - Backend - Alerts - summary: /systems/{id}/alerts/silences - Create a silence for a system alert + summary: "/alerts/trend - Get alert history trend" description: | - Creates an Alertmanager silence for a specific active alert on the target system. - The request identifies the live alert by fingerprint. The backend resolves the alert, - builds the silence matchers server-side, and always uses the system's authoritative - `system_key`. - If `end_at` is provided it takes precedence over `duration_minutes`. - Requires `manage:systems` permission. + Returns trend data for resolved alerts over a specified period with daily data points. + Compares the current period with the previous period of equal length. + Requires `read:systems` permission. + + Scope follows the same three modes as `/alerts/totals`: + - `organization_id` omitted → caller's full hierarchy. + - `organization_id=X` → single tenant `X`. + - `organization_id=X&include=descendants` → `X` plus its sub-tree. 
+ + Customer callers are always pinned to their own organization regardless of params. security: - BearerAuth: [] parameters: - - name: id - in: path - required: true - description: System ID (database UUID) + - name: organization_id + in: query + description: | + Target organization ID(s). Repeat the param to pass multiple values. + Optional for all roles except Customer (where it is ignored). + schema: + type: array + items: + type: string + style: form + explode: true + - name: include + in: query + description: Set to `descendants` together with `organization_id` to expand each value to its sub-tree. schema: type: string - example: "sys_123456789" - requestBody: - required: true - content: - application/json: - schema: - type: object - required: - - fingerprint - properties: - fingerprint: - type: string - description: Fingerprint of the active alert to silence - example: "8f2d65896d4bcf97" - comment: - type: string - description: Optional silence comment. Defaults to a system-generated value when empty. - example: "silenced during maintenance" - duration_minutes: - type: integer - minimum: 1 - maximum: 10080 - description: Optional silence duration in minutes. Defaults to 60 when omitted. Ignored when end_at is set. - example: 60 - end_at: - type: string - format: date-time - description: Optional explicit end time (RFC3339). Takes precedence over duration_minutes. - example: "2024-01-01T02:00:00Z" - examples: - ExplicitEndAt: - summary: Silence until a specific date/time - description: | - When `end_at` is set, the silence expires at that moment - regardless of `duration_minutes`. The backend resolves the - alert by `fingerprint`, attaches the system's authoritative - `system_key` to the matchers, and creates the silence. 
- value: - fingerprint: "0a9d04bb6eed523f" - comment: "silenced during maintenance window" - end_at: "2026-05-12T09:16:36Z" - DurationBased: - summary: Silence for the next 60 minutes - description: | - Without `end_at`, `duration_minutes` applies. If both are - omitted, the silence defaults to 60 minutes from creation. - value: - fingerprint: "0a9d04bb6eed523f" - comment: "investigating" - duration_minutes: 60 + enum: [descendants] + - name: period + in: query + description: Trend period in days + schema: + type: integer + enum: [7, 30, 180, 365] + default: 7 responses: '200': - description: Alert silence created successfully + description: Alert trend data content: application/json: schema: @@ -9257,65 +9281,105 @@ paths: example: 200 message: type: string - example: "alert silenced successfully" + example: alerts trend retrieved successfully data: type: object properties: - silence_id: + period: + type: integer + period_label: type: string - example: "4e6f0c30-c383-4e22-9443-0d7b6a8bd40b" - examples: - Created: - summary: Silence created - description: | - `silence_id` is the Alertmanager-assigned UUID; use it to - look up, update, or delete the silence later. The - corresponding `silenced` event is appended to the alert's - activity timeline (`GET /alerts/activity/{fingerprint}`). 
- value: - code: 200 - message: "alert silenced successfully" - data: - silence_id: "d9f91c6e-1b33-484e-befa-bfb41020e178" + current_total: + type: integer + previous_total: + type: integer + delta: + type: integer + delta_percentage: + type: number + trend: + type: string + enum: [up, down, stable] + data_points: + type: array + items: + type: object + properties: + date: + type: string + format: date + count: + type: integer '400': $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - /systems/{id}/alerts/silences/{silence_id}: + /alerts/stats: get: - operationId: getSystemAlertSilence + operationId: getAlertsStats tags: - Backend - Alerts - summary: /systems/{id}/alerts/silences/{silence_id} - Get a single silence + summary: "/alerts/stats - Aggregate alert statistics" description: | - Returns a specific Alertmanager silence after verifying it belongs to the target system - via its `system_key` matcher. - Requires `read:systems` permission. + Returns aggregate statistics over `alert_history` for the caller's scope: + total, severity buckets, top-N alertname and system_key by count, plus MTTR + (mean time to resolve) and MTBF (mean time between failures, approximated). + + Scope follows the same three modes as `/alerts/totals`: + - `organization_id` omitted → caller's full hierarchy. + - `organization_id=X` → single tenant. + - `organization_id=X&include=descendants` → sub-tree drill-down. + + MTBF formula: + - When both `from_date` and `to_date` are provided: `(to - from) / total`. + - Otherwise: `(max(starts_at) - min(starts_at)) / (total - 1)`. + - Omitted from the response when the result is undefined. security: - BearerAuth: [] parameters: - - name: id - in: path - required: true - description: System ID (database UUID) + - name: organization_id + in: query + description: | + Target organization ID(s). 
Repeat the param to pass multiple values. + schema: + type: array + items: + type: string + style: form + explode: true + - name: include + in: query schema: type: string - example: "sys_123456789" - - name: silence_id - in: path - required: true - description: Alertmanager silence ID + enum: [descendants] + - name: from_date + in: query + description: Lower bound on `created_at` (inclusive). RFC3339 timestamp. schema: type: string - example: "4e6f0c30-c383-4e22-9443-0d7b6a8bd40b" + format: date-time + example: "2026-05-01T00:00:00Z" + - name: to_date + in: query + description: Upper bound on `created_at` (exclusive). RFC3339 timestamp. + schema: + type: string + format: date-time + example: "2026-05-08T00:00:00Z" + - name: top + in: query + description: Cap for top-N alertname / system_key buckets. Default 10, max 50. + schema: + type: integer + minimum: 1 + maximum: 50 + default: 10 responses: '200': - description: Silence retrieved successfully + description: Alert stats retrieved content: application/json: schema: @@ -9326,223 +9390,94 @@ paths: example: 200 message: type: string - example: "silence retrieved successfully" + example: alert stats retrieved successfully data: type: object properties: - silence: - $ref: '#/components/schemas/AlertmanagerSilence' - examples: - ActiveSilence: - summary: Silence found and active - value: - code: 200 - message: "silence retrieved successfully" - data: - silence: - id: "d9f91c6e-1b33-484e-befa-bfb41020e178" - matchers: - - name: "system_key" - value: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - isRegex: false - - name: "alertname" - value: "DiskFilling" - isRegex: false - - name: "severity" - value: "warning" - isRegex: false - startsAt: "2026-05-12T08:16:36Z" - endsAt: "2026-05-12T09:16:36Z" - updatedAt: "2026-05-12T08:16:36Z" - createdBy: "R1C1 Admin " - comment: "silenced during maintenance window" - status: - state: "active" - '401': - $ref: '#/components/responses/Unauthorized' - '403': - $ref: 
'#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - put: - operationId: updateSystemAlertSilence - tags: - - Backend - Alerts - summary: /systems/{id}/alerts/silences/{silence_id} - Update a silence - description: | - Updates the end time and/or comment of an existing silence. Preserves the original - matchers and start time. Ownership is verified via the `system_key` matcher. - Requires `manage:systems` permission. - security: - - BearerAuth: [] - parameters: - - name: id - in: path - required: true - description: System ID (database UUID) - schema: - type: string - example: "sys_123456789" - - name: silence_id - in: path - required: true - description: Alertmanager silence ID - schema: - type: string - example: "4e6f0c30-c383-4e22-9443-0d7b6a8bd40b" - requestBody: - required: true - content: - application/json: - schema: - type: object - required: - - end_at - properties: - comment: - type: string - description: New comment for the silence. Defaults to previous value if empty. - example: "extended for maintenance window" - end_at: - type: string - format: date-time - description: New end time (RFC3339). Must be in the future. - example: "2024-01-01T04:00:00Z" - examples: - ExtendEndTime: - summary: Extend the silence by 3 more hours - description: | - Alertmanager treats an update as "create a new silence with - the same matchers and start_at, then drop the old one", so - the response carries a new `silence_id`. The activity - timeline records this as a `silence_updated` event. 
- value: - comment: "extended for maintenance window" - end_at: "2026-05-12T12:16:36Z" - responses: - '200': - description: Silence updated successfully - content: - application/json: - schema: - type: object - properties: - code: - type: integer - example: 200 - message: - type: string - example: "silence updated successfully" - data: - type: object - properties: - silence_id: - type: string - example: "4e6f0c30-c383-4e22-9443-0d7b6a8bd40b" - examples: - Updated: - summary: New silence id after update - value: - code: 200 - message: "silence updated successfully" - data: - silence_id: "f1e1c2a4-7e57-4b1a-aaa0-2b96c8b5a3aa" - '400': - $ref: '#/components/responses/BadRequest' - '401': - $ref: '#/components/responses/Unauthorized' - '403': - $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - delete: - operationId: deleteSystemAlertSilence - tags: - - Backend - Alerts - summary: /systems/{id}/alerts/silences/{silence_id} - Disable a system alert silence - description: | - Deletes a system-scoped Alertmanager silence after validating that the silence belongs - to the target system through the authoritative `system_key` matcher. - Requires `manage:systems` permission. 
- security: - - BearerAuth: [] - parameters: - - name: id - in: path - required: true - description: System ID (database UUID) - schema: - type: string - example: "sys_123456789" - - name: silence_id - in: path - required: true - description: Alertmanager silence ID - schema: - type: string - example: "4e6f0c30-c383-4e22-9443-0d7b6a8bd40b" - responses: - '200': - description: Alert silence disabled successfully - content: - application/json: - schema: - type: object - properties: - code: - type: integer - example: 200 - message: - type: string - example: "silence disabled successfully" - examples: - Disabled: - summary: Silence removed - description: | - The silence is expired (not hard-deleted) so it disappears - from `GET /silences` but stays referenced in the alert's - activity timeline as an `unsilenced` event. On the wire - the response also carries `"data": null`; the example - omits it to match the declared schema which only includes - `code` and `message`. - value: - code: 200 - message: "silence disabled successfully" + total: + type: integer + description: Total alerts in scope (sum of severity buckets, including null severity). + by_severity: + type: object + additionalProperties: + type: integer + example: + critical: 30 + warning: 100 + info: 26 + top_alertnames: + type: array + items: + type: object + properties: + alertname: + type: string + count: + type: integer + top_systems: + type: array + items: + type: object + properties: + system_key: + type: string + count: + type: integer + mttr_seconds: + type: integer + description: Mean time to resolve (avg `ends_at - starts_at` over rows with `ends_at` set). Omitted when no resolved alerts. + mtbf_seconds: + type: integer + description: Mean time between failures (approximation, see endpoint description). Omitted when undefined. 
'400': $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - /systems/{id}/alerts/history: + /alerts/history: get: - operationId: getSystemAlertHistory + operationId: getAlertsHistory tags: - Backend - Alerts - summary: /systems/{id}/alerts/history - Get system alert history + summary: "/alerts/history - Org-level paginated alert history" description: | - Get paginated history of resolved and inactive alerts for a specific system. - Alerts are stored by the collect service when Alertmanager sends webhook notifications. - Requires `read:systems` permission. + Returns paginated resolved alert history scoped to the caller's hierarchy + (no `organization_id`), a single tenant (`organization_id=X`), or a sub-tree + (`organization_id=X&include=descendants`). Mirrors the scope rules of + `/alerts/totals` and `/alerts/trend`. + + Supports date range (`from_date`/`to_date`, RFC3339) and multi-value label + filters (`system_key`, `alertname`, `severity`, `status`). All multi-value + filters: OR within the same filter, AND across filters. + + Customer callers are always pinned to their own organization regardless of params. security: - BearerAuth: [] parameters: - - name: id - in: path - required: true - description: System ID (logto_id) + - name: organization_id + in: query + description: | + Target organization ID(s). Repeat the param to pass multiple values. + Optional for all roles except Customer (where it is ignored). + schema: + type: array + items: + type: string + style: form + explode: true + - name: include + in: query + description: Set to `descendants` together with `organization_id` to expand each value to its sub-tree. 
schema: type: string - example: "sys_123456789" + enum: [descendants] - $ref: '#/components/parameters/PageParam' - $ref: '#/components/parameters/PageSizeParam' - name: sort_by in: query required: false - description: Field to sort by schema: type: string enum: [id, alertname, severity, status, starts_at, ends_at, created_at] @@ -9560,7 +9495,6 @@ paths: default: desc - name: from_date in: query - required: false description: Lower bound on `created_at` (inclusive). RFC3339 timestamp. schema: type: string @@ -9568,16 +9502,25 @@ paths: example: "2026-05-01T00:00:00Z" - name: to_date in: query - required: false description: Upper bound on `created_at` (exclusive). RFC3339 timestamp. Must be after `from_date`. schema: type: string format: date-time example: "2026-05-08T00:00:00Z" + - name: system_key + in: query + description: | + Filter by one or more system keys. Repeat the param to pass multiple + values; results are matched as `system_key IN (...)`. + schema: + type: array + items: + type: string + style: form + explode: true - name: alertname in: query - required: false - description: Filter by alertname. Supports multiple values (OR within filter). + description: Filter by alertname. Supports multiple values. schema: type: array items: @@ -9586,7 +9529,6 @@ paths: explode: true - name: severity in: query - required: false description: Filter by severity. Supports multiple values. schema: type: array @@ -9597,7 +9539,6 @@ paths: explode: true - name: status in: query - required: false description: Filter by status. Supports multiple values. 
schema: type: array @@ -9607,7 +9548,7 @@ paths: explode: true responses: '200': - description: Alert history retrieved successfully + description: Paginated alert history content: application/json: schema: @@ -9618,7 +9559,7 @@ paths: example: 200 message: type: string - example: "alert history retrieved successfully" + example: alert history retrieved successfully data: type: object properties: @@ -9629,16 +9570,17 @@ paths: pagination: $ref: '#/components/schemas/Pagination' examples: - ResolvedAlertsForSystem: - summary: Two resolved alerts on this system + TwoResolvedAlerts: + summary: Two resolved alerts on the same system description: | - Records are scoped to the path system's `system_key`. The - response is identical in shape to `GET /alerts/history` but - filtered to one system without needing to pass it as a - query param. + Result for a Customer caller. Same `system_key` appears in both + rows because they were fired against the same NS8 host. Each + row is a discrete event (firing → resolved) captured by the + history webhook at dispatch time; `created_at` records when + the row landed in `alert_history`. 
value: code: 200 - message: "alert history retrieved successfully" + message: alert history retrieved successfully data: alerts: - id: 55 @@ -9657,7 +9599,7 @@ paths: system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" annotations: summary: "plain body check" - description: "resolved" + description: "checking html:'' fix" receiver: "severity-critical-receiver" created_at: "2026-05-12T07:52:00Z" - id: 54 @@ -9687,10 +9629,10 @@ paths: has_next: false has_prev: false EmptyHistory: - summary: No history rows for this system yet + summary: No history rows match value: code: 200 - message: "alert history retrieved successfully" + message: alert history retrieved successfully data: alerts: [] pagination: @@ -9706,45 +9648,57 @@ paths: $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - # =========================================================================== - # SYSTEM BACKUPS - # =========================================================================== - /systems/{id}/backups: + # =========================================== + # ALERTING ENDPOINTS (Per-alert audit timeline) + # =========================================== + + /alerts/activity/{fingerprint}: + parameters: + - name: fingerprint + in: path + required: true + description: | + Alertmanager fingerprint of the alert (hex hash of its labels). + Stable across re-firings of the same alert. + schema: + type: string + pattern: '^[A-Za-z0-9._:-]{1,128}$' + - name: organization_id + in: query + required: true + description: Tenant the alert belongs to. Required for non-Customer roles. 
+ schema: + type: string get: - operationId: getSystemBackups + operationId: getAlertActivity tags: - - Backend - Systems Backups - summary: /systems/{id}/backups - List configuration backups for a system + - Backend - Alerts + summary: "/alerts/activity/{fingerprint} - Per-alert audit timeline" description: | - Returns the list of configuration backups stored for the system, - together with aggregate usage counters. Backups are produced by - the appliance itself (see collect ingest endpoint) and consumed - here read-only. + Returns the audit timeline for the alert identified by `fingerprint`, most recent + first. Events are written transparently as silences are created, updated, or + removed via the `/api/alerts/silences` and `/api/systems/:id/alerts/silences` endpoints. - Each entry carries `size`, `sha256`, and `uploaded_at`. The - peer IP observed at ingest is intentionally not exposed: on - traffic that transits the translation proxy the recorded value - would be the proxy's IP, and even when it is accurate it is a - reconnaissance aid for higher-tier admins. + Operator notes are stored as the silence `comment` (Alertmanager native), so a + note edit appears here as a `silence_updated` event whose `details` payload + includes the new comment. - Access is gated by the same RBAC rules as `GET /systems/{id}`: - the caller must belong to the organization that currently owns - the system. After a cross-org reassignment, the new owner sees - the full backup list and the previous owner loses visibility. + Requires `read:systems` permission. + security: + - BearerAuth: [] parameters: - - name: id - in: path - required: true - description: System ID + - name: limit + in: query + description: Max events to return. Default 100, max 500. 
schema: - type: string - example: "sys_123456789" + type: integer + minimum: 1 + maximum: 500 + default: 100 responses: '200': - description: Backups retrieved successfully + description: Activity timeline content: application/json: schema: @@ -9755,134 +9709,203 @@ paths: example: 200 message: type: string - example: "backups retrieved successfully" + example: alert activity retrieved successfully data: type: object properties: - backups: + events: type: array items: - $ref: '#/components/schemas/BackupMetadata' - quota_used_bytes: - type: integer - format: int64 - description: Sum of backup sizes stored for this system - example: 314572800 - slots_used: - type: integer - description: Number of backups currently stored - example: 4 + $ref: '#/components/schemas/AlertActivityEntry' + examples: + SilenceCreatedThenRemoved: + summary: A silence was created and later removed + description: | + Events are most-recent first. Both rows share the same + `silence_id` because they describe the same silence's + lifecycle. `actor_user_id` is the logto_id of the operator + who performed the action; `details` carries the silence + metadata captured at action time (comment, end_at, etc.). 
+ value: + code: 200 + message: alert activity retrieved successfully + data: + events: + - id: 5 + organization_id: "m4m3mdjdiizs" + fingerprint: "0a9d04bb6eed523f" + action: "unsilenced" + actor_user_id: "c5gpnoo2do48" + actor_name: "R1C1 Admin" + silence_id: "d9f91c6e-1b33-484e-befa-bfb41020e178" + details: {} + created_at: "2026-05-12T08:20:38.410596Z" + - id: 4 + organization_id: "m4m3mdjdiizs" + fingerprint: "0a9d04bb6eed523f" + action: "silenced" + actor_user_id: "c5gpnoo2do48" + actor_name: "R1C1 Admin" + silence_id: "d9f91c6e-1b33-484e-befa-bfb41020e178" + details: + comment: "silenced during maintenance window" + end_at: "2026-05-12T09:16:36Z" + duration_minutes: 0 + created_at: "2026-05-12T08:16:36.661832Z" + EmptyTimeline: + summary: No silence events yet + description: | + The alert has fired but has never been silenced. The events + array is empty (not `null`). + value: + code: 200 + message: alert activity retrieved successfully + data: + events: [] + '400': + $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - '502': - description: Backup storage unreachable or returned an error - '503': - description: Backup storage is not configured - /systems/{id}/backups/{backup_id}/download: - get: - operationId: downloadSystemBackup + # =========================================== + # ALERTING ENDPOINTS (Backend - Cross-system silences) + # =========================================== + + /alerts/config: + post: + operationId: configureAlerts tags: - - Backend - Systems Backups - summary: /systems/{id}/backups/{backup_id}/download - Issue a short-lived download URL + - Backend - Alerts Configuration + summary: "/alerts/config - Save the caller's alerting layer" description: | - Returns a short-lived presigned S3 URL that the user's browser - uses to stream the backup object directly from storage. 
The API - never proxies the object body itself. - - The backend does not perform a redirect because the frontend - sends its JWT on the initial request — browsers would drop the - `Authorization` header when following a 3xx redirect, so the - frontend receives the URL in the JSON response and navigates to - it explicitly. + Saves the CALLER's alerting configuration layer (one row per + organization in alert_config_layers). The body is an + `AlertingConfigLayer`: three channel toggles plus three recipient + lists. Each recipient carries its own `severities[]`; email + recipients additionally carry `language` and `format`. - The presigned URL's lifetime is controlled by the - `BACKUP_PRESIGN_TTL` environment variable (default 5 minutes). + After save, the effective per-tenant Mimir YAML is recomputed + server-side (merge of all layers walking up to the Owner) and + pushed to every tenant in the caller's hierarchy with bounded + concurrency. Per-tenant push failures are returned in `warnings[]`; + the caller's layer is saved regardless of push outcome (Mimir can + be reconciled by saving again). - Access is gated by the same RBAC rules as `GET /systems/{id}`. - parameters: - - name: id - in: path - required: true - description: System ID - schema: - type: string - example: "sys_123456789" - - name: backup_id - in: path - required: true - description: Backup object ID (UUIDv7 + extension, e.g. 
"01934f...tar.gz") - schema: - type: string - example: "01934fab-bc33-7890-a1b2-c3d4e5f6a7b8.tar.gz" - responses: - '200': - description: Download URL issued - content: - application/json: - schema: - type: object - properties: - code: - type: integer - example: 200 - message: - type: string - example: "download URL issued" - data: - type: object - properties: - download_url: - type: string - format: uri - description: Short-lived presigned S3 URL - expires_in_seconds: - type: integer - description: How long the URL remains valid - example: 300 - '401': - $ref: '#/components/responses/Unauthorized' - '403': - $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - '502': - description: Backup storage unreachable or returned an error - '503': - description: Backup storage is not configured + Additive-only contract: descendants can ADD recipients but cannot + disable channels enabled by ancestors. The server normalises any + explicit `false` in `enabled.{email,webhook,telegram}` from + non-Owner layers to null on storage. - delete: - operationId: deleteSystemBackup - tags: - - Backend - Systems Backups - summary: /systems/{id}/backups/{backup_id} - Delete a stored backup - description: | - Deletes a backup object from storage. The operation is final — - storage uses object-level deletion, not a soft-delete table. + Save+propagate is serialised per-organization (in-process mutex) to + prevent two concurrent saves from racing at the Mimir push step. + Body is capped at 1 MiB; oversized payloads are rejected with 413. - Access is gated by the same RBAC rules as `GET /systems/{id}`. - parameters: - - name: id - in: path - required: true - description: System ID - schema: - type: string - example: "sys_123456789" - - name: backup_id - in: path - required: true - description: Backup object ID - schema: - type: string - example: "01934fab-bc33-7890-a1b2-c3d4e5f6a7b8.tar.gz" + Requires `manage:alerts` permission. 
+ security: + - BearerAuth: [] + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/AlertingConfigLayer' + examples: + OwnerGlobalBaseline: + summary: Owner — global baseline + description: | + Owner enables email + webhook globally, sets a NOC + recipient on all severities in Italian HTML, plus a SIEM + webhook on every severity. Every descendant inherits. + value: + enabled: { email: true, webhook: true, telegram: false } + email_recipients: + - address: "noc@msp.example" + severities: [] + language: "it" + format: "html" + webhook_recipients: + - name: "central-siem" + url: "https://siem.example/api/alerts" + severities: [] + telegram_recipients: [] + DescendantAddRecipient: + summary: Reseller — additively add a recipient + description: | + Reseller does NOT touch channel toggles (null = "no + opinion"); it just adds a local NOC mailbox in English + for critical+warning. Merged with Owner's recipients. + value: + enabled: { email: null, webhook: null, telegram: null } + email_recipients: + - address: "noc@reseller.example" + severities: ["critical", "warning"] + language: "en" + format: "html" + webhook_recipients: [] + telegram_recipients: [] + CustomerMixedFormatAndLang: + summary: Customer — mixed languages and formats per recipient + description: | + Different recipients can request different bodies. The + on-call inbox wants plain text (alerts piped into a + ticketing tool); the manager wants HTML in Italian. + value: + enabled: { email: null, webhook: null, telegram: null } + email_recipients: + - address: "oncall@customer.example" + severities: ["critical"] + language: "en" + format: "plain" + - address: "manager@customer.example" + severities: [] + language: "it" + format: "html" + webhook_recipients: [] + telegram_recipients: [] + CustomerWebhookCriticalOnly: + summary: Customer — Slack webhook only for `critical` + description: | + Customer adds a Slack webhook scoped to critical. 
The + rendered Alertmanager route puts this webhook only on + the critical receiver. + value: + enabled: { email: null, webhook: null, telegram: null } + email_recipients: [] + webhook_recipients: + - name: "ops-slack" + url: "https://hooks.slack.com/services/T000/B000/XXX" + severities: ["critical"] + telegram_recipients: [] + TelegramAllSeverities: + summary: Customer — Telegram channel on every severity + description: | + Single Telegram bot pushing to a channel for every + severity (severities=[]). Telegram messages are + currently always rendered in English. + value: + enabled: { email: null, webhook: null, telegram: true } + email_recipients: [] + webhook_recipients: [] + telegram_recipients: + - bot_token: "123456:ABC-DEF1234ghIkl" + chat_id: -1001234567890 + severities: [] + InheritPurely: + summary: Descendant — explicit "inherit everything" + description: | + Saving an empty layer is meaningful: it just records + audit metadata (who/when) without contributing + recipients or toggles. + value: + enabled: { email: null, webhook: null, telegram: null } + email_recipients: [] + webhook_recipients: [] + telegram_recipients: [] responses: '200': - description: Backup deleted + description: Layer saved (and propagation attempted) content: application/json: schema: @@ -9893,61 +9916,62 @@ paths: example: 200 message: type: string - example: "backup deleted" + example: alerting configuration updated successfully data: type: object properties: - system_id: - type: string - example: "sys_123456789" - backup_id: - type: string - example: "01934fab-bc33-7890-a1b2-c3d4e5f6a7b8.tar.gz" + affected_tenants: + type: integer + description: Number of tenants in caller's hierarchy whose effective config was recomputed + propagated_to: + type: integer + description: Of `affected_tenants`, how many were successfully pushed to Mimir + warnings: + type: array + description: | + Per-tenant push errors. Always present; empty when every push succeeded. + Each entry: `org <org_id>: <error>`.
+ items: + type: string + '400': + $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - '502': - description: Backup storage unreachable or returned an error - '503': - description: Backup storage is not configured - - # =========================================================================== - # APPLICATIONS MANAGEMENT - # =========================================================================== - /applications: - get: - operationId: getApplications + '413': + description: Request body exceeds the configured maximum (1 MiB). + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 413 + message: + type: string + example: request body exceeds the configured maximum + '500': + $ref: '#/components/responses/InternalServerError' + delete: + operationId: disableAlerts tags: - - Backend - Applications - summary: /applications - List applications + - Backend - Alerts Configuration + summary: "/alerts/config - Remove the caller's alerting layer" description: | - Get list of system applications visible to the user based on hierarchical organization permissions. - Supports filtering by type, version, system, organization, and status. - - **Query String Examples:** + Removes the CALLER's layer from alert_config_layers. The effective + config of all descendant tenants is recomputed without the caller's + contribution and re-pushed to Mimir; ancestor layers are preserved. + To completely silence a tenant, every layer in its chain must drop + its contribution. - 1. **Single type filter**: `?type=mail` - 2. **Multiple types filter**: `?type=mail&type=webtop` - 3. **Status filter**: `?status=unassigned` - 4. **With pagination and sorting**: `?page=1&page_size=50&sort_by=instance_of&sort_direction=asc` - 5. 
**Combined filters**: `?type=mail&status=assigned&organization_id=org_abc123` - parameters: - - $ref: '#/components/parameters/PageParam' - - $ref: '#/components/parameters/PageSizeParam' - - $ref: '#/components/parameters/SearchParam' - - $ref: '#/components/parameters/AppSortByParam' - - $ref: '#/components/parameters/SortDirectionParam' - - $ref: '#/components/parameters/AppTypeFilterParam' - - $ref: '#/components/parameters/AppVersionFilterParam' - - $ref: '#/components/parameters/AppSystemFilterParam' - - $ref: '#/components/parameters/AppOrganizationFilterParam' - - $ref: '#/components/parameters/AppStatusFilterParam' + Requires `manage:alerts` permission. + security: + - BearerAuth: [] responses: '200': - description: Applications retrieved successfully + description: Layer removed (and propagation attempted) content: application/json: schema: @@ -9958,31 +9982,48 @@ paths: example: 200 message: type: string - example: "applications retrieved successfully" + example: alerting layer removed successfully data: type: object properties: - applications: + affected_tenants: + type: integer + propagated_to: + type: integer + warnings: type: array items: - $ref: '#/components/schemas/ApplicationListItem' - pagination: - $ref: '#/components/schemas/Pagination' + type: string + '400': + $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - - /applications/totals: + '500': + $ref: '#/components/responses/InternalServerError' get: - operationId: getApplicationTotals + operationId: getAlertingConfig tags: - - Backend - Applications Stats - summary: /applications/totals - Get application statistics - description: Get statistics about applications including counts by type and status + - Backend - Alerts Configuration + summary: "/alerts/config - Get the caller's alerting layer" + description: | + Returns the CALLER's own alerting configuration layer. 
No inherited + ancestor layers, no merged effective view: every organization sees + only its own configuration. The server-side merge that backs the + Mimir YAML stays inside the backend. + + When the caller has never saved a layer the response body contains + an empty layer (toggles all null, recipient lists empty) and the + two audit fields set to null. The UI uses that state to render a + first-save form. + + Requires `read:alerts` permission. + security: + - BearerAuth: [] responses: '200': - description: Application totals retrieved successfully + description: Caller's layer content: application/json: schema: @@ -9993,117 +10034,92 @@ paths: example: 200 message: type: string - example: "application totals retrieved successfully" + example: alerting layer retrieved successfully data: - $ref: '#/components/schemas/ApplicationTotals' - '401': - $ref: '#/components/responses/Unauthorized' - '403': - $ref: '#/components/responses/Forbidden' - - /applications/summary: - get: - operationId: getApplicationTypeSummary - tags: - - Backend - Applications Stats - summary: /applications/summary - Get applications grouped by type - description: | - Returns applications grouped by instance_of (type) with a counter. - Hierarchically authorized: the caller only sees applications on systems within their organization tree. - Optionally filterable by organization_id to restrict to a specific organization or by system_id to restrict to a specific system. - When include_hierarchy is true, also includes applications from all child organizations. - parameters: - - name: organization_id - in: query - required: false - description: Logto ID of a specific organization to filter by. Must be within the caller's hierarchy. - schema: - type: string - - name: system_id - in: query - required: false - description: UUID of a specific system to filter by. Must be within the caller's hierarchy. 
- schema: - type: string - - name: include_hierarchy - in: query - required: false - description: When true (and organization_id is provided), includes applications from all child organizations in the hierarchy - schema: - type: boolean - default: false - - name: page - in: query - required: false - description: Page number for paginating the by_type array (1-based). If omitted, all types are returned. - schema: - type: integer - minimum: 1 - - name: page_size - in: query - required: false - description: Number of application types per page (max 100). If omitted, all types are returned. - schema: - type: integer - minimum: 1 - maximum: 100 - - name: sort_by - in: query - required: false - description: Field to sort by_type results by - schema: - type: string - enum: [count, created_at, instance_of] - default: count - - name: sort_direction - in: query - required: false - description: Sort direction - schema: - type: string - enum: [asc, desc] - default: desc - responses: - '200': - description: Application type summary retrieved successfully - content: - application/json: - schema: - type: object - properties: - code: - type: integer - example: 200 - message: - type: string - example: "application type summary retrieved successfully" - data: - $ref: '#/components/schemas/ApplicationTypeSummary' + allOf: + - $ref: '#/components/schemas/AlertingConfigLayer' + - type: object + properties: + updated_by_name: + type: string + nullable: true + updated_at: + type: string + format: date-time + nullable: true + examples: + Configured: + summary: Caller has saved a layer + value: + code: 200 + message: "alerting layer retrieved successfully" + data: + enabled: { email: true, webhook: null, telegram: null } + email_recipients: + - address: "noc@reseller.example" + severities: ["critical"] + language: "it" + format: "html" + webhook_recipients: [] + telegram_recipients: [] + updated_by_name: "Reseller Admin" + updated_at: "2026-05-09T10:14:00Z" + FirstTime: + summary: Caller 
has not saved a layer yet + value: + code: 200 + message: "alerting layer retrieved successfully" + data: + enabled: { email: null, webhook: null, telegram: null } + email_recipients: [] + webhook_recipients: [] + telegram_recipients: [] + updated_by_name: null + updated_at: null '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - /applications/trend: + /alerts/config/effective: get: - operationId: getApplicationsTrend + operationId: getEffectiveAlertingConfig tags: - - Backend - Applications Stats - summary: /applications/trend - Get applications trend data - description: Get trend data for applications over a specified period showing daily counts + - Backend - Alerts Configuration + summary: "/alerts/config/effective - Inspect a tenant's effective merged config" + description: | + Privileged troubleshooting view. Returns the configuration a tenant + ACTUALLY receives: the per-layer contribution of every organization + in its ancestor chain (Owner → tenant), the merged effective layer, + and the rendered Alertmanager YAML pushed to Mimir for that tenant. + + Unlike `GET /alerts/config` (which returns only the caller's own + layer), this exposes the full inherited + merged view, so it is + gated by the dedicated `config:alerts` permission. That permission + lives solely on the `super` user role, which is owner-assignable + only — so in practice only an Owner-org Super Admin can reach this. + It is not reachable by Distributor/Reseller admins. + + Secrets are redacted in the response: telegram `bot_token` and + webhook URL path/query in every layer and in the effective layer, + and SMTP credentials / bearer / bot tokens in the rendered YAML. + + `organization_id` is required and may target ANY tenant. A + nonexistent id returns an empty effective config (no error) — the + honest answer for a diagnostic tool. Read-only: no Mimir push, no + DB writes. 
+ security: + - BearerAuth: [] parameters: - - name: period + - name: organization_id in: query - required: false - description: Number of days to include in trend data (default 7, max 365) + required: true schema: - type: integer - default: 7 - minimum: 1 - maximum: 365 + type: string + description: Logto organization id of the tenant to inspect. responses: '200': - description: Applications trend data retrieved successfully + description: Effective configuration report content: application/json: schema: @@ -10114,31 +10130,107 @@ paths: example: 200 message: type: string - example: "applications trend retrieved successfully" + example: effective alerting configuration retrieved successfully data: - $ref: '#/components/schemas/TrendResponse' + type: object + properties: + organization_id: + type: string + chain: + type: array + description: | + Contributing layers ordered Owner first → tenant + last. Orgs with no saved layer are listed with + `has_layer: false` and an empty layer. + items: + type: object + properties: + organization_id: + type: string + organization_name: + type: string + organization_role: + type: string + enum: [owner, distributor, reseller, customer] + has_layer: + type: boolean + layer: + $ref: '#/components/schemas/AlertingConfigLayer' + updated_by_name: + type: string + nullable: true + updated_at: + type: string + format: date-time + nullable: true + effective: + $ref: '#/components/schemas/AlertingConfigLayer' + yaml: + type: string + description: Rendered Alertmanager YAML (secrets redacted). 
+ '400': + $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' + '422': + $ref: '#/components/responses/UnprocessableEntity' - /filters/applications: + /alerts/silences: get: - operationId: getApplicationFilters + operationId: getAlertSilences tags: - - Backend - Filters - summary: /filters/applications - Get aggregated application filters + - Backend - Alerts Silences + summary: "/alerts/silences - List active+pending silences (hierarchy-wide)" description: | - Aggregated endpoint that returns all application filter data in a single request. - RBAC is resolved once, then types, versions, systems, and organizations are fetched in parallel. + Cross-system parallel of `GET /systems/{id}/alerts/silences`. Returns + every active or pending Alertmanager silence in the caller's scope, + enriched with `organization_id` (the tenant that owns the silence) and + `system_key` (extracted from the silence matchers). Expired silences + and silences without a `system_key` matcher are excluded — only + silences our UI ever creates are addressable. - **Version Format**: Returns versions in prefixed format `type:version` (e.g., `"nethvoice:1.5.3"`, `"mail:1.7.4"`) + Scope follows the same three modes as `/alerts/totals`: + - `organization_id` omitted → caller's full hierarchy (cross-tenant fan-out). + - `organization_id=X` → single tenant `X`. + - `organization_id=X&include=descendants` → `X` plus its sub-tree. - **Organizations special value**: If there are applications without an assigned organization, the response includes - a special "No organization" entry with `id: "no_org"` and `type: "unassigned"`. + Requires `read:systems` permission. + security: + - BearerAuth: [] + parameters: + - name: organization_id + in: query + description: | + Target organization ID(s). Repeat the param for multiple values. + Optional for all roles except Customer (where it is ignored). 
+ schema: + type: array + items: + type: string + style: form + explode: true + - name: include + in: query + description: Set to `descendants` together with `organization_id` to expand each value to its sub-tree. + schema: + type: string + enum: [descendants] + - name: system_key + in: query + description: | + Filter silences by one or more system keys (exact match on the + `system_key` matcher). Repeat the param for multiple values. + schema: + type: array + items: + type: string + style: form + explode: true responses: '200': - description: Application filters retrieved successfully + description: Paginated list of system-scoped silences content: application/json: schema: @@ -10149,93 +10241,136 @@ paths: example: 200 message: type: string - example: "application filters retrieved successfully" + example: silences retrieved successfully data: type: object properties: - types: + silences: type: array items: - $ref: '#/components/schemas/ApplicationType' - versions: - type: array - items: - type: object - properties: - application: - type: string - description: Application type (instance_of) - example: "nethvoice" - name: - type: string - description: Human-readable application name - example: "NethVoice" - versions: - type: array - items: - type: string - description: Prefixed versions (type:version) - example: ["nethvoice:1.5.4", "nethvoice:1.5.3"] - systems: - type: array - items: - $ref: '#/components/schemas/ApplicationSystemSummary' - organizations: + allOf: + - $ref: '#/components/schemas/AlertmanagerSilence' + - type: object + properties: + organization_id: + type: string + description: Tenant that owns the silence (Mimir stores silences per-tenant). + example: "m4m3mdjdiizs" + system_key: + type: string + description: System key extracted from the silence matchers. + example: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + warnings: type: array + description: | + Per-tenant fan-out errors. Always present (empty when + every tenant responded OK). 
Each entry is a string + `org <organization_id>: <error>`. + items: - $ref: '#/components/schemas/OrganizationSummary' - example: - code: 200 - message: "application filters retrieved successfully" - data: - types: - - instance_of: "nethvoice" - name: "NethVoice" - count: 15 - - instance_of: "mail" - name: "Mail" - count: 8 - versions: - - application: "mail" - name: "Mail" - versions: ["mail:1.7.4"] - - application: "nethvoice" - name: "NethVoice" - versions: ["nethvoice:1.5.4", "nethvoice:1.5.3"] - systems: - - id: "abc123-system-uuid" - name: "Production Server" - - id: "def456-system-uuid" - name: "Development Server" - organizations: - - id: "no_org" - logto_id: "no_org" - name: "No organization" - description: "" - type: "unassigned" - - id: "8b04e253-d408-4218-a30e-b048196847e5" - logto_id: "fso3biosnaqp" - name: "Acme Corp" - description: "" - type: "customer" + type: string + examples: + OneActive: + summary: One active silence on a customer system + value: + code: 200 + message: silences retrieved successfully + data: + silences: + - id: "d9f91c6e-1b33-484e-befa-bfb41020e178" + organization_id: "m4m3mdjdiizs" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + matchers: + - name: "system_key" + value: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + isRegex: false + - name: "alertname" + value: "HighCPUUsage" + isRegex: false + - name: "severity" + value: "warning" + isRegex: false + startsAt: "2026-05-12T08:16:36Z" + endsAt: "2026-05-12T09:16:36Z" + updatedAt: "2026-05-12T08:16:36Z" + createdBy: "amelia.foster" + comment: "muted during maintenance window" + status: + state: "active" + warnings: [] '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - - /filters/users: - get: - operationId: getUserFilters + '500': + $ref: '#/components/responses/InternalServerError' + post: + operationId: createAlertSilence tags: - - Backend - Filters - summary: /filters/users - Get aggregated user filters + - Backend - Alerts Silences + 
summary: "/alerts/silences - Mute an alert across systems" description: | - Aggregated endpoint that returns all user filter data in a single request. - Auth is checked once, then roles and organizations are fetched in parallel. - Respects RBAC hierarchy for both roles and organizations. + Cross-system parallel of `POST /systems/{id}/alerts/silences`. Mutes + an active alert identified by fingerprint inside a single tenant + (`?organization_id=<id>`). The backend looks up the alert in Mimir, + extracts `system_key` from its labels, builds the matchers + server-side, and delegates to the same silence-creation path used by + the per-system endpoint — so the silence object stored in Mimir is + byte-identical regardless of which route created it. + + If `end_at` is set it takes precedence over `duration_minutes`. + + Requires `manage:systems` permission. + security: + - BearerAuth: [] + parameters: + - name: organization_id + in: query + required: true + description: | + Tenant that owns the alert. Mandatory for every role except + Customer (where it is ignored — they're always pinned to their + own organization). Owners can address any tenant in the system. + schema: + type: string + example: "m4m3mdjdiizs" + requestBody: + required: true + content: + application/json: + schema: + type: object + required: + - fingerprint + properties: + fingerprint: + type: string + description: Fingerprint of the active alert to silence. + example: "0a9d04bb6eed523f" + comment: + type: string + description: Optional silence comment. Defaults to a system-generated value when empty. + example: "silenced during maintenance" + duration_minutes: + type: integer + minimum: 1 + maximum: 10080 + description: Optional duration in minutes. Defaults to 60 when omitted. Ignored when end_at is set. + example: 60 + end_at: + type: string + format: date-time + description: Optional explicit end time (RFC3339). Takes precedence over duration_minutes. 
+ example: "2026-05-12T09:16:36Z" + examples: + ExplicitEndAt: + summary: Silence until a specific date/time + value: + fingerprint: "0a9d04bb6eed523f" + comment: "silenced during maintenance window" + end_at: "2026-05-12T09:16:36Z" responses: '200': - description: User filters retrieved successfully + description: Alert silenced successfully content: application/json: schema: @@ -10246,116 +10381,133 @@ paths: example: 200 message: type: string - example: "user filters retrieved successfully" + example: "alert silenced successfully" data: type: object properties: - roles: - type: array - items: - $ref: '#/components/schemas/Role' - organizations: - type: array - items: - type: object - properties: - id: - type: string - description: Organization Logto ID - example: "org_abc123" - name: - type: string - description: Organization name - example: "ACME Corp" - type: - type: string - enum: [distributor, reseller, customer] - description: Organization type - example: "customer" - example: - code: 200 - message: "user filters retrieved successfully" - data: - roles: - - id: "role_abc123" - name: "Admin" - description: "System administrator" - - id: "role_def456" - name: "Support" - description: "Support operator" - organizations: - - id: "org_abc123" - name: "ACME Corp" - type: "customer" - - id: "org_def456" - name: "TechStart Inc" - type: "reseller" + silence_id: + type: string + example: "d9f91c6e-1b33-484e-befa-bfb41020e178" + '400': + $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' - /applications/{id}: + /alerts/silences/{silence_id}: get: - operationId: getApplicationById + operationId: getAlertSilence tags: - - Backend - Applications - summary: /applications/{id} - Get single application - description: Get a specific application by ID + - Backend - Alerts Silences + summary: "/alerts/silences/{silence_id} - Get a single 
silence" + description: | + Cross-system parallel of `GET /systems/{id}/alerts/silences/{silence_id}`. + Looks up a silence inside a single tenant (`?organization_id=<id>`) and + returns it enriched with `organization_id` and `system_key`. Silences + without a `system_key` matcher are reported as 404 — they don't belong + to our domain. + + Requires `read:systems` permission. + security: + - BearerAuth: [] parameters: - - name: id + - name: silence_id in: path required: true - description: Application ID + description: Alertmanager silence ID. schema: type: string - example: "sys_abc123_mail1" - responses: - '200': - description: Application retrieved successfully - content: - application/json: - schema: - type: object + example: "d9f91c6e-1b33-484e-befa-bfb41020e178" + - name: organization_id + in: query + required: true + description: | + Tenant that owns the silence. Mandatory for every role except + Customer (where it is ignored). + schema: + type: string + example: "m4m3mdjdiizs" + responses: + '200': + description: Silence retrieved successfully + content: + application/json: + schema: + type: object properties: code: type: integer example: 200 message: type: string - example: "application retrieved successfully" + example: "silence retrieved successfully" data: - $ref: '#/components/schemas/Application' + type: object + properties: + silence: + allOf: + - $ref: '#/components/schemas/AlertmanagerSilence' + - type: object + properties: + organization_id: + type: string + example: "m4m3mdjdiizs" + system_key: + type: string + example: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' '404': $ref: '#/components/responses/NotFound' - put: - operationId: updateApplication + operationId: updateAlertSilence tags: - - Backend - Applications - summary: /applications/{id} - Update application - description: Update an application's notes (other fields are read-only and populated 
from inventory) + - Backend - Alerts Silences + summary: "/alerts/silences/{silence_id} - Update a silence" + description: | + Cross-system parallel of `PUT /systems/{id}/alerts/silences/{silence_id}`. + Preserves the original matchers and start time; only `end_at` and + `comment` change. Refuses to operate on silences without a + `system_key` matcher (404). Requires `manage:systems` permission. + security: + - BearerAuth: [] parameters: - - name: id + - name: silence_id in: path required: true - description: Application ID schema: type: string - example: "sys_abc123_mail1" + example: "d9f91c6e-1b33-484e-befa-bfb41020e178" + - name: organization_id + in: query + required: true + schema: + type: string + example: "m4m3mdjdiizs" requestBody: required: true content: application/json: schema: - $ref: '#/components/schemas/UpdateApplicationRequest' + type: object + required: + - end_at + properties: + comment: + type: string + example: "extended for maintenance window" + end_at: + type: string + format: date-time + example: "2026-05-12T12:16:36Z" responses: '200': - description: Application updated successfully + description: Silence updated successfully content: application/json: schema: @@ -10366,9 +10518,13 @@ paths: example: 200 message: type: string - example: "application updated successfully" + example: "silence updated successfully" data: - $ref: '#/components/schemas/Application' + type: object + properties: + silence_id: + type: string + example: "f1e1c2a4-7e57-4b1a-aaa0-2b96c8b5a3aa" '400': $ref: '#/components/responses/BadRequest' '401': @@ -10377,69 +10533,34 @@ paths: $ref: '#/components/responses/Forbidden' '404': $ref: '#/components/responses/NotFound' - delete: - operationId: deleteApplication + operationId: deleteAlertSilence tags: - - Backend - Applications - summary: /applications/{id} - Delete application - description: Soft-delete an application + - Backend - Alerts Silences + summary: "/alerts/silences/{silence_id} - Unmute an alert (delete 
silence)" + description: | + Cross-system parallel of `DELETE /systems/{id}/alerts/silences/{silence_id}`. + Removes a system-scoped silence; generic Alertmanager silences (no + `system_key` matcher) are not addressable through this endpoint and + return 404. Requires `manage:systems` permission. + security: + - BearerAuth: [] parameters: - - name: id + - name: silence_id in: path required: true - description: Application ID schema: type: string - example: "sys_abc123_mail1" - responses: - '200': - description: Application deleted successfully - content: - application/json: - schema: - type: object - properties: - code: - type: integer - example: 200 - message: - type: string - example: "application deleted successfully" - data: - type: object - nullable: true - '401': - $ref: '#/components/responses/Unauthorized' - '403': - $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - - /applications/{id}/assign: - patch: - operationId: assignApplicationOrganization - tags: - - Backend - Applications - summary: /applications/{id}/assign - Assign organization - description: Assign an organization to an application - parameters: - - name: id - in: path + example: "d9f91c6e-1b33-484e-befa-bfb41020e178" + - name: organization_id + in: query required: true - description: Application ID schema: type: string - example: "sys_abc123_mail1" - requestBody: - required: true - content: - application/json: - schema: - $ref: '#/components/schemas/AssignApplicationRequest' + example: "m4m3mdjdiizs" responses: '200': - description: Organization assigned successfully + description: Silence disabled successfully content: application/json: schema: @@ -10450,9 +10571,7 @@ paths: example: 200 message: type: string - example: "organization assigned successfully" - data: - $ref: '#/components/schemas/Application' + example: "silence disabled successfully" '400': $ref: '#/components/responses/BadRequest' '401': @@ -10462,24 +10581,100 @@ paths: '404': $ref: 
'#/components/responses/NotFound' - /applications/{id}/unassign: - patch: - operationId: unassignApplicationOrganization + # =========================================== + # ALERTING ENDPOINTS (Backend - Configuration) + # =========================================== + + /systems/{id}/alerts: + get: + operationId: getSystemAlerts tags: - - Backend - Applications - summary: /applications/{id}/unassign - Remove organization - description: Remove organization assignment from an application + - Backend - Alerts (Per-System) + summary: "/systems/{id}/alerts - Get active alerts for a system" + description: | + Returns current alerts from Mimir scoped to a single system. Mirrors + the filter, pagination, and sort surface of `GET /alerts`: the only + difference is that `system_key` is pinned to the URL path (the + multi-value `system_key` query filter is therefore not exposed). + Suppressed alerts remain visible so silenced alerts can still be + inspected in the system detail view. + + System identity is carried as labels on each alert (`system_id`, + `system_key`, `system_name`, `system_type`), stamped at ingest time. + + Multi-value filters: OR within the same filter, AND across filters. + + Requires `read:systems` permission. + security: + - BearerAuth: [] parameters: - name: id in: path required: true - description: Application ID + description: System ID (database UUID) schema: type: string - example: "sys_abc123_mail1" + example: "sys_123456789" + - name: page + in: query + description: 1-based page number. + schema: + type: integer + minimum: 1 + default: 1 + - name: page_size + in: query + description: Page size. Default 50, max 100. + schema: + type: integer + minimum: 1 + maximum: 100 + default: 50 + - name: sort_by + in: query + description: Sort column (allowlist matches `/alerts`). 
+ schema: + type: string + enum: [starts_at, severity, alertname, status] + default: starts_at + - name: sort_direction + in: query + schema: + type: string + enum: [asc, desc] + default: desc + - name: status + in: query + description: Filter by Alertmanager state. Supports multiple values. + schema: + type: array + items: + type: string + enum: [active, suppressed, unprocessed] + style: form + explode: true + - name: severity + in: query + description: Filter by severity. Supports multiple values. + schema: + type: array + items: + type: string + enum: [critical, warning, info] + style: form + explode: true + - name: alertname + in: query + description: Filter by alertname. Supports multiple values. + schema: + type: array + items: + type: string + style: form + explode: true responses: '200': - description: Organization unassigned successfully + description: Active alerts retrieved successfully content: application/json: schema: @@ -10490,115 +10685,187 @@ paths: example: 200 message: type: string - example: "organization unassigned successfully" - data: - $ref: '#/components/schemas/Application' - '401': - $ref: '#/components/responses/Unauthorized' - '403': - $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - - # =========================================================================== - # OAUTH2/OIDC STANDARD ENDPOINTS (Third-party Applications) - # =========================================================================== - /user/permissions: - get: - operationId: getUserPermissions - tags: - - Backend - User - summary: /user/permissions - Get user permissions (OAuth2/OIDC) - description: Get current user permissions using standard OAuth2/OIDC flow with Logto token - responses: - '200': - description: User permissions retrieved successfully - content: - application/json: - schema: - type: object - properties: - code: - type: integer - example: 200 - message: - type: string - example: "user permissions retrieved 
successfully" + example: "alerts retrieved successfully" data: type: object properties: - user_roles: - type: array - items: - type: string - example: ["Admin"] - user_permissions: - type: array - items: - type: string - example: ["manage:systems", "read:systems"] - org_role: - type: string - example: "Owner" - org_permissions: + alerts: type: array items: - type: string - example: ["manage:resellers", "create:customers"] - organization_id: - type: string - example: "org_123456789" - organization_name: - type: string - example: "ACME Corp" - '401': - $ref: '#/components/responses/Unauthorized' - - /user/profile: - get: - operationId: getUserProfile - tags: - - Backend - User - summary: /user/profile - Get user profile (OAuth2/OIDC) - description: Get current user profile using standard OAuth2/OIDC flow with Logto token - responses: - '200': - description: User profile retrieved successfully - content: - application/json: - schema: - type: object - properties: - code: - type: integer - example: 200 - message: - type: string - example: "user profile retrieved successfully" - data: - $ref: '#/components/schemas/UserProfile' + $ref: '#/components/schemas/ActiveAlert' + pagination: + $ref: '#/components/schemas/Pagination' + examples: + ActiveAndSuppressedOnSystem: + summary: Two firing alerts on the same system, one silenced + description: | + The endpoint always includes silenced alerts so the UI can + show the muted state in the system detail view. `state` is + `"suppressed"` when at least one active silence matches the + alert; the matching silence IDs are listed in `silencedBy`. 
+ value: + code: 200 + message: "alerts retrieved successfully" + data: + alerts: + - fingerprint: "0a9d04bb6eed523f" + labels: + alertname: "DiskFilling" + severity: "warning" + system_id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + system_name: "test-sys" + system_type: "ns8" + annotations: + summary: "/var is 92% full" + description: "Disk usage exceeded warning threshold." + status: + state: "suppressed" + silencedBy: + - "d9f91c6e-1b33-484e-befa-bfb41020e178" + inhibitedBy: [] + startsAt: "2026-05-12T08:14:00Z" + endsAt: "2026-05-12T08:44:00Z" + - fingerprint: "11a9302b0fa6526e" + labels: + alertname: "HighCPU" + severity: "critical" + system_id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + system_name: "test-sys" + system_type: "ns8" + annotations: + summary: "CPU usage 98%" + description: "Sustained high CPU." + status: + state: "active" + silencedBy: [] + inhibitedBy: [] + startsAt: "2026-05-12T08:20:00Z" + endsAt: "2026-05-12T08:50:00Z" + pagination: + page: 1 + page_size: 50 + total_count: 2 + total_pages: 1 + has_next: false + has_prev: false + sort_by: "starts_at" + sort_direction: "desc" + NoActiveAlerts: + summary: System has no firing alerts + value: + code: 200 + message: "alerts retrieved successfully" + data: + alerts: [] + pagination: + page: 1 + page_size: 50 + total_count: 0 + total_pages: 0 + has_next: false + has_prev: false + sort_by: "starts_at" + sort_direction: "desc" '401': $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' - # =========================================== - # COLLECT SERVICE ENDPOINTS - # =========================================== - - /systems/info: + /systems/{id}/alerts/history: get: - operationId: getSystemInfo - servers: - - url: https://collect.your-domain.com/api - description: Collect API server (port 8081) 
+ operationId: getSystemAlertHistory tags: - - Collect - Systems - summary: /systems/info - Get system info - description: Returns information about the authenticated system, including suspension and organization status. + - Backend - Alerts (Per-System) + summary: "/systems/{id}/alerts/history - Get system alert history" + description: | + Get paginated history of resolved and inactive alerts for a specific system. + Alerts are stored by the collect service when Alertmanager sends webhook notifications. + Requires `read:systems` permission. security: - - BasicAuth: [system_key:system_secret] + - BearerAuth: [] + parameters: + - name: id + in: path + required: true + description: System ID (logto_id) + schema: + type: string + example: "sys_123456789" + - $ref: '#/components/parameters/PageParam' + - $ref: '#/components/parameters/PageSizeParam' + - name: sort_by + in: query + required: false + description: Field to sort by + schema: + type: string + enum: [id, alertname, severity, status, starts_at, ends_at, created_at] + default: created_at + - name: sort_direction + in: query + required: false + description: | + Sort direction. Unlike the shared default of `asc`, this endpoint + defaults to `desc` so the natural "most recent first" ordering is + applied when the caller omits the param. + schema: + type: string + enum: [asc, desc] + default: desc + - name: from_date + in: query + required: false + description: Lower bound on `created_at` (inclusive). RFC3339 timestamp. + schema: + type: string + format: date-time + example: "2026-05-01T00:00:00Z" + - name: to_date + in: query + required: false + description: Upper bound on `created_at` (exclusive). RFC3339 timestamp. Must be after `from_date`. + schema: + type: string + format: date-time + example: "2026-05-08T00:00:00Z" + - name: alertname + in: query + required: false + description: Filter by alertname. Supports multiple values (OR within filter). 
+ schema: + type: array + items: + type: string + style: form + explode: true + - name: severity + in: query + required: false + description: Filter by severity. Supports multiple values. + schema: + type: array + items: + type: string + enum: [critical, warning, info] + style: form + explode: true + - name: status + in: query + required: false + description: Filter by status. Supports multiple values. + schema: + type: array + items: + type: string + style: form + explode: true responses: '200': - description: System info retrieved successfully + description: Alert history retrieved successfully content: application/json: schema: @@ -10609,154 +10876,125 @@ paths: example: 200 message: type: string - example: "system info retrieved successfully" + example: "alert history retrieved successfully" data: type: object properties: - system_id: - type: string - example: "abc-123" - system_key: - type: string - example: "my-system-key" - name: - type: string - example: "Milan Office Server" - type: - type: string - nullable: true - example: "ns8" - fqdn: - type: string - nullable: true - example: "server.example.com" - status: - type: string - example: "active" - suspended: - type: boolean - example: false - suspended_at: - type: string - format: date-time - nullable: true - deleted: - type: boolean - example: false - deleted_at: - type: string - format: date-time - nullable: true - registered: - type: boolean - example: true - registered_at: - type: string - format: date-time - nullable: true - example: "2025-01-15T10:00:00Z" - created_at: - type: string - format: date-time - example: "2025-01-10T08:00:00Z" - rebranding_enabled: - type: boolean - description: Whether rebranding is enabled for this system (directly or inherited through hierarchy) - example: false - organization: - type: object - properties: - id: - type: string - description: Database UUID of the organization - example: "4405ffd0-0aca-44ef-bae2-c8545bce94f4" - logto_id: - type: string - description: Logto 
organization ID - example: "akkbs6x2wo82" - name: - type: string - description: Organization name - example: "Owner" - type: - type: string - description: Organization type in the hierarchy - enum: [owner, distributor, reseller, customer] - example: "owner" - suspended: - type: boolean - example: false - suspended_at: - type: string - format: date-time - nullable: true + alerts: + type: array + items: + $ref: '#/components/schemas/AlertHistoryRecord' + pagination: + $ref: '#/components/schemas/Pagination' + examples: + ResolvedAlertsForSystem: + summary: Two resolved alerts on this system + description: | + Records are scoped to the path system's `system_key`. The + response is identical in shape to `GET /alerts/history` but + filtered to one system without needing to pass it as a + query param. + value: + code: 200 + message: "alert history retrieved successfully" + data: + alerts: + - id: 55 + organization_id: "m4m3mdjdiizs" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + alertname: "PlainBodyTest" + severity: "critical" + status: "resolved" + fingerprint: "11a9302b0fa6526e" + starts_at: "2026-05-12T07:46:50Z" + ends_at: "2026-05-12T07:51:50Z" + summary: "plain body check" + labels: + alertname: "PlainBodyTest" + severity: "critical" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + annotations: + summary: "plain body check" + description: "resolved" + receiver: "severity-critical-receiver" + created_at: "2026-05-12T07:52:00Z" + - id: 54 + organization_id: "m4m3mdjdiizs" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + alertname: "HistFlowTest" + severity: "critical" + status: "resolved" + fingerprint: "9c1a23e87f4d0a11" + starts_at: "2026-05-12T08:01:07Z" + ends_at: "2026-05-12T08:06:06Z" + summary: "history flow check" + labels: + alertname: "HistFlowTest" + severity: "critical" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + annotations: + summary: "history flow check" + description: "resolved" + 
receiver: "severity-critical-receiver" + created_at: "2026-05-12T08:11:30Z" + pagination: + page: 1 + page_size: 50 + total_count: 2 + total_pages: 1 + has_next: false + has_prev: false + EmptyHistory: + summary: No history rows for this system yet + value: + code: 200 + message: "alert history retrieved successfully" + data: + alerts: [] + pagination: + page: 1 + page_size: 50 + total_count: 0 + total_pages: 0 + has_next: false + has_prev: false + '400': + $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' '404': - description: System not found or deleted - content: - application/json: - schema: - $ref: '#/components/schemas/ErrorResponse' - '500': - $ref: '#/components/responses/InternalServerError' + $ref: '#/components/responses/NotFound' - /systems/inventory: - post: - operationId: collectSystemInventory - servers: - - url: https://collect.your-domain.com/api - description: Collect API server (port 8081) + # =========================================================================== + # SYSTEM BACKUPS + # =========================================================================== + + /systems/{id}/alerts/silences: + get: + operationId: getSystemAlertSilences tags: - - Collect - Systems - summary: /systems/inventory - Collect system inventory - description: System inventory collection endpoint with HTTP Basic authentication + - Backend - Alerts (Per-System) + summary: "/systems/{id}/alerts/silences - List active silences for a system" + description: | + Returns all active and pending Alertmanager silences scoped to the target system. + Expired silences are excluded. Results are filtered server-side to silences that carry + an exact `system_key` matcher matching the system's key. + Requires `read:systems` permission. 
security: - - BasicAuth: [system_key:system_secret] - requestBody: - required: true - description: Raw inventory JSON from the system (structure varies by system type) - content: - application/json: - schema: - type: object - description: Raw inventory data sent directly from the system - additionalProperties: true - example: { - "$schema": "https://schema.nethserver.org/facts/2022-12.json", - "uuid": "659d2fbe-792f-4a0d-ae58-f278304d4f7f", - "installation": "nethserver", - "facts": { - "cluster": { - "leader_node_id": "1", - "user_domains": [], - "subscription": "community", - "ui_name": "MyNethServer 8" - }, - "nodes": { - "1": { - "cluster_leader": true, - "fqdn": "rl1.dp.nethserver.net", - "default_ipv4": "165.22.17.26", - "default_ipv6": "2a03:b0c0:3:f0:0:1:dbfe:3000", - "version": "3.17.0-dev.6", - "ui_name": "MyNodeRl1" - } - }, - "modules": [ - { - "id": "mail1", - "version": "1.7.4", - "name": "mail", - "node": "1", - "ui_name": "MyMail" - } - ] - } - } + - BearerAuth: [] + parameters: + - name: id + in: path + required: true + description: System ID (database UUID) + schema: + type: string + example: "sys_123456789" responses: - '202': - description: Inventory received and queued for processing + '200': + description: Silences retrieved successfully content: application/json: schema: @@ -10767,107 +11005,132 @@ paths: example: 200 message: type: string - example: "Inventory received and queued for processing" - data: - type: object - properties: - data_size: - type: integer - description: Inventory data size in bytes - example: 10145 - message: - type: string - description: A message from server - example: "Your inventory data has been received and will be processed shortly" - queue_status: - type: string - description: Queue status message - example: "queued" - system_key: - type: string - description: System KEY - example: "NETH-4cf3053f-d0d5-4b10-b752-ff8f7b63c2f7" - timestamp: - type: string - format: date-time - description: Timestamp of received 
inventory - example: "2025-07-16T15:46:51.571831+02:00" - '400': - $ref: '#/components/responses/BadRequest' - '401': - $ref: '#/components/responses/Unauthorized' - '413': - description: Request Entity Too Large - content: - application/json: - schema: - type: object - properties: - code: - type: integer - example: 413 - message: - type: string - example: "Request too large" - data: - type: object - properties: - max_size_bytes: - type: integer - description: Maximum allowed request size in bytes - example: 10485760 - received_bytes: - type: integer - description: Size of received request in bytes - example: 20971520 - '500': - description: Internal Server Error - content: - application/json: - schema: - type: object - properties: - code: - type: integer - example: 500 - message: - type: string - example: "Failed to process inventory" + example: "silences retrieved successfully" data: type: object properties: - error: - type: string - example: "Processing queue unavailable" - - /systems/heartbeat: + silences: + type: array + items: + $ref: '#/components/schemas/AlertmanagerSilence' + examples: + ActiveSilence: + summary: One active silence for the system + description: | + The silence matches on `system_key` (server-injected) plus the + labels that uniquely identified the alert at silence creation + time. `silencedBy` on the active alert references this same + `id`. 
+ value: + code: 200 + message: "silences retrieved successfully" + data: + silences: + - id: "d9f91c6e-1b33-484e-befa-bfb41020e178" + matchers: + - name: "system_key" + value: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + isRegex: false + - name: "alertname" + value: "DiskFilling" + isRegex: false + - name: "severity" + value: "warning" + isRegex: false + startsAt: "2026-05-12T08:16:36Z" + endsAt: "2026-05-12T09:16:36Z" + updatedAt: "2026-05-12T08:16:36Z" + createdBy: "R1C1 Admin " + comment: "silenced during maintenance window" + status: + state: "active" + NoSilences: + summary: No silences for the system + value: + code: 200 + message: "silences retrieved successfully" + data: + silences: [] + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' post: - operationId: sendSystemHeartbeat - servers: - - url: https://collect.your-domain.com/api - description: Collect API server (port 8081) + operationId: createSystemAlertSilence tags: - - Collect - Systems - summary: /systems/heartbeat - System heartbeat - description: System heartbeat endpoint to track system liveness (every 10 minutes) + - Backend - Alerts (Per-System) + summary: "/systems/{id}/alerts/silences - Create a silence for a system alert" + description: | + Creates an Alertmanager silence for a specific active alert on the target system. + The request identifies the live alert by fingerprint. The backend resolves the alert, + builds the silence matchers server-side, and always uses the system's authoritative + `system_key`. + If `end_at` is provided it takes precedence over `duration_minutes`. + Requires `manage:systems` permission. 
security: - - BasicAuth: [system_key:system_secret] + - BearerAuth: [] + parameters: + - name: id + in: path + required: true + description: System ID (database UUID) + schema: + type: string + example: "sys_123456789" requestBody: required: true content: application/json: schema: type: object + required: + - fingerprint properties: - system_key: + fingerprint: type: string - description: System KEY sending the heartbeat - example: "NETH-4cf3053f-d0d5-4b10-b752-ff8f7b63c2f7" - required: - - system_key + description: Fingerprint of the active alert to silence + example: "8f2d65896d4bcf97" + comment: + type: string + description: Optional silence comment. Defaults to a system-generated value when empty. + example: "silenced during maintenance" + duration_minutes: + type: integer + minimum: 1 + maximum: 10080 + description: Optional silence duration in minutes. Defaults to 60 when omitted. Ignored when end_at is set. + example: 60 + end_at: + type: string + format: date-time + description: Optional explicit end time (RFC3339). Takes precedence over duration_minutes. + example: "2024-01-01T02:00:00Z" + examples: + ExplicitEndAt: + summary: Silence until a specific date/time + description: | + When `end_at` is set, the silence expires at that moment + regardless of `duration_minutes`. The backend resolves the + alert by `fingerprint`, attaches the system's authoritative + `system_key` to the matchers, and creates the silence. + value: + fingerprint: "0a9d04bb6eed523f" + comment: "silenced during maintenance window" + end_at: "2026-05-12T09:16:36Z" + DurationBased: + summary: Silence for the next 60 minutes + description: | + Without `end_at`, `duration_minutes` applies. If both are + omitted, the silence defaults to 60 minutes from creation. 
+ value: + fingerprint: "0a9d04bb6eed523f" + comment: "investigating" + duration_minutes: 60 responses: '200': - description: Heartbeat acknowledged + description: Alert silence created successfully content: application/json: schema: @@ -10878,117 +11141,65 @@ paths: example: 200 message: type: string - example: "heartbeat acknowledged" + example: "alert silenced successfully" data: type: object properties: - system_key: - type: string - description: System KEY - example: "NETH-4cf3053f-d0d5-4b10-b752-ff8f7b63c2f7" - acknowledged: - type: boolean - description: Whether heartbeat was acknowledged - example: true - last_heartbeat: + silence_id: type: string - format: date-time - description: Timestamp of this heartbeat - example: "2025-07-21T10:25:00Z" + example: "4e6f0c30-c383-4e22-9443-0d7b6a8bd40b" + examples: + Created: + summary: Silence created + description: | + `silence_id` is the Alertmanager-assigned UUID; use it to + look up, update, or delete the silence later. The + corresponding `silenced` event is appended to the alert's + activity timeline (`GET /alerts/{fingerprint}/activity`). 
+ value: + code: 200 + message: "alert silenced successfully" + data: + silence_id: "d9f91c6e-1b33-484e-befa-bfb41020e178" '400': $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - '500': - $ref: '#/components/responses/InternalServerError' + '404': + $ref: '#/components/responses/NotFound' - /systems/backups: - post: - operationId: uploadSystemBackup - servers: - - url: https://collect.your-domain.com/api - description: Collect API server (port 8081) + /systems/{id}/alerts/silences/{silence_id}: + get: + operationId: getSystemAlertSilence tags: - - Collect - Systems - summary: /systems/backups - Upload a configuration backup + - Backend - Alerts (Per-System) + summary: "/systems/{id}/alerts/silences/{silence_id} - Get a single silence" description: | - Stream a GPG-encrypted configuration snapshot to the backup store. - The body is the ciphertext; SHA-256 is computed at ingest. The - appliance authenticates with HTTP Basic auth using its - `system_key:system_secret`. - - Server-side limits enforced before/during ingest: - - Per-system rate limit (default 6/min, 60/hour) — `429 Retry-After` on hit - - Per-system slot cap (default 10) — oldest pruned at retention - - Per-system size cap (default 500 MiB total) — oldest pruned - - Per-org aggregate quota (default 100 GiB) — `413` on hit - - Per-upload size cap (default 2 GiB) — `413` on hit - - Object key shape: `{org_id}/{system_key}/{uuidv7}.{ext}`. The - appliance never picks the destination prefix; it is server-derived - from the authenticated identity. + Returns a specific Alertmanager silence after verifying it belongs to the target system + via its `system_key` matcher. + Requires `read:systems` permission. 
security: - - BasicAuth: [system_key:system_secret] + - BearerAuth: [] parameters: - - name: X-Filename - in: header - required: false - description: User-facing filename (sanitized server-side) + - name: id + in: path + required: true + description: System ID (database UUID) schema: type: string - example: "daily-backup-2026-04-12.tar.gz.gpg" - requestBody: - required: true - content: - application/octet-stream: - schema: - type: string - format: binary - responses: - '201': - description: Backup stored - content: - application/json: - schema: - type: object - properties: - code: - type: integer - example: 201 - message: - type: string - example: "backup stored" - data: - $ref: '#/components/schemas/BackupMetadata' - '400': - $ref: '#/components/responses/BadRequest' - '401': - $ref: '#/components/responses/Unauthorized' - '413': - $ref: '#/components/responses/RequestEntityTooLarge' - '429': - description: Rate limit exceeded; retry after the time indicated by `Retry-After`. - '503': - description: Backup storage or org-quota service unavailable; retry later. - get: - operationId: listSystemBackupsAppliance - servers: - - url: https://collect.your-domain.com/api - description: Collect API server (port 8081) - tags: - - Collect - Systems - summary: /systems/backups - List backups for the authenticated system - description: | - Returns metadata for every backup currently stored under the - authenticated system's prefix. Cross-tenant access is impossible - — the listing scope is server-derived from the credentials. 
- security: - - BasicAuth: [system_key:system_secret] + example: "sys_123456789" + - name: silence_id + in: path + required: true + description: Alertmanager silence ID + schema: + type: string + example: "4e6f0c30-c383-4e22-9443-0d7b6a8bd40b" responses: '200': - description: Backups listed + description: Silence retrieved successfully content: application/json: schema: @@ -10997,122 +11208,232 @@ paths: code: type: integer example: 200 + message: + type: string + example: "silence retrieved successfully" data: type: object properties: - backups: - type: array - items: - $ref: '#/components/schemas/BackupMetadata' - '401': - $ref: '#/components/responses/Unauthorized' - '503': - description: Backup storage unavailable. - - /systems/backups/{id}: - get: - operationId: downloadSystemBackupAppliance - servers: - - url: https://collect.your-domain.com/api - description: Collect API server (port 8081) + silence: + $ref: '#/components/schemas/AlertmanagerSilence' + examples: + ActiveSilence: + summary: Silence found and active + value: + code: 200 + message: "silence retrieved successfully" + data: + silence: + id: "d9f91c6e-1b33-484e-befa-bfb41020e178" + matchers: + - name: "system_key" + value: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + isRegex: false + - name: "alertname" + value: "DiskFilling" + isRegex: false + - name: "severity" + value: "warning" + isRegex: false + startsAt: "2026-05-12T08:16:36Z" + endsAt: "2026-05-12T09:16:36Z" + updatedAt: "2026-05-12T08:16:36Z" + createdBy: "R1C1 Admin " + comment: "silenced during maintenance window" + status: + state: "active" + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' + put: + operationId: updateSystemAlertSilence tags: - - Collect - Systems - summary: /systems/backups/{id} - Download a backup + - Backend - Alerts (Per-System) + summary: "/systems/{id}/alerts/silences/{silence_id} - Update a silence" 
description: | - Stream the ciphertext body for a specific backup belonging to the - authenticated system. Foreign IDs return `404`; the path can never - escape the authenticated system's prefix. + Updates the end time and/or comment of an existing silence. Preserves the original + matchers and start time. Ownership is verified via the `system_key` matcher. + Requires `manage:systems` permission. security: - - BasicAuth: [system_key:system_secret] + - BearerAuth: [] parameters: - name: id in: path required: true - description: Backup ID (UUIDv7 plus extension) + description: System ID (database UUID) schema: type: string - example: "01934fab-bc33-7890-a1b2-c3d4e5f6a7b8.tar.gz" + example: "sys_123456789" + - name: silence_id + in: path + required: true + description: Alertmanager silence ID + schema: + type: string + example: "4e6f0c30-c383-4e22-9443-0d7b6a8bd40b" + requestBody: + required: true + content: + application/json: + schema: + type: object + required: + - end_at + properties: + comment: + type: string + description: New comment for the silence. Defaults to previous value if empty. + example: "extended for maintenance window" + end_at: + type: string + format: date-time + description: New end time (RFC3339). Must be in the future. + example: "2024-01-01T04:00:00Z" + examples: + ExtendEndTime: + summary: Extend the silence by 3 more hours + description: | + Alertmanager treats an update as "create a new silence with + the same matchers and start_at, then drop the old one", so + the response carries a new `silence_id`. The activity + timeline records this as a `silence_updated` event. 
+ value: + comment: "extended for maintenance window" + end_at: "2026-05-12T12:16:36Z" responses: '200': - description: Backup body streamed as `application/octet-stream` + description: Silence updated successfully content: - application/octet-stream: + application/json: schema: - type: string - format: binary + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: "silence updated successfully" + data: + type: object + properties: + silence_id: + type: string + example: "4e6f0c30-c383-4e22-9443-0d7b6a8bd40b" + examples: + Updated: + summary: New silence id after update + value: + code: 200 + message: "silence updated successfully" + data: + silence_id: "f1e1c2a4-7e57-4b1a-aaa0-2b96c8b5a3aa" '400': $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' '404': $ref: '#/components/responses/NotFound' - '503': - description: Backup storage unavailable. delete: - operationId: deleteSystemBackupAppliance - servers: - - url: https://collect.your-domain.com/api - description: Collect API server (port 8081) + operationId: deleteSystemAlertSilence tags: - - Collect - Systems - summary: /systems/backups/{id} - Delete a backup + - Backend - Alerts (Per-System) + summary: "/systems/{id}/alerts/silences/{silence_id} - Disable a system alert silence" description: | - Remove a specific backup belonging to the authenticated system. - Foreign IDs return `404`. + Deletes a system-scoped Alertmanager silence after validating that the silence belongs + to the target system through the authoritative `system_key` matcher. + Requires `manage:systems` permission. 
security: - - BasicAuth: [system_key:system_secret] + - BearerAuth: [] parameters: - name: id in: path required: true + description: System ID (database UUID) + schema: + type: string + example: "sys_123456789" + - name: silence_id + in: path + required: true + description: Alertmanager silence ID schema: type: string + example: "4e6f0c30-c383-4e22-9443-0d7b6a8bd40b" responses: '200': - description: Backup deleted + description: Alert silence disabled successfully + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: "silence disabled successfully" + examples: + Disabled: + summary: Silence removed + description: | + The silence is expired (not hard-deleted) so it disappears + from `GET /silences` but stays referenced in the alert's + activity timeline as an `unsilenced` event. On the wire + the response also carries `"data": null`; the example + omits it to match the declared schema which only includes + `code` and `message`. 
+ value: + code: 200 + message: "silence disabled successfully" '400': $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' '404': $ref: '#/components/responses/NotFound' - # =========================================================================== - # VALIDATORS - # =========================================================================== - /validators/vat/{entity_type}: + /systems/{id}/backups: get: - operationId: validateVAT + operationId: getSystemBackups tags: - - Backend - Validators - summary: /validators/vat/{entity_type} - Validate VAT number - description: Check if a VAT number exists in the specified entity type (distributors, resellers, customers) + - Backend - Systems Backups + summary: /systems/{id}/backups - List configuration backups for a system + description: | + Returns the list of configuration backups stored for the system, + together with aggregate usage counters. Backups are produced by + the appliance itself (see collect ingest endpoint) and consumed + here read-only. + + Each entry carries `size`, `sha256`, and `uploaded_at`. The + peer IP observed at ingest is intentionally not exposed: on + traffic that transits the translation proxy the recorded value + would be the proxy's IP, and even when it is accurate it is a + reconnaissance aid for higher-tier admins. + + Access is gated by the same RBAC rules as `GET /systems/{id}`: + the caller must belong to the organization that currently owns + the system. After a cross-org reassignment, the new owner sees + the full backup list and the previous owner loses visibility. 
parameters: - - name: entity_type + - name: id in: path required: true - description: Type of entity to check VAT against - schema: - type: string - enum: [distributors, resellers, customers] - example: "customers" - - name: vat - in: query - required: true - description: VAT number to validate - schema: - type: string - example: "12345678901" - - name: exclude_id - in: query - required: false - description: Entity ID to exclude from check (useful for updates) + description: System ID schema: type: string - example: "cust_123456789" + example: "sys_123456789" responses: '200': - description: VAT validation completed successfully + description: Backups retrieved successfully content: application/json: schema: @@ -11123,32 +11444,73 @@ paths: example: 200 message: type: string - example: "VAT validation completed" + example: "backups retrieved successfully" data: - $ref: '#/components/schemas/VATValidationResponse' - '400': - $ref: '#/components/responses/BadRequest' + type: object + properties: + backups: + type: array + items: + $ref: '#/components/schemas/BackupMetadata' + quota_used_bytes: + type: integer + format: int64 + description: Sum of backup sizes stored for this system + example: 314572800 + slots_used: + type: integer + description: Number of backups currently stored + example: 4 '401': $ref: '#/components/responses/Unauthorized' - '500': - $ref: '#/components/responses/InternalServerError' - - # =========================================== - # REBRANDING ENDPOINTS (Backend) - # =========================================== + '403': + $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' + '502': + description: Backup storage unreachable or returned an error + '503': + description: Backup storage is not configured - /rebranding/products: + /systems/{id}/backups/{backup_id}/download: get: - operationId: getRebrandingProducts + operationId: downloadSystemBackup tags: - - Backend - Rebranding - summary: List rebrandable 
products - description: Returns all products that support rebranding - security: - - BearerAuth: [] + - Backend - Systems Backups + summary: /systems/{id}/backups/{backup_id}/download - Issue a short-lived download URL + description: | + Returns a short-lived presigned S3 URL that the user's browser + uses to stream the backup object directly from storage. The API + never proxies the object body itself. + + The backend does not perform a redirect because the frontend + sends its JWT on the initial request — browsers would drop the + `Authorization` header when following a 3xx redirect, so the + frontend receives the URL in the JSON response and navigates to + it explicitly. + + The presigned URL's lifetime is controlled by the + `BACKUP_PRESIGN_TTL` environment variable (default 5 minutes). + + Access is gated by the same RBAC rules as `GET /systems/{id}`. + parameters: + - name: id + in: path + required: true + description: System ID + schema: + type: string + example: "sys_123456789" + - name: backup_id + in: path + required: true + description: Backup object ID (UUIDv7 + extension, e.g. "01934f...tar.gz") + schema: + type: string + example: "01934fab-bc33-7890-a1b2-c3d4e5f6a7b8.tar.gz" responses: '200': - description: Rebrandable products retrieved + description: Download URL issued content: application/json: schema: @@ -11159,80 +11521,57 @@ paths: example: 200 message: type: string + example: "download URL issued" data: type: object properties: - products: - type: array - items: - $ref: '#/components/schemas/RebrandableProduct' + download_url: + type: string + format: uri + description: Short-lived presigned S3 URL + expires_in_seconds: + type: integer + description: How long the URL remains valid + example: 300 '401': $ref: '#/components/responses/Unauthorized' - - /rebranding/{org_id}/enable: - patch: - operationId: enableRebranding - tags: - - Backend - Rebranding - summary: Enable rebranding for an organization - description: Owner-only. 
Enables rebranding capability for a specific organization. - security: - - BearerAuth: [] - parameters: - - name: org_id - in: path - required: true - schema: - type: string - description: Logto organization ID (logto_id, not the database UUID) - responses: - '200': - description: Rebranding enabled - '400': - $ref: '#/components/responses/BadRequest' '403': $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' + '502': + description: Backup storage unreachable or returned an error + '503': + description: Backup storage is not configured - /rebranding/{org_id}/disable: - patch: - operationId: disableRebranding + delete: + operationId: deleteSystemBackup tags: - - Backend - Rebranding - summary: Disable rebranding for an organization - description: Owner-only. Disables rebranding for a specific organization. - security: - - BearerAuth: [] + - Backend - Systems Backups + summary: /systems/{id}/backups/{backup_id} - Delete a stored backup + description: | + Deletes a backup object from storage. The operation is final — + storage uses object-level deletion, not a soft-delete table. + + Access is gated by the same RBAC rules as `GET /systems/{id}`. 
parameters: - - name: org_id + - name: id in: path required: true + description: System ID schema: type: string - description: Logto organization ID (logto_id, not the database UUID) - responses: - '200': - description: Rebranding disabled - '403': - $ref: '#/components/responses/Forbidden' - - /rebranding/{org_id}/status: - get: - operationId: getRebrandingStatus - tags: - - Backend - Rebranding - summary: Get rebranding status for an organization - security: - - BearerAuth: [] - parameters: - - name: org_id + example: "sys_123456789" + - name: backup_id in: path required: true + description: Backup object ID schema: type: string - description: Logto organization ID (logto_id, not the database UUID) + example: "01934fab-bc33-7890-a1b2-c3d4e5f6a7b8.tar.gz" responses: '200': - description: Rebranding status retrieved + description: Backup deleted content: application/json: schema: @@ -11240,31 +11579,64 @@ paths: properties: code: type: integer + example: 200 message: type: string + example: "backup deleted" data: - $ref: '#/components/schemas/RebrandingOrgStatus' + type: object + properties: + system_id: + type: string + example: "sys_123456789" + backup_id: + type: string + example: "01934fab-bc33-7890-a1b2-c3d4e5f6a7b8.tar.gz" + '401': + $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' + '502': + description: Backup storage unreachable or returned an error + '503': + description: Backup storage is not configured - /rebranding/{org_id}/products: + # =========================================================================== + # APPLICATIONS MANAGEMENT + # =========================================================================== + /applications: get: - operationId: getRebrandingOrgProducts + operationId: getApplications tags: - - Backend - Rebranding - summary: Get rebranding products for an organization - security: - - BearerAuth: [] + - Backend - Applications + 
summary: /applications - List applications + description: | + Get list of system applications visible to the user based on hierarchical organization permissions. + Supports filtering by type, version, system, organization, and status. + + **Query String Examples:** + + 1. **Single type filter**: `?type=mail` + 2. **Multiple types filter**: `?type=mail&type=webtop` + 3. **Status filter**: `?status=unassigned` + 4. **With pagination and sorting**: `?page=1&page_size=50&sort_by=instance_of&sort_direction=asc` + 5. **Combined filters**: `?type=mail&status=assigned&organization_id=org_abc123` parameters: - - name: org_id - in: path - required: true - schema: - type: string - description: Logto organization ID (logto_id, not the database UUID) + - $ref: '#/components/parameters/PageParam' + - $ref: '#/components/parameters/PageSizeParam' + - $ref: '#/components/parameters/SearchParam' + - $ref: '#/components/parameters/AppSortByParam' + - $ref: '#/components/parameters/SortDirectionParam' + - $ref: '#/components/parameters/AppTypeFilterParam' + - $ref: '#/components/parameters/AppVersionFilterParam' + - $ref: '#/components/parameters/AppSystemFilterParam' + - $ref: '#/components/parameters/AppOrganizationFilterParam' + - $ref: '#/components/parameters/AppStatusFilterParam' responses: '200': - description: Rebranding products retrieved + description: Applications retrieved successfully content: application/json: schema: @@ -11272,182 +11644,117 @@ paths: properties: code: type: integer + example: 200 message: type: string + example: "applications retrieved successfully" data: - $ref: '#/components/schemas/RebrandingOrgStatus' + type: object + properties: + applications: + type: array + items: + $ref: '#/components/schemas/ApplicationListItem' + pagination: + $ref: '#/components/schemas/Pagination' + '401': + $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - /rebranding/{org_id}/products/{product_id}: - put: - operationId: 
uploadRebrandingAssets - tags: - - Backend - Rebranding - summary: Upload rebranding assets for a product - description: | - Multipart upload of logos, favicon, background, and product name. - All fields optional. Only provided fields are updated. - Requires rebranding to be enabled for the organization. - security: - - BearerAuth: [] - parameters: - - name: org_id - in: path - required: true - schema: - type: string - description: Logto organization ID (logto_id, not the database UUID) - - name: product_id - in: path - required: true - schema: - type: string - requestBody: - content: - multipart/form-data: - schema: - type: object - properties: - product_name: - type: string - maxLength: 100 - logo_light_rect: - type: string - format: binary - logo_dark_rect: - type: string - format: binary - logo_light_square: - type: string - format: binary - logo_dark_square: - type: string - format: binary - favicon: - type: string - format: binary - background_image: - type: string - format: binary - responses: - '200': - description: Assets uploaded - '400': - $ref: '#/components/responses/BadRequest' - '403': - $ref: '#/components/responses/Forbidden' - delete: - operationId: deleteRebrandingProduct - tags: - - Backend - Rebranding - summary: Delete all rebranding assets for a product - security: - - BearerAuth: [] - parameters: - - name: org_id - in: path - required: true - schema: - type: string - description: Logto organization ID (logto_id, not the database UUID) - - name: product_id - in: path - required: true - schema: - type: string + /applications/totals: + get: + operationId: getApplicationTotals + tags: + - Backend - Applications Stats + summary: /applications/totals - Get application statistics + description: Get statistics about applications including counts by type and status responses: '200': - description: Product rebranding deleted + description: Application totals retrieved successfully + content: + application/json: + schema: + type: object + properties: + 
code: + type: integer + example: 200 + message: + type: string + example: "application totals retrieved successfully" + data: + $ref: '#/components/schemas/ApplicationTotals' + '401': + $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - /rebranding/{org_id}/products/{product_id}/{asset}: + /applications/summary: get: - operationId: getRebrandingAsset + operationId: getApplicationTypeSummary tags: - - Backend - Rebranding - summary: Get a rebranding asset binary - security: - - BearerAuth: [] + - Backend - Applications Stats + summary: /applications/summary - Get applications grouped by type + description: | + Returns applications grouped by instance_of (type) with a counter. + Hierarchically authorized: the caller only sees applications on systems within their organization tree. + Optionally filterable by organization_id to restrict to a specific organization or by system_id to restrict to a specific system. + When include_hierarchy is true, also includes applications from all child organizations. parameters: - - name: org_id - in: path - required: true + - name: organization_id + in: query + required: false + description: Logto ID of a specific organization to filter by. Must be within the caller's hierarchy. schema: type: string - description: Logto organization ID (logto_id, not the database UUID) - - name: product_id - in: path - required: true + - name: system_id + in: query + required: false + description: UUID of a specific system to filter by. Must be within the caller's hierarchy. 
schema: type: string - - name: asset - in: path - required: true + - name: include_hierarchy + in: query + required: false + description: When true (and organization_id is provided), includes applications from all child organizations in the hierarchy schema: - type: string - enum: [logo_light_rect, logo_dark_rect, logo_light_square, logo_dark_square, favicon, background_image] - responses: - '200': - description: Asset binary - content: - image/*: - schema: - type: string - format: binary - '404': - $ref: '#/components/responses/NotFound' - delete: - operationId: deleteRebrandingAsset - tags: - - Backend - Rebranding - summary: Delete a single rebranding asset - security: - - BearerAuth: [] - parameters: - - name: org_id - in: path - required: true + type: boolean + default: false + - name: page + in: query + required: false + description: Page number for paginating the by_type array (1-based). If omitted, all types are returned. schema: - type: string - description: Logto organization ID (logto_id, not the database UUID) - - name: product_id - in: path - required: true + type: integer + minimum: 1 + - name: page_size + in: query + required: false + description: Number of application types per page (max 100). If omitted, all types are returned. 
+ schema: + type: integer + minimum: 1 + maximum: 100 + - name: sort_by + in: query + required: false + description: Field to sort by_type results by schema: type: string - - name: asset - in: path - required: true + enum: [count, created_at, instance_of] + default: count + - name: sort_direction + in: query + required: false + description: Sort direction schema: type: string - enum: [logo_light_rect, logo_dark_rect, logo_light_square, logo_dark_square, favicon, background_image] - responses: - '200': - description: Asset deleted - '404': - $ref: '#/components/responses/NotFound' - - # =========================================== - # REBRANDING ENDPOINTS (Collect) - # =========================================== - - /systems/rebranding: - get: - operationId: getSystemRebranding - tags: - - Collect - Rebranding - summary: Get system rebranding configuration - description: | - Returns rebranding configuration for the authenticated system. - Resolves hierarchy inheritance (customer -> reseller -> distributor). - Response groups products by type (system vs application). 
- security: - - BasicAuth: [] + enum: [asc, desc] + default: desc responses: '200': - description: System rebranding configuration + description: Application type summary retrieved successfully content: application/json: schema: @@ -11455,105 +11762,72 @@ paths: properties: code: type: integer + example: 200 message: type: string + example: "application type summary retrieved successfully" data: - $ref: '#/components/schemas/SystemRebrandingResponse' + $ref: '#/components/schemas/ApplicationTypeSummary' '401': $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' - /systems/rebranding/{product_id}/{asset}: + /applications/trend: get: - operationId: getSystemRebrandingAsset + operationId: getApplicationsTrend tags: - - Collect - Rebranding - summary: Get a rebranding asset binary for the system - description: Serves asset binary with hierarchy resolution - security: - - BasicAuth: [] + - Backend - Applications Stats + summary: /applications/trend - Get applications trend data + description: Get trend data for applications over a specified period showing daily counts parameters: - - name: product_id - in: path - required: true - schema: - type: string - - name: asset - in: path - required: true + - name: period + in: query + required: false + description: Number of days to include in trend data (default 7, max 365) schema: - type: string - enum: [logo_light_rect, logo_dark_rect, logo_light_square, logo_dark_square, favicon, background_image] + type: integer + default: 7 + minimum: 1 + maximum: 365 responses: '200': - description: Asset binary + description: Applications trend data retrieved successfully content: - image/*: + application/json: schema: - type: string - format: binary - '404': - $ref: '#/components/responses/NotFound' - - # =========================================== - # ALERTING ENDPOINTS (Backend - Totals & Trend) - # =========================================== + type: object + properties: + code: + type: 
integer + example: 200 + message: + type: string + example: "applications trend retrieved successfully" + data: + $ref: '#/components/schemas/TrendResponse' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' - /alerts/totals: + /filters/applications: get: - operationId: getAlertsTotals + operationId: getApplicationFilters tags: - - Backend - Alerts - summary: Get alert totals - description: | - Returns active alert counts by severity (from Mimir, per-tenant) and total resolved - alert history count (from DB). Requires `read:systems` permission. - - **Scope modes** (selected by query params): - - | `organization_id` | `include` | Result | - |---|---|---| - | omitted | — | Caller's full hierarchy (recursive). For Customer it's just self. | - | `X` | omitted | Single tenant `X` only. Resellers/Distributors hold no alerts on their own tenant — those live on their customer tenants — so single-tenant queries on a non-leaf org typically return zero. | - | `X` (repeated for multi) | omitted | Union of all `organization_id` values passed. Each must be in the caller's hierarchy (Owner exempt). | - | `X` (single or multi) | `descendants` | Each org_id is expanded to itself + its sub-tree (deduplicated). Use this to drill into one or more sub-trees. | - - Active counts are aggregated across the resolved scope by fanning out to Mimir, - one request per tenant, with bounded concurrency and a global timeout. Per-tenant - failures (timeout, 5xx, parse error) are non-fatal: their counts simply don't - contribute, and the failure is reported in the `warnings` array. The `history` - total comes from a single SQL query scoped to the same set of organization IDs. - - Customer callers are always pinned to their own organization regardless of - `organization_id`/`include` (Mimir tenant is fixed to `user.organization_id`). 
- security: - - BearerAuth: [] - parameters: - - name: organization_id - in: query - description: | - Target organization ID(s). Repeat the param to pass multiple values - (`?organization_id=A&organization_id=B`). Optional for all roles except - Customer (where it is ignored). Distributors/Resellers receive `403` if any - value is not in their hierarchy. - schema: - type: array - items: - type: string - style: form - explode: true - - name: include - in: query - description: | - Set to `descendants` together with `organization_id` to expand each value - to its full sub-tree (results deduplicated). Ignored when `organization_id` - is omitted (the caller's own hierarchy is already used) and when the caller - is a Customer. - schema: - type: string - enum: [descendants] + - Backend - Filters + summary: /filters/applications - Get aggregated application filters + description: | + Aggregated endpoint that returns all application filter data in a single request. + RBAC is resolved once, then types, versions, systems, and organizations are fetched in parallel. + + **Version Format**: Returns versions in prefixed format `type:version` (e.g., `"nethvoice:1.5.3"`, `"mail:1.7.4"`) + + **Organizations special value**: If there are applications without an assigned organization, the response includes + a special "No organization" entry with `id: "no_org"` and `type: "unassigned"`. 
responses: '200': - description: Alert totals retrieved + description: Application filters retrieved successfully content: application/json: schema: @@ -11564,89 +11838,93 @@ paths: example: 200 message: type: string - example: alert totals retrieved successfully + example: "application filters retrieved successfully" data: type: object properties: - active: - type: integer - description: Total active alerts in scope - critical: - type: integer - description: Active critical alerts in scope - warning: - type: integer - description: Active warning alerts in scope - info: - type: integer - description: Active info alerts in scope - muted: - type: integer - description: Active alerts currently silenced (Alertmanager `silencedBy` non-empty) - history: - type: integer - description: Total resolved alerts in history (DB) in scope - warnings: + types: type: array - description: | - Per-tenant errors encountered during fan-out. Always present - (empty array when every tenant responded OK). Each entry is a - string in the form `org : ` or - `history: ` for the DB lookup. 
items: - type: string + $ref: '#/components/schemas/ApplicationType' + versions: + type: array + items: + type: object + properties: + application: + type: string + description: Application type (instance_of) + example: "nethvoice" + name: + type: string + description: Human-readable application name + example: "NethVoice" + versions: + type: array + items: + type: string + description: Prefixed versions (type:version) + example: ["nethvoice:1.5.4", "nethvoice:1.5.3"] + systems: + type: array + items: + $ref: '#/components/schemas/ApplicationSystemSummary' + organizations: + type: array + items: + $ref: '#/components/schemas/OrganizationSummary' + example: + code: 200 + message: "application filters retrieved successfully" + data: + types: + - instance_of: "nethvoice" + name: "NethVoice" + count: 15 + - instance_of: "mail" + name: "Mail" + count: 8 + versions: + - application: "mail" + name: "Mail" + versions: ["mail:1.7.4"] + - application: "nethvoice" + name: "NethVoice" + versions: ["nethvoice:1.5.4", "nethvoice:1.5.3"] + systems: + - id: "abc123-system-uuid" + name: "Production Server" + - id: "def456-system-uuid" + name: "Development Server" + organizations: + - id: "no_org" + logto_id: "no_org" + name: "No organization" + description: "" + type: "unassigned" + - id: "8b04e253-d408-4218-a30e-b048196847e5" + logto_id: "fso3biosnaqp" + name: "Acme Corp" + description: "" + type: "customer" '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - /alerts/trend: + /filters/users: get: - operationId: getAlertsTrend + operationId: getUserFilters tags: - - Backend - Alerts - summary: Get alert history trend + - Backend - Filters + summary: /filters/users - Get aggregated user filters description: | - Returns trend data for resolved alerts over a specified period with daily data points. - Compares the current period with the previous period of equal length. - Requires `read:systems` permission. 
- - Scope follows the same three modes as `/alerts/totals`: - - `organization_id` omitted → caller's full hierarchy. - - `organization_id=X` → single tenant `X`. - - `organization_id=X&include=descendants` → `X` plus its sub-tree. - - Customer callers are always pinned to their own organization regardless of params. - security: - - BearerAuth: [] - parameters: - - name: organization_id - in: query - description: | - Target organization ID(s). Repeat the param to pass multiple values. - Optional for all roles except Customer (where it is ignored). - schema: - type: array - items: - type: string - style: form - explode: true - - name: include - in: query - description: Set to `descendants` together with `organization_id` to expand each value to its sub-tree. - schema: - type: string - enum: [descendants] - - name: period - in: query - description: Trend period in days - schema: - type: integer - enum: [7, 30, 180, 365] - default: 7 + Aggregated endpoint that returns all user filter data in a single request. + Auth is checked once, then roles and organizations are fetched in parallel. + Respects RBAC hierarchy for both roles and organizations. 
responses: '200': - description: Alert trend data + description: User filters retrieved successfully content: application/json: schema: @@ -11657,105 +11935,100 @@ paths: example: 200 message: type: string - example: alerts trend retrieved successfully + example: "user filters retrieved successfully" data: type: object properties: - period: - type: integer - period_label: - type: string - current_total: - type: integer - previous_total: - type: integer - delta: - type: integer - delta_percentage: - type: number - trend: - type: string - enum: [up, down, stable] - data_points: + roles: + type: array + items: + $ref: '#/components/schemas/Role' + organizations: type: array items: type: object properties: - date: + id: type: string - format: date - count: - type: integer - '400': - $ref: '#/components/responses/BadRequest' + description: Organization Logto ID + example: "org_abc123" + name: + type: string + description: Organization name + example: "ACME Corp" + type: + type: string + enum: [distributor, reseller, customer] + description: Organization type + example: "customer" + example: + code: 200 + message: "user filters retrieved successfully" + data: + roles: + - id: "role_abc123" + name: "Admin" + description: "System administrator" + - id: "role_def456" + name: "Support" + description: "Support operator" + organizations: + - id: "org_abc123" + name: "ACME Corp" + type: "customer" + - id: "org_def456" + name: "TechStart Inc" + type: "reseller" '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - /alerts/stats: + /filters/alerts: get: - operationId: getAlertsStats + operationId: getAlertFilters tags: - - Backend - Alerts - summary: Aggregate alert statistics + - Backend - Filters + summary: /filters/alerts - Get aggregated alert filters description: | - Returns aggregate statistics over `alert_history` for the caller's scope: - total, severity buckets, top-N alertname and system_key by count, plus MTTR - (mean time 
to resolve) and MTBF (mean time between failures, approximated). + Aggregated endpoint for the alerts views' filter dropdowns. - Scope follows the same three modes as `/alerts/totals`: - - `organization_id` omitted → caller's full hierarchy. - - `organization_id=X` → single tenant. - - `organization_id=X&include=descendants` → sub-tree drill-down. + `systems`, `severities` and `organizations` are data-driven: only + values that actually appear in alert history within the caller's scope + are returned. `alerts` is a static catalog of every alert a NethServer + (NS8) or NethSecurity system can raise — it is NOT scoped to the + caller's data, so the dropdown can filter on alerts not yet received. - MTBF formula: - - When both `from_date` and `to_date` are provided: `(to - from) / total`. - - Otherwise: `(max(starts_at) - min(starts_at)) / (total - 1)`. - - Omitted from the response when the result is undefined. - security: - - BearerAuth: [] + Auth is checked once, then the data-driven datasets are fetched in + parallel. + + Scope follows the same rules as the other alerts endpoints: + `organization_id` omitted = caller's full hierarchy; one or more + `organization_id` = those tenants (validated, Owner exempt); + Customer is always pinned to its own organization. parameters: - name: organization_id in: query + required: false description: | - Target organization ID(s). Repeat the param to pass multiple values. + Tenant scope. Omit for the caller's full hierarchy, or repeat to + target specific tenants (each validated against the caller's + hierarchy; Owner exempt). schema: type: array items: type: string - style: form - explode: true + example: ["org_abc123"] - name: include in: query + required: false + description: When set to `descendants`, expand each organization_id to itself plus its sub-tree. schema: type: string enum: [descendants] - - name: from_date - in: query - description: Lower bound on `created_at` (inclusive). RFC3339 timestamp. 
- schema: - type: string - format: date-time - example: "2026-05-01T00:00:00Z" - - name: to_date - in: query - description: Upper bound on `created_at` (exclusive). RFC3339 timestamp. - schema: - type: string - format: date-time - example: "2026-05-08T00:00:00Z" - - name: top - in: query - description: Cap for top-N alertname / system_key buckets. Default 10, max 50. - schema: - type: integer - minimum: 1 - maximum: 50 - default: 10 responses: '200': - description: Alert stats retrieved + description: Alert filters retrieved successfully content: application/json: schema: @@ -11766,165 +12039,205 @@ paths: example: 200 message: type: string - example: alert stats retrieved successfully + example: "alert filters retrieved successfully" data: type: object properties: - total: - type: integer - description: Total alerts in scope (sum of severity buckets, including null severity). - by_severity: - type: object - additionalProperties: - type: integer - example: - critical: 30 - warning: 100 - info: 26 - top_alertnames: + systems: type: array items: type: object properties: - alertname: + id: type: string - count: - type: integer - top_systems: + description: System local ID + example: "sys_abc123" + name: + type: string + description: System display name + example: "Milan Office Server" + type: + type: string + description: System type (empty string if unknown) + example: "ns8" + key: + type: string + description: Unique system key + example: "ABCD1234" + alerts: type: array + description: | + Static catalog of every alert a NethServer (NS8) or + NethSecurity system can raise. Not scoped to the + caller's data — returned in full regardless of what + alert history exists. items: type: object properties: - system_key: + name: type: string - count: - type: integer - mttr_seconds: - type: integer - description: Mean time to resolve (avg `ends_at - starts_at` over rows with `ends_at` set). Omitted when no resolved alerts. 
- mtbf_seconds: - type: integer - description: Mean time between failures (approximation, see endpoint description). Omitted when undefined. - '400': - $ref: '#/components/responses/BadRequest' + description: Alert name (alertname label) + example: "DiskSpaceLow" + severity: + type: string + enum: [critical, warning, info] + description: Default severity for this alert type + example: "warning" + service: + type: string + description: Sub-service that raises the alert (omitted when not applicable) + example: "storage" + severities: + type: array + items: + type: string + description: Distinct severities present in alert history + example: ["critical", "warning"] + organizations: + type: array + items: + type: object + properties: + logto_id: + type: string + description: Organization Logto ID + example: "org_abc123" + name: + type: string + description: Organization name + example: "ACME Corp" + type: + type: string + enum: [distributor, reseller, customer] + description: Organization type + example: "customer" + example: + code: 200 + message: "alert filters retrieved successfully" + data: + systems: + - id: "sys_abc123" + name: "Milan Office Server" + type: "ns8" + key: "ABCD1234" + alerts: + - name: "DiskSpaceLow" + severity: "warning" + service: "storage" + - name: "ServiceDown" + severity: "critical" + severities: + - "critical" + - "warning" + organizations: + - logto_id: "org_abc123" + name: "ACME Corp" + type: "customer" '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - /alerts/history: + /applications/{id}: get: - operationId: getAlertsHistory + operationId: getApplicationById tags: - - Backend - Alerts - summary: Org-level paginated alert history - description: | - Returns paginated resolved alert history scoped to the caller's hierarchy - (no `organization_id`), a single tenant (`organization_id=X`), or a sub-tree - (`organization_id=X&include=descendants`). 
Mirrors the scope rules of - `/alerts/totals` and `/alerts/trend`. - - Supports date range (`from_date`/`to_date`, RFC3339) and multi-value label - filters (`system_key`, `alertname`, `severity`, `status`). All multi-value - filters: OR within the same filter, AND across filters. - - Customer callers are always pinned to their own organization regardless of params. - security: - - BearerAuth: [] + - Backend - Applications + summary: /applications/{id} - Get single application + description: Get a specific application by ID parameters: - - name: organization_id - in: query - description: | - Target organization ID(s). Repeat the param to pass multiple values. - Optional for all roles except Customer (where it is ignored). - schema: - type: array - items: - type: string - style: form - explode: true - - name: include - in: query - description: Set to `descendants` together with `organization_id` to expand each value to its sub-tree. - schema: - type: string - enum: [descendants] - - $ref: '#/components/parameters/PageParam' - - $ref: '#/components/parameters/PageSizeParam' - - name: sort_by - in: query - required: false - schema: - type: string - enum: [id, alertname, severity, status, starts_at, ends_at, created_at] - default: created_at - - name: sort_direction - in: query - required: false - description: | - Sort direction. Unlike the shared default of `asc`, this endpoint - defaults to `desc` so the natural "most recent first" ordering is - applied when the caller omits the param. - schema: - type: string - enum: [asc, desc] - default: desc - - name: from_date - in: query - description: Lower bound on `created_at` (inclusive). RFC3339 timestamp. + - name: id + in: path + required: true + description: Application ID schema: type: string - format: date-time - example: "2026-05-01T00:00:00Z" - - name: to_date - in: query - description: Upper bound on `created_at` (exclusive). RFC3339 timestamp. Must be after `from_date`. 
+ example: "sys_abc123_mail1" + responses: + '200': + description: Application retrieved successfully + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: "application retrieved successfully" + data: + $ref: '#/components/schemas/Application' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' + + put: + operationId: updateApplication + tags: + - Backend - Applications + summary: /applications/{id} - Update application + description: Update an application's notes (other fields are read-only and populated from inventory) + parameters: + - name: id + in: path + required: true + description: Application ID schema: type: string - format: date-time - example: "2026-05-08T00:00:00Z" - - name: system_key - in: query - description: | - Filter by one or more system keys. Repeat the param to pass multiple - values; results are matched as `system_key IN (...)`. - schema: - type: array - items: - type: string - style: form - explode: true - - name: alertname - in: query - description: Filter by alertname. Supports multiple values. - schema: - type: array - items: - type: string - style: form - explode: true - - name: severity - in: query - description: Filter by severity. Supports multiple values. - schema: - type: array - items: - type: string - enum: [critical, warning, info] - style: form - explode: true - - name: status - in: query - description: Filter by status. Supports multiple values. 
- schema: - type: array - items: - type: string - style: form - explode: true + example: "sys_abc123_mail1" + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/UpdateApplicationRequest' + responses: + '200': + description: Application updated successfully + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: "application updated successfully" + data: + $ref: '#/components/schemas/Application' + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' + + delete: + operationId: deleteApplication + tags: + - Backend - Applications + summary: /applications/{id} - Delete application + description: Soft-delete an application + parameters: + - name: id + in: path + required: true + description: Application ID + schema: + type: string + example: "sys_abc123_mail1" responses: '200': - description: Paginated alert history + description: Application deleted successfully content: application/json: schema: @@ -11935,146 +12248,165 @@ paths: example: 200 message: type: string - example: alert history retrieved successfully + example: "application deleted successfully" data: type: object - properties: - alerts: - type: array - items: - $ref: '#/components/schemas/AlertHistoryRecord' - pagination: - $ref: '#/components/schemas/Pagination' - examples: - TwoResolvedAlerts: - summary: Two resolved alerts on the same system - description: | - Result for a Customer caller. Same `system_key` appears in both - rows because they were fired against the same NS8 host. Each - row is a discrete event (firing → resolved) captured by the - history webhook at dispatch time; `created_at` records when - the row landed in `alert_history`. 
- value: - code: 200 - message: alert history retrieved successfully - data: - alerts: - - id: 55 - organization_id: "m4m3mdjdiizs" - system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - alertname: "PlainBodyTest" - severity: "critical" - status: "resolved" - fingerprint: "11a9302b0fa6526e" - starts_at: "2026-05-12T07:46:50Z" - ends_at: "2026-05-12T07:51:50Z" - summary: "plain body check" - labels: - alertname: "PlainBodyTest" - severity: "critical" - system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - annotations: - summary: "plain body check" - description: "checking html:'' fix" - receiver: "severity-critical-receiver" - created_at: "2026-05-12T07:52:00Z" - - id: 54 - organization_id: "m4m3mdjdiizs" - system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - alertname: "HistFlowTest" - severity: "critical" - status: "resolved" - fingerprint: "9c1a23e87f4d0a11" - starts_at: "2026-05-12T08:01:07Z" - ends_at: "2026-05-12T08:06:06Z" - summary: "history flow check" - labels: - alertname: "HistFlowTest" - severity: "critical" - system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - annotations: - summary: "history flow check" - description: "resolved" - receiver: "severity-critical-receiver" - created_at: "2026-05-12T08:11:30Z" - pagination: - page: 1 - page_size: 50 - total_count: 2 - total_pages: 1 - has_next: false - has_prev: false - EmptyHistory: - summary: No history rows match - value: - code: 200 - message: alert history retrieved successfully - data: - alerts: [] - pagination: - page: 1 - page_size: 50 - total_count: 0 - total_pages: 0 - has_next: false - has_prev: false + nullable: true + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' + + /applications/{id}/assign: + patch: + operationId: assignApplicationOrganization + tags: + - Backend - Applications + summary: /applications/{id}/assign - Assign organization + description: 
Assign an organization to an application + parameters: + - name: id + in: path + required: true + description: Application ID + schema: + type: string + example: "sys_abc123_mail1" + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/AssignApplicationRequest' + responses: + '200': + description: Organization assigned successfully + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: "organization assigned successfully" + data: + $ref: '#/components/schemas/Application' '400': $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' - # =========================================== - # ALERTING ENDPOINTS (Per-alert audit timeline) - # =========================================== - - /alerts/activity/{fingerprint}: - parameters: - - name: fingerprint - in: path - required: true - description: | - Alertmanager fingerprint of the alert (hex hash of its labels). - Stable across re-firings of the same alert. - schema: - type: string - pattern: '^[A-Za-z0-9._:-]{1,128}$' - - name: organization_id - in: query - required: true - description: Tenant the alert belongs to. Required for non-Customer roles. - schema: - type: string - get: - operationId: getAlertActivity + /applications/{id}/unassign: + patch: + operationId: unassignApplicationOrganization tags: - - Backend - Alerts - summary: Per-alert audit timeline (silence events) - description: | - Returns the audit timeline for the alert identified by `fingerprint`, most recent - first. Events are written transparently as silences are created, updated, or - removed via the `/api/alerts/silences` and `/api/systems/:id/alerts/silences` endpoints. 
- - Operator notes are stored as the silence `comment` (Alertmanager native), so a - note edit appears here as a `silence_updated` event whose `details` payload - includes the new comment. - - Requires `read:systems` permission. - security: - - BearerAuth: [] + - Backend - Applications + summary: /applications/{id}/unassign - Remove organization + description: Remove organization assignment from an application parameters: - - name: limit - in: query - description: Max events to return. Default 100, max 500. + - name: id + in: path + required: true + description: Application ID schema: - type: integer - minimum: 1 - maximum: 500 - default: 100 + type: string + example: "sys_abc123_mail1" + responses: + '200': + description: Organization unassigned successfully + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: "organization unassigned successfully" + data: + $ref: '#/components/schemas/Application' + '401': + $ref: '#/components/responses/Unauthorized' + '403': + $ref: '#/components/responses/Forbidden' + '404': + $ref: '#/components/responses/NotFound' + + # =========================================================================== + # OAUTH2/OIDC STANDARD ENDPOINTS (Third-party Applications) + # =========================================================================== + /user/permissions: + get: + operationId: getUserPermissions + tags: + - Backend - User + summary: /user/permissions - Get user permissions (OAuth2/OIDC) + description: Get current user permissions using standard OAuth2/OIDC flow with Logto token + responses: + '200': + description: User permissions retrieved successfully + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 200 + message: + type: string + example: "user permissions retrieved successfully" + data: + type: object + properties: + user_roles: + type: array + items: + type: string + 
example: ["Admin"] + user_permissions: + type: array + items: + type: string + example: ["manage:systems", "read:systems"] + org_role: + type: string + example: "Owner" + org_permissions: + type: array + items: + type: string + example: ["manage:resellers", "create:customers"] + organization_id: + type: string + example: "org_123456789" + organization_name: + type: string + example: "ACME Corp" + '401': + $ref: '#/components/responses/Unauthorized' + + /user/profile: + get: + operationId: getUserProfile + tags: + - Backend - User + summary: /user/profile - Get user profile (OAuth2/OIDC) + description: Get current user profile using standard OAuth2/OIDC flow with Logto token responses: '200': - description: Activity timeline + description: User profile retrieved successfully content: application/json: schema: @@ -12085,124 +12417,31 @@ paths: example: 200 message: type: string - example: alert activity retrieved successfully + example: "user profile retrieved successfully" data: - type: object - properties: - events: - type: array - items: - $ref: '#/components/schemas/AlertActivityEntry' - examples: - SilenceCreatedThenRemoved: - summary: A silence was created and later removed - description: | - Events are most-recent first. Both rows share the same - `silence_id` because they describe the same silence's - lifecycle. `actor_user_id` is the logto_id of the operator - who performed the action; `details` carries the silence - metadata captured at action time (comment, end_at, etc.). 
- value: - code: 200 - message: alert activity retrieved successfully - data: - events: - - id: 5 - organization_id: "m4m3mdjdiizs" - fingerprint: "0a9d04bb6eed523f" - action: "unsilenced" - actor_user_id: "c5gpnoo2do48" - actor_name: "R1C1 Admin" - silence_id: "d9f91c6e-1b33-484e-befa-bfb41020e178" - details: {} - created_at: "2026-05-12T08:20:38.410596Z" - - id: 4 - organization_id: "m4m3mdjdiizs" - fingerprint: "0a9d04bb6eed523f" - action: "silenced" - actor_user_id: "c5gpnoo2do48" - actor_name: "R1C1 Admin" - silence_id: "d9f91c6e-1b33-484e-befa-bfb41020e178" - details: - comment: "silenced during maintenance window" - end_at: "2026-05-12T09:16:36Z" - duration_minutes: 0 - created_at: "2026-05-12T08:16:36.661832Z" - EmptyTimeline: - summary: No silence events yet - description: | - The alert has fired but has never been silenced. The events - array is empty (not `null`). - value: - code: 200 - message: alert activity retrieved successfully - data: - events: [] - '400': - $ref: '#/components/responses/BadRequest' + $ref: '#/components/schemas/UserProfile' '401': $ref: '#/components/responses/Unauthorized' - '403': - $ref: '#/components/responses/Forbidden' # =========================================== - # ALERTING ENDPOINTS (Backend - Cross-system silences) + # COLLECT SERVICE ENDPOINTS # =========================================== - /alerts/silences: + /systems/info: get: - operationId: getAlertSilences + operationId: getSystemInfo + servers: + - url: https://collect.your-domain.com/api + description: Collect API server (port 8081) tags: - - Backend - Alerts - summary: List active+pending silences across the caller's hierarchy - description: | - Cross-system parallel of `GET /systems/{id}/alerts/silences`. Returns - every active or pending Alertmanager silence in the caller's scope, - enriched with `organization_id` (the tenant that owns the silence) and - `system_key` (extracted from the silence matchers). 
Expired silences - and silences without a `system_key` matcher are excluded — only - silences our UI ever creates are addressable. - - Scope follows the same three modes as `/alerts/totals`: - - `organization_id` omitted → caller's full hierarchy (cross-tenant fan-out). - - `organization_id=X` → single tenant `X`. - - `organization_id=X&include=descendants` → `X` plus its sub-tree. - - Requires `read:systems` permission. + - Collect - Systems + summary: /systems/info - Get system info + description: Returns information about the authenticated system, including suspension and organization status. security: - - BearerAuth: [] - parameters: - - name: organization_id - in: query - description: | - Target organization ID(s). Repeat the param for multiple values. - Optional for all roles except Customer (where it is ignored). - schema: - type: array - items: - type: string - style: form - explode: true - - name: include - in: query - description: Set to `descendants` together with `organization_id` to expand each value to its sub-tree. - schema: - type: string - enum: [descendants] - - name: system_key - in: query - description: | - Filter silences by one or more system keys (exact match on the - `system_key` matcher). Repeat the param for multiple values. - schema: - type: array - items: - type: string - style: form - explode: true + - BasicAuth: [system_key:system_secret] responses: '200': - description: Paginated list of system-scoped silences + description: System info retrieved successfully content: application/json: schema: @@ -12213,136 +12452,154 @@ paths: example: 200 message: type: string - example: silences retrieved successfully + example: "system info retrieved successfully" data: type: object properties: - silences: - type: array - items: - allOf: - - $ref: '#/components/schemas/AlertmanagerSilence' - - type: object - properties: - organization_id: - type: string - description: Tenant that owns the silence (Mimir stores silences per-tenant). 
- example: "m4m3mdjdiizs" - system_key: - type: string - description: System key extracted from the silence matchers. - example: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - warnings: - type: array - description: | - Per-tenant fan-out errors. Always present (empty when - every tenant responded OK). Each entry is a string - `org : `. - items: - type: string - examples: - OneActive: - summary: One active silence on a customer system - value: - code: 200 - message: silences retrieved successfully - data: - silences: - - id: "d9f91c6e-1b33-484e-befa-bfb41020e178" - organization_id: "m4m3mdjdiizs" - system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - matchers: - - name: "system_key" - value: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - isRegex: false - - name: "alertname" - value: "HighCPUUsage" - isRegex: false - - name: "severity" - value: "warning" - isRegex: false - startsAt: "2026-05-12T08:16:36Z" - endsAt: "2026-05-12T09:16:36Z" - updatedAt: "2026-05-12T08:16:36Z" - createdBy: "amelia.foster" - comment: "muted during maintenance window" - status: - state: "active" - warnings: [] + system_id: + type: string + example: "abc-123" + system_key: + type: string + example: "my-system-key" + name: + type: string + example: "Milan Office Server" + type: + type: string + nullable: true + example: "ns8" + fqdn: + type: string + nullable: true + example: "server.example.com" + status: + type: string + example: "active" + suspended: + type: boolean + example: false + suspended_at: + type: string + format: date-time + nullable: true + deleted: + type: boolean + example: false + deleted_at: + type: string + format: date-time + nullable: true + registered: + type: boolean + example: true + registered_at: + type: string + format: date-time + nullable: true + example: "2025-01-15T10:00:00Z" + created_at: + type: string + format: date-time + example: "2025-01-10T08:00:00Z" + rebranding_enabled: + type: boolean + description: Whether rebranding is enabled for this 
system (directly or inherited through hierarchy) + example: false + organization: + type: object + properties: + id: + type: string + description: Database UUID of the organization + example: "4405ffd0-0aca-44ef-bae2-c8545bce94f4" + logto_id: + type: string + description: Logto organization ID + example: "akkbs6x2wo82" + name: + type: string + description: Organization name + example: "Owner" + type: + type: string + description: Organization type in the hierarchy + enum: [owner, distributor, reseller, customer] + example: "owner" + suspended: + type: boolean + example: false + suspended_at: + type: string + format: date-time + nullable: true '401': $ref: '#/components/responses/Unauthorized' - '403': - $ref: '#/components/responses/Forbidden' + '404': + description: System not found or deleted + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' '500': $ref: '#/components/responses/InternalServerError' + + /systems/inventory: post: - operationId: createAlertSilence + operationId: collectSystemInventory + servers: + - url: https://collect.your-domain.com/api + description: Collect API server (port 8081) tags: - - Backend - Alerts - summary: Mute an alert across systems - description: | - Cross-system parallel of `POST /systems/{id}/alerts/silences`. Mutes - an active alert identified by fingerprint inside a single tenant - (`?organization_id=`). The backend looks up the alert in Mimir, - extracts `system_key` from its labels, builds the matchers - server-side, and delegates to the same silence-creation path used by - the per-system endpoint — so the silence object stored in Mimir is - byte-identical regardless of which route created it. - - If `end_at` is set it takes precedence over `duration_minutes`. - - Requires `manage:systems` permission. 
+ - Collect - Systems + summary: /systems/inventory - Collect system inventory + description: System inventory collection endpoint with HTTP Basic authentication security: - - BearerAuth: [] - parameters: - - name: organization_id - in: query - required: true - description: | - Tenant that owns the alert. Mandatory for every role except - Customer (where it is ignored — they're always pinned to their - own organization). Owners can address any tenant in the system. - schema: - type: string - example: "m4m3mdjdiizs" + - BasicAuth: [system_key:system_secret] requestBody: required: true + description: Raw inventory JSON from the system (structure varies by system type) content: application/json: schema: type: object - required: - - fingerprint - properties: - fingerprint: - type: string - description: Fingerprint of the active alert to silence. - example: "0a9d04bb6eed523f" - comment: - type: string - description: Optional silence comment. Defaults to a system-generated value when empty. - example: "silenced during maintenance" - duration_minutes: - type: integer - minimum: 1 - maximum: 10080 - description: Optional duration in minutes. Defaults to 60 when omitted. Ignored when end_at is set. - example: 60 - end_at: - type: string - format: date-time - description: Optional explicit end time (RFC3339). Takes precedence over duration_minutes. 
- example: "2026-05-12T09:16:36Z" - examples: - ExplicitEndAt: - summary: Silence until a specific date/time - value: - fingerprint: "0a9d04bb6eed523f" - comment: "silenced during maintenance window" - end_at: "2026-05-12T09:16:36Z" + description: Raw inventory data sent directly from the system + additionalProperties: true + example: { + "$schema": "https://schema.nethserver.org/facts/2022-12.json", + "uuid": "659d2fbe-792f-4a0d-ae58-f278304d4f7f", + "installation": "nethserver", + "facts": { + "cluster": { + "leader_node_id": "1", + "user_domains": [], + "subscription": "community", + "ui_name": "MyNethServer 8" + }, + "nodes": { + "1": { + "cluster_leader": true, + "fqdn": "rl1.dp.nethserver.net", + "default_ipv4": "165.22.17.26", + "default_ipv6": "2a03:b0c0:3:f0:0:1:dbfe:3000", + "version": "3.17.0-dev.6", + "ui_name": "MyNodeRl1" + } + }, + "modules": [ + { + "id": "mail1", + "version": "1.7.4", + "name": "mail", + "node": "1", + "ui_name": "MyMail" + } + ] + } + } responses: - '200': - description: Alert silenced successfully + '202': + description: Inventory received and queued for processing content: application/json: schema: @@ -12353,58 +12610,37 @@ paths: example: 200 message: type: string - example: "alert silenced successfully" + example: "Inventory received and queued for processing" data: type: object properties: - silence_id: + data_size: + type: integer + description: Inventory data size in bytes + example: 10145 + message: type: string - example: "d9f91c6e-1b33-484e-befa-bfb41020e178" + description: A message from server + example: "Your inventory data has been received and will be processed shortly" + queue_status: + type: string + description: Queue status message + example: "queued" + system_key: + type: string + description: System KEY + example: "NETH-4cf3053f-d0d5-4b10-b752-ff8f7b63c2f7" + timestamp: + type: string + format: date-time + description: Timestamp of received inventory + example: "2025-07-16T15:46:51.571831+02:00" '400': $ref: 
'#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' - '403': - $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - - /alerts/silences/{silence_id}: - get: - operationId: getAlertSilence - tags: - - Backend - Alerts - summary: Read a single silence - description: | - Cross-system parallel of `GET /systems/{id}/alerts/silences/{silence_id}`. - Looks up a silence inside a single tenant (`?organization_id=`) and - returns it enriched with `organization_id` and `system_key`. Silences - without a `system_key` matcher are reported as 404 — they don't belong - to our domain. - - Requires `read:systems` permission. - security: - - BearerAuth: [] - parameters: - - name: silence_id - in: path - required: true - description: Alertmanager silence ID. - schema: - type: string - example: "d9f91c6e-1b33-484e-befa-bfb41020e178" - - name: organization_id - in: query - required: true - description: | - Tenant that owns the silence. Mandatory for every role except - Customer (where it is ignored). 
- schema: - type: string - example: "m4m3mdjdiizs" - responses: - '200': - description: Silence retrieved successfully + '413': + description: Request Entity Too Large content: application/json: schema: @@ -12412,74 +12648,69 @@ paths: properties: code: type: integer - example: 200 + example: 413 message: type: string - example: "silence retrieved successfully" + example: "Request too large" data: type: object properties: - silence: - allOf: - - $ref: '#/components/schemas/AlertmanagerSilence' - - type: object - properties: - organization_id: - type: string - example: "m4m3mdjdiizs" - system_key: - type: string - example: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - '401': - $ref: '#/components/responses/Unauthorized' - '403': - $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - put: - operationId: updateAlertSilence + max_size_bytes: + type: integer + description: Maximum allowed request size in bytes + example: 10485760 + received_bytes: + type: integer + description: Size of received request in bytes + example: 20971520 + '500': + description: Internal Server Error + content: + application/json: + schema: + type: object + properties: + code: + type: integer + example: 500 + message: + type: string + example: "Failed to process inventory" + data: + type: object + properties: + error: + type: string + example: "Processing queue unavailable" + + /systems/heartbeat: + post: + operationId: sendSystemHeartbeat + servers: + - url: https://collect.your-domain.com/api + description: Collect API server (port 8081) tags: - - Backend - Alerts - summary: Update a silence's end time / comment - description: | - Cross-system parallel of `PUT /systems/{id}/alerts/silences/{silence_id}`. - Preserves the original matchers and start time; only `end_at` and - `comment` change. Refuses to operate on silences without a - `system_key` matcher (404). Requires `manage:systems` permission. 
+ - Collect - Systems + summary: /systems/heartbeat - System heartbeat + description: System heartbeat endpoint to track system liveness (every 10 minutes) security: - - BearerAuth: [] - parameters: - - name: silence_id - in: path - required: true - schema: - type: string - example: "d9f91c6e-1b33-484e-befa-bfb41020e178" - - name: organization_id - in: query - required: true - schema: - type: string - example: "m4m3mdjdiizs" + - BasicAuth: [system_key:system_secret] requestBody: required: true content: application/json: schema: type: object - required: - - end_at properties: - comment: - type: string - example: "extended for maintenance window" - end_at: + system_key: type: string - format: date-time - example: "2026-05-12T12:16:36Z" + description: System KEY sending the heartbeat + example: "NETH-4cf3053f-d0d5-4b10-b752-ff8f7b63c2f7" + required: + - system_key responses: '200': - description: Silence updated successfully + description: Heartbeat acknowledged content: application/json: schema: @@ -12490,49 +12721,77 @@ paths: example: 200 message: type: string - example: "silence updated successfully" + example: "heartbeat acknowledged" data: type: object properties: - silence_id: + system_key: type: string - example: "f1e1c2a4-7e57-4b1a-aaa0-2b96c8b5a3aa" + description: System KEY + example: "NETH-4cf3053f-d0d5-4b10-b752-ff8f7b63c2f7" + acknowledged: + type: boolean + description: Whether heartbeat was acknowledged + example: true + last_heartbeat: + type: string + format: date-time + description: Timestamp of this heartbeat + example: "2025-07-21T10:25:00Z" '400': $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' '403': $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - delete: - operationId: deleteAlertSilence + '500': + $ref: '#/components/responses/InternalServerError' + + /systems/backups: + post: + operationId: uploadSystemBackup + servers: + - url: 
https://collect.your-domain.com/api + description: Collect API server (port 8081) tags: - - Backend - Alerts - summary: Unmute an alert + - Collect - Systems + summary: /systems/backups - Upload a configuration backup description: | - Cross-system parallel of `DELETE /systems/{id}/alerts/silences/{silence_id}`. - Removes a system-scoped silence; generic Alertmanager silences (no - `system_key` matcher) are not addressable through this endpoint and - return 404. Requires `manage:systems` permission. + Stream a GPG-encrypted configuration snapshot to the backup store. + The body is the ciphertext; SHA-256 is computed at ingest. The + appliance authenticates with HTTP Basic auth using its + `system_key:system_secret`. + + Server-side limits enforced before/during ingest: + - Per-system rate limit (default 6/min, 60/hour) — `429 Retry-After` on hit + - Per-system slot cap (default 10) — oldest pruned at retention + - Per-system size cap (default 500 MiB total) — oldest pruned + - Per-org aggregate quota (default 100 GiB) — `413` on hit + - Per-upload size cap (default 2 GiB) — `413` on hit + + Object key shape: `{org_id}/{system_key}/{uuidv7}.{ext}`. The + appliance never picks the destination prefix; it is server-derived + from the authenticated identity. 
security: - - BearerAuth: [] + - BasicAuth: [system_key:system_secret] parameters: - - name: silence_id - in: path - required: true - schema: - type: string - example: "d9f91c6e-1b33-484e-befa-bfb41020e178" - - name: organization_id - in: query - required: true + - name: X-Filename + in: header + required: false + description: User-facing filename (sanitized server-side) schema: type: string - example: "m4m3mdjdiizs" + example: "daily-backup-2026-04-12.tar.gz.gpg" + requestBody: + required: true + content: + application/octet-stream: + schema: + type: string + format: binary responses: - '200': - description: Silence disabled successfully + '201': + description: Backup stored content: application/json: schema: @@ -12540,156 +12799,39 @@ paths: properties: code: type: integer - example: 200 - message: - type: string - example: "silence disabled successfully" - '400': - $ref: '#/components/responses/BadRequest' - '401': - $ref: '#/components/responses/Unauthorized' - '403': - $ref: '#/components/responses/Forbidden' - '404': - $ref: '#/components/responses/NotFound' - - # =========================================== - # ALERTING ENDPOINTS (Backend - Configuration) - # =========================================== - - /alerts/config: - post: - operationId: configureAlerts - tags: - - Backend - Alerts - summary: Save the caller's alerting layer - description: | - Saves the CALLER's alerting configuration layer (one row per - organization in alert_config_layers). The body is an - `AlertingConfigLayer`: three channel toggles plus three recipient - lists. Each recipient carries its own `severities[]`; email - recipients additionally carry `language` and `format`. - - After save, the effective per-tenant Mimir YAML is recomputed - server-side (merge of all layers walking up to the Owner) and - pushed to every tenant in the caller's hierarchy with bounded - concurrency. 
Per-tenant push failures are returned in `warnings[]`; - the caller's layer is saved regardless of push outcome (Mimir can - be reconciled by saving again). - - Additive-only contract: descendants can ADD recipients but cannot - disable channels enabled by ancestors. The server normalises any - explicit `false` in `enabled.{email,webhook,telegram}` from - non-Owner layers to null on storage. - - Save+propagate is serialised per-organization (in-process mutex) to - prevent two concurrent saves from racing at the Mimir push step. - Body is capped at 1 MiB; oversized payloads are rejected with 413. - - Requires `manage:alerts` permission. - security: - - BearerAuth: [] - requestBody: - required: true - content: - application/json: - schema: - $ref: '#/components/schemas/AlertingConfigLayer' - examples: - OwnerGlobalBaseline: - summary: Owner — global baseline - description: | - Owner enables email + webhook globally, sets a NOC - recipient on all severities in Italian HTML, plus a SIEM - webhook on every severity. Every descendant inherits. - value: - enabled: { email: true, webhook: true, telegram: false } - email_recipients: - - address: "noc@msp.example" - severities: [] - language: "it" - format: "html" - webhook_recipients: - - name: "central-siem" - url: "https://siem.example/api/alerts" - severities: [] - telegram_recipients: [] - DescendantAddRecipient: - summary: Reseller — additively add a recipient - description: | - Reseller does NOT touch channel toggles (null = "no - opinion"); it just adds a local NOC mailbox in English - for critical+warning. Merged with Owner's recipients. 
- value: - enabled: { email: null, webhook: null, telegram: null } - email_recipients: - - address: "noc@reseller.example" - severities: ["critical", "warning"] - language: "en" - format: "html" - webhook_recipients: [] - telegram_recipients: [] - CustomerMixedFormatAndLang: - summary: Customer — mixed languages and formats per recipient - description: | - Different recipients can request different bodies. The - on-call inbox wants plain text (alerts piped into a - ticketing tool); the manager wants HTML in Italian. - value: - enabled: { email: null, webhook: null, telegram: null } - email_recipients: - - address: "oncall@customer.example" - severities: ["critical"] - language: "en" - format: "plain" - - address: "manager@customer.example" - severities: [] - language: "it" - format: "html" - webhook_recipients: [] - telegram_recipients: [] - CustomerWebhookCriticalOnly: - summary: Customer — Slack webhook only for `critical` - description: | - Customer adds a Slack webhook scoped to critical. The - rendered Alertmanager route puts this webhook only on - the critical receiver. - value: - enabled: { email: null, webhook: null, telegram: null } - email_recipients: [] - webhook_recipients: - - name: "ops-slack" - url: "https://hooks.slack.com/services/T000/B000/XXX" - severities: ["critical"] - telegram_recipients: [] - TelegramAllSeverities: - summary: Customer — Telegram channel on every severity - description: | - Single Telegram bot pushing to a channel for every - severity (severities=[]). Telegram messages are - currently always rendered in English. 
- value: - enabled: { email: null, webhook: null, telegram: true } - email_recipients: [] - webhook_recipients: [] - telegram_recipients: - - bot_token: "123456:ABC-DEF1234ghIkl" - chat_id: -1001234567890 - severities: [] - InheritPurely: - summary: Descendant — explicit "inherit everything" - description: | - Saving an empty layer is meaningful: it just records - audit metadata (who/when) without contributing - recipients or toggles. - value: - enabled: { email: null, webhook: null, telegram: null } - email_recipients: [] - webhook_recipients: [] - telegram_recipients: [] + example: 201 + message: + type: string + example: "backup stored" + data: + $ref: '#/components/schemas/BackupMetadata' + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '413': + $ref: '#/components/responses/RequestEntityTooLarge' + '429': + description: Rate limit exceeded; retry after the time indicated by `Retry-After`. + '503': + description: Backup storage or org-quota service unavailable; retry later. + get: + operationId: listSystemBackupsAppliance + servers: + - url: https://collect.your-domain.com/api + description: Collect API server (port 8081) + tags: + - Collect - Systems + summary: /systems/backups - List backups for the authenticated system + description: | + Returns metadata for every backup currently stored under the + authenticated system's prefix. Cross-tenant access is impossible + — the listing scope is server-derived from the credentials. 
+ security: + - BasicAuth: [system_key:system_secret] responses: '200': - description: Layer saved (and propagation attempted) + description: Backups listed content: application/json: schema: @@ -12698,33 +12840,122 @@ paths: code: type: integer example: 200 - message: - type: string - example: alerting configuration updated successfully data: type: object properties: - affected_tenants: - type: integer - description: Number of tenants in caller's hierarchy whose effective config was recomputed - propagated_to: - type: integer - description: Of `affected_tenants`, how many were successfully pushed to Mimir - warnings: + backups: type: array - description: | - Per-tenant push errors. Always present; empty when every push succeeded. - Each entry: `org : `. items: - type: string + $ref: '#/components/schemas/BackupMetadata' + '401': + $ref: '#/components/responses/Unauthorized' + '503': + description: Backup storage unavailable. + + /systems/backups/{id}: + get: + operationId: downloadSystemBackupAppliance + servers: + - url: https://collect.your-domain.com/api + description: Collect API server (port 8081) + tags: + - Collect - Systems + summary: /systems/backups/{id} - Download a backup + description: | + Stream the ciphertext body for a specific backup belonging to the + authenticated system. Foreign IDs return `404`; the path can never + escape the authenticated system's prefix. 
+ security: + - BasicAuth: [system_key:system_secret] + parameters: + - name: id + in: path + required: true + description: Backup ID (UUIDv7 plus extension) + schema: + type: string + example: "01934fab-bc33-7890-a1b2-c3d4e5f6a7b8.tar.gz" + responses: + '200': + description: Backup body streamed as `application/octet-stream` + content: + application/octet-stream: + schema: + type: string + format: binary '400': $ref: '#/components/responses/BadRequest' '401': $ref: '#/components/responses/Unauthorized' - '403': - $ref: '#/components/responses/Forbidden' - '413': - description: Request body exceeds the configured maximum (1 MiB). + '404': + $ref: '#/components/responses/NotFound' + '503': + description: Backup storage unavailable. + delete: + operationId: deleteSystemBackupAppliance + servers: + - url: https://collect.your-domain.com/api + description: Collect API server (port 8081) + tags: + - Collect - Systems + summary: /systems/backups/{id} - Delete a backup + description: | + Remove a specific backup belonging to the authenticated system. + Foreign IDs return `404`. 
+ security: + - BasicAuth: [system_key:system_secret] + parameters: + - name: id + in: path + required: true + schema: + type: string + responses: + '200': + description: Backup deleted + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '404': + $ref: '#/components/responses/NotFound' + + # =========================================================================== + # VALIDATORS + # =========================================================================== + /validators/vat/{entity_type}: + get: + operationId: validateVAT + tags: + - Backend - Validators + summary: /validators/vat/{entity_type} - Validate VAT number + description: Check if a VAT number exists in the specified entity type (distributors, resellers, customers) + parameters: + - name: entity_type + in: path + required: true + description: Type of entity to check VAT against + schema: + type: string + enum: [distributors, resellers, customers] + example: "customers" + - name: vat + in: query + required: true + description: VAT number to validate + schema: + type: string + example: "12345678901" + - name: exclude_id + in: query + required: false + description: Entity ID to exclude from check (useful for updates) + schema: + type: string + example: "cust_123456789" + responses: + '200': + description: VAT validation completed successfully content: application/json: schema: @@ -12732,30 +12963,35 @@ paths: properties: code: type: integer - example: 413 + example: 200 message: type: string - example: request body exceeds the configured maximum + example: "VAT validation completed" + data: + $ref: '#/components/schemas/VATValidationResponse' + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' '500': $ref: '#/components/responses/InternalServerError' - delete: - operationId: disableAlerts - tags: - - Backend - Alerts - summary: Remove the caller's alerting layer - description: | - Removes the 
CALLER's layer from alert_config_layers. The effective - config of all descendant tenants is recomputed without the caller's - contribution and re-pushed to Mimir; ancestor layers are preserved. - To completely silence a tenant, every layer in its chain must drop - its contribution. - Requires `manage:alerts` permission. + # =========================================== + # REBRANDING ENDPOINTS (Backend) + # =========================================== + + /rebranding/products: + get: + operationId: getRebrandingProducts + tags: + - Backend - Rebranding + summary: List rebrandable products + description: Returns all products that support rebranding security: - BearerAuth: [] responses: '200': - description: Layer removed (and propagation attempted) + description: Rebrandable products retrieved content: application/json: schema: @@ -12766,48 +13002,80 @@ paths: example: 200 message: type: string - example: alerting layer removed successfully data: type: object properties: - affected_tenants: - type: integer - propagated_to: - type: integer - warnings: + products: type: array items: - type: string - '400': - $ref: '#/components/responses/BadRequest' + $ref: '#/components/schemas/RebrandableProduct' '401': $ref: '#/components/responses/Unauthorized' + + /rebranding/{org_id}/enable: + patch: + operationId: enableRebranding + tags: + - Backend - Rebranding + summary: Enable rebranding for an organization + description: Owner-only. Enables rebranding capability for a specific organization. 
+ security: + - BearerAuth: [] + parameters: + - name: org_id + in: path + required: true + schema: + type: string + description: Logto organization ID (logto_id, not the database UUID) + responses: + '200': + description: Rebranding enabled + '400': + $ref: '#/components/responses/BadRequest' '403': $ref: '#/components/responses/Forbidden' - '500': - $ref: '#/components/responses/InternalServerError' - get: - operationId: getAlertingConfig - tags: - - Backend - Alerts - summary: Get the caller's alerting layer - description: | - Returns the CALLER's own alerting configuration layer. No inherited - ancestor layers, no merged effective view: every organization sees - only its own configuration. The server-side merge that backs the - Mimir YAML stays inside the backend. - When the caller has never saved a layer the response body contains - an empty layer (toggles all null, recipient lists empty) and the - two audit fields set to null. The UI uses that state to render a - first-save form. + /rebranding/{org_id}/disable: + patch: + operationId: disableRebranding + tags: + - Backend - Rebranding + summary: Disable rebranding for an organization + description: Owner-only. Disables rebranding for a specific organization. + security: + - BearerAuth: [] + parameters: + - name: org_id + in: path + required: true + schema: + type: string + description: Logto organization ID (logto_id, not the database UUID) + responses: + '200': + description: Rebranding disabled + '403': + $ref: '#/components/responses/Forbidden' - Requires `read:alerts` permission. 
+ /rebranding/{org_id}/status: + get: + operationId: getRebrandingStatus + tags: + - Backend - Rebranding + summary: Get rebranding status for an organization security: - BearerAuth: [] + parameters: + - name: org_id + in: path + required: true + schema: + type: string + description: Logto organization ID (logto_id, not the database UUID) responses: '200': - description: Caller's layer + description: Rebranding status retrieved content: application/json: schema: @@ -12815,95 +13083,31 @@ paths: properties: code: type: integer - example: 200 message: type: string - example: alerting layer retrieved successfully data: - allOf: - - $ref: '#/components/schemas/AlertingConfigLayer' - - type: object - properties: - updated_by_name: - type: string - nullable: true - updated_at: - type: string - format: date-time - nullable: true - examples: - Configured: - summary: Caller has saved a layer - value: - code: 200 - message: "alerting layer retrieved successfully" - data: - enabled: { email: true, webhook: null, telegram: null } - email_recipients: - - address: "noc@reseller.example" - severities: ["critical"] - language: "it" - format: "html" - webhook_recipients: [] - telegram_recipients: [] - updated_by_name: "Reseller Admin" - updated_at: "2026-05-09T10:14:00Z" - FirstTime: - summary: Caller has not saved a layer yet - value: - code: 200 - message: "alerting layer retrieved successfully" - data: - enabled: { email: null, webhook: null, telegram: null } - email_recipients: [] - webhook_recipients: [] - telegram_recipients: [] - updated_by_name: null - updated_at: null - '401': - $ref: '#/components/responses/Unauthorized' + $ref: '#/components/schemas/RebrandingOrgStatus' '403': $ref: '#/components/responses/Forbidden' - /alerts/config/effective: + /rebranding/{org_id}/products: get: - operationId: getEffectiveAlertingConfig + operationId: getRebrandingOrgProducts tags: - - Backend - Alerts - summary: Inspect a tenant's effective (merged) alerting config - description: | - 
Privileged troubleshooting view. Returns the configuration a tenant - ACTUALLY receives: the per-layer contribution of every organization - in its ancestor chain (Owner → tenant), the merged effective layer, - and the rendered Alertmanager YAML pushed to Mimir for that tenant. - - Unlike `GET /alerts/config` (which returns only the caller's own - layer), this exposes the full inherited + merged view, so it is - gated by the dedicated `config:alerts` permission. That permission - lives solely on the `super` user role, which is owner-assignable - only — so in practice only an Owner-org Super Admin can reach this. - It is not reachable by Distributor/Reseller admins. - - Secrets are redacted in the response: telegram `bot_token` and - webhook URL path/query in every layer and in the effective layer, - and SMTP credentials / bearer / bot tokens in the rendered YAML. - - `organization_id` is required and may target ANY tenant. A - nonexistent id returns an empty effective config (no error) — the - honest answer for a diagnostic tool. Read-only: no Mimir push, no - DB writes. + - Backend - Rebranding + summary: Get rebranding products for an organization security: - BearerAuth: [] parameters: - - name: organization_id - in: query + - name: org_id + in: path required: true schema: type: string - description: Logto organization id of the tenant to inspect. + description: Logto organization ID (logto_id, not the database UUID) responses: '200': - description: Effective configuration report + description: Rebranding products retrieved content: application/json: schema: @@ -12911,175 +13115,182 @@ paths: properties: code: type: integer - example: 200 message: type: string - example: effective alerting configuration retrieved successfully data: - type: object - properties: - organization_id: - type: string - chain: - type: array - description: | - Contributing layers ordered Owner first → tenant - last. 
Orgs with no saved layer are listed with - `has_layer: false` and an empty layer. - items: - type: object - properties: - organization_id: - type: string - organization_name: - type: string - organization_role: - type: string - enum: [owner, distributor, reseller, customer] - has_layer: - type: boolean - layer: - $ref: '#/components/schemas/AlertingConfigLayer' - updated_by_name: - type: string - nullable: true - updated_at: - type: string - format: date-time - nullable: true - effective: - $ref: '#/components/schemas/AlertingConfigLayer' - yaml: - type: string - description: Rendered Alertmanager YAML (secrets redacted). - '400': - $ref: '#/components/responses/BadRequest' - '401': - $ref: '#/components/responses/Unauthorized' + $ref: '#/components/schemas/RebrandingOrgStatus' '403': $ref: '#/components/responses/Forbidden' - '422': - $ref: '#/components/responses/UnprocessableEntity' - /alerts: - get: - operationId: getAlerts + /rebranding/{org_id}/products/{product_id}: + put: + operationId: uploadRebrandingAssets tags: - - Backend - Alerts - summary: List active alerts + - Backend - Rebranding + summary: Upload rebranding assets for a product description: | - Retrieves active alerts from Mimir for the caller's scope, paginated. Each - alert is enriched with a `system` object (`name`, `type`) looked up from the - local `systems` table, so the UI can render the system column without an - extra round-trip per row. - - Sortable by `starts_at` (default desc), `severity` (criticality rank: critical - > warning > info), `alertname`, or `status` (Alertmanager state). `fingerprint` - is used as a stable tiebreaker so pagination doesn't shift between requests. - - Scope follows the same three modes as `/alerts/totals`: - - `organization_id` omitted → caller's full hierarchy (cross-tenant fan-out). - - `organization_id=X` → single tenant `X`. - - `organization_id=X&include=descendants` → `X` plus its sub-tree. 
- - All filter params support **multiple values** (repeat the param): values within - the same filter are matched as OR; different filters AND together. Example: - `?severity=critical&severity=warning&alertname=CVE-2024-1234` returns - CVE-2024-1234 alerts that are critical or warning. - - Per-tenant failures during fan-out (timeout, 5xx) are non-fatal: the rest of the - result is returned and the failure is reported in the `warnings` array. + Multipart upload of logos, favicon, background, and product name. + All fields optional. Only provided fields are updated. + Requires rebranding to be enabled for the organization. security: - BearerAuth: [] parameters: - - name: organization_id - in: query - description: | - Target organization ID(s). Repeat the param to pass multiple values. - Optional for all roles except Customer (where it is ignored). + - name: org_id + in: path + required: true schema: - type: array - items: - type: string - style: form - explode: true - - name: include - in: query - description: Set to `descendants` together with `organization_id` to expand each value to its sub-tree. + type: string + description: Logto organization ID (logto_id, not the database UUID) + - name: product_id + in: path + required: true schema: type: string - enum: [descendants] - - name: page - in: query - description: 1-based page number. 
+ requestBody: + content: + multipart/form-data: + schema: + type: object + properties: + product_name: + type: string + maxLength: 100 + logo_light_rect: + type: string + format: binary + logo_dark_rect: + type: string + format: binary + logo_light_square: + type: string + format: binary + logo_dark_square: + type: string + format: binary + favicon: + type: string + format: binary + background_image: + type: string + format: binary + responses: + '200': + description: Assets uploaded + '400': + $ref: '#/components/responses/BadRequest' + '403': + $ref: '#/components/responses/Forbidden' + delete: + operationId: deleteRebrandingProduct + tags: + - Backend - Rebranding + summary: Delete all rebranding assets for a product + security: + - BearerAuth: [] + parameters: + - name: org_id + in: path + required: true schema: - type: integer - minimum: 1 - default: 1 - - name: page_size - in: query - description: Page size. Default 50, max 100. + type: string + description: Logto organization ID (logto_id, not the database UUID) + - name: product_id + in: path + required: true schema: - type: integer - minimum: 1 - maximum: 100 - default: 50 - - name: sort_by - in: query - description: Sort column (allowlist). + type: string + responses: + '200': + description: Product rebranding deleted + '403': + $ref: '#/components/responses/Forbidden' + + /rebranding/{org_id}/products/{product_id}/{asset}: + get: + operationId: getRebrandingAsset + tags: + - Backend - Rebranding + summary: Get a rebranding asset binary + security: + - BearerAuth: [] + parameters: + - name: org_id + in: path + required: true schema: type: string - enum: [starts_at, severity, alertname, status] - default: starts_at - - name: sort_direction - in: query + description: Logto organization ID (logto_id, not the database UUID) + - name: product_id + in: path + required: true schema: type: string - enum: [asc, desc] - default: desc - - name: status - in: query - description: Filter alerts by Alertmanager state. 
Supports multiple values. + - name: asset + in: path + required: true schema: - type: array - items: - type: string - enum: [active, suppressed, unprocessed] - style: form - explode: true - - name: severity - in: query - description: Filter alerts by severity label. Supports multiple values. + type: string + enum: [logo_light_rect, logo_dark_rect, logo_light_square, logo_dark_square, favicon, background_image] + responses: + '200': + description: Asset binary + content: + image/*: + schema: + type: string + format: binary + '404': + $ref: '#/components/responses/NotFound' + delete: + operationId: deleteRebrandingAsset + tags: + - Backend - Rebranding + summary: Delete a single rebranding asset + security: + - BearerAuth: [] + parameters: + - name: org_id + in: path + required: true schema: - type: array - items: - type: string - enum: [critical, warning, info] - style: form - explode: true - - name: system_key - in: query - description: Filter alerts by system_key label. Supports multiple values. + type: string + description: Logto organization ID (logto_id, not the database UUID) + - name: product_id + in: path + required: true schema: - type: array - items: - type: string - style: form - explode: true - - name: alertname - in: query - description: | - Filter alerts by alertname label (the alert "type" — e.g. `HighCPU`, - `DiskFull`, `CVE-2024-1234`). Supports multiple values. 
+ type: string + - name: asset + in: path + required: true schema: - type: array - items: - type: string - style: form - explode: true + type: string + enum: [logo_light_rect, logo_dark_rect, logo_light_square, logo_dark_square, favicon, background_image] responses: '200': - description: Paginated list of active alerts + description: Asset deleted + '404': + $ref: '#/components/responses/NotFound' + + # =========================================== + # REBRANDING ENDPOINTS (Collect) + # =========================================== + + /systems/rebranding: + get: + operationId: getSystemRebranding + tags: + - Collect - Rebranding + summary: Get system rebranding configuration + description: | + Returns rebranding configuration for the authenticated system. + Resolves hierarchy inheritance (customer -> reseller -> distributor). + Response groups products by type (system vs application). + security: + - BasicAuth: [] + responses: + '200': + description: System rebranding configuration content: application/json: schema: @@ -13087,97 +13298,47 @@ paths: properties: code: type: integer - example: 200 message: type: string - example: alerts retrieved successfully data: - type: object - properties: - alerts: - type: array - items: - $ref: '#/components/schemas/ActiveAlert' - pagination: - $ref: '#/components/schemas/Pagination' - warnings: - type: array - description: | - Per-tenant errors encountered during fan-out. Always present - (empty array when every tenant responded OK). Each entry is - a string `org : `. - items: - type: string - examples: - ActiveAlertExample: - summary: One active warning across the caller's hierarchy - description: | - A single warning alert. System identity (id, key, name, type) is - carried as labels, stamped at ingest time. `state="active"` means - Mimir has not been told to silence it; an actively-muted alert - would have `state="suppressed"` and a non-empty `silencedBy`. 
- value: - code: 200 - message: alerts retrieved successfully - data: - alerts: - - fingerprint: "0a9d04bb6eed523f" - labels: - alertname: "DiskFilling" - severity: "warning" - system_id: "e4eb4844-46f6-448c-8279-7cfedf5e1037" - system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" - system_name: "test-sys" - system_type: "ns8" - annotations: - summary: "/var is 92% full on test-sys" - description: "Disk usage exceeded the warning threshold." - status: - state: "active" - silencedBy: [] - inhibitedBy: [] - startsAt: "2026-05-12T08:14:00Z" - endsAt: "2026-05-12T08:44:00Z" - pagination: - page: 1 - page_size: 50 - total_count: 1 - total_pages: 1 - has_next: false - has_prev: false - warnings: [] - PartialFanoutWarning: - summary: One tenant timed out during fan-out - description: | - When `organization_id` is omitted (or `include=descendants` is - used), the request fans out to every tenant in scope. A single - slow Mimir does not fail the whole request — the rest of the - results are returned and the failing tenant lands in `warnings`. 
- value: - code: 200 - message: alerts retrieved successfully - data: - alerts: [] - pagination: - page: 1 - page_size: 50 - total_count: 0 - total_pages: 0 - has_next: false - has_prev: false - warnings: - - "org pt8gqs6y5wpr: context deadline exceeded" - '400': - $ref: '#/components/responses/BadRequest' + $ref: '#/components/schemas/SystemRebrandingResponse' '401': $ref: '#/components/responses/Unauthorized' - '403': - $ref: '#/components/responses/Forbidden' - '500': - $ref: '#/components/responses/InternalServerError' + + /systems/rebranding/{product_id}/{asset}: + get: + operationId: getSystemRebrandingAsset + tags: + - Collect - Rebranding + summary: Get a rebranding asset binary for the system + description: Serves asset binary with hierarchy resolution + security: + - BasicAuth: [] + parameters: + - name: product_id + in: path + required: true + schema: + type: string + - name: asset + in: path + required: true + schema: + type: string + enum: [logo_light_rect, logo_dark_rect, logo_light_square, logo_dark_square, favicon, background_image] + responses: + '200': + description: Asset binary + content: + image/*: + schema: + type: string + format: binary + '404': + $ref: '#/components/responses/NotFound' # =========================================== - # ALERTING ENDPOINTS (Collect - Mimir Proxy) + # ALERTING ENDPOINTS (Backend - Totals & Trend) # =========================================== /services/mimir/alertmanager/api/v2/alerts: From feea17d01b407264a2fda435e8689d64abf178d7 Mon Sep 17 00:00:00 2001 From: Edoardo Spadoni Date: Mon, 18 May 2026 16:09:06 +0200 Subject: [PATCH 10/10] docs(alerts): full examples for cross-system silence endpoints Bring /alerts/silences* to parity with /systems/{id}/alerts/silences*: add DurationBased + Created (POST), ActiveSilence (GET), ExtendEndTime + Updated (PUT), Disabled (DELETE). Reuses shared identifiers across the family so the mute->get->update->delete flow stays coherent. 
--- backend/openapi.yaml | 93 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/backend/openapi.yaml b/backend/openapi.yaml index 94283ecc..d2a4c836 100644 --- a/backend/openapi.yaml +++ b/backend/openapi.yaml @@ -10364,10 +10364,25 @@ paths: examples: ExplicitEndAt: summary: Silence until a specific date/time + description: | + When `end_at` is set, the silence expires at that moment + regardless of `duration_minutes`. The backend resolves the + alert by `fingerprint` inside the tenant given by + `?organization_id=`, extracts `system_key` from its labels, + and creates the silence. value: fingerprint: "0a9d04bb6eed523f" comment: "silenced during maintenance window" end_at: "2026-05-12T09:16:36Z" + DurationBased: + summary: Silence for the next 60 minutes + description: | + Without `end_at`, `duration_minutes` applies. If both are + omitted, the silence defaults to 60 minutes from creation. + value: + fingerprint: "0a9d04bb6eed523f" + comment: "investigating" + duration_minutes: 60 responses: '200': description: Alert silenced successfully @@ -10388,6 +10403,19 @@ paths: silence_id: type: string example: "d9f91c6e-1b33-484e-befa-bfb41020e178" + examples: + Created: + summary: Silence created + description: | + `silence_id` is the Alertmanager-assigned UUID; use it to + look up, update, or delete the silence later. The + corresponding `silenced` event is appended to the alert's + activity timeline (`GET /alerts/activity/{fingerprint}`). 
+ value: + code: 200 + message: "alert silenced successfully" + data: + silence_id: "d9f91c6e-1b33-484e-befa-bfb41020e178" '400': $ref: '#/components/responses/BadRequest' '401': @@ -10458,6 +10486,38 @@ paths: system_key: type: string example: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + examples: + ActiveSilence: + summary: Silence found and active + description: | + Same `AlertmanagerSilence` shape as the per-system + endpoint, plus `organization_id` (the tenant that owns the + silence) and `system_key` (extracted from the matchers). + value: + code: 200 + message: "silence retrieved successfully" + data: + silence: + id: "d9f91c6e-1b33-484e-befa-bfb41020e178" + organization_id: "m4m3mdjdiizs" + system_key: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + matchers: + - name: "system_key" + value: "NETH-D417-A2C2-7810-43D2-984B-2164-34C1-B22E" + isRegex: false + - name: "alertname" + value: "HighCPUUsage" + isRegex: false + - name: "severity" + value: "warning" + isRegex: false + startsAt: "2026-05-12T08:16:36Z" + endsAt: "2026-05-12T09:16:36Z" + updatedAt: "2026-05-12T08:16:36Z" + createdBy: "amelia.foster" + comment: "silenced during maintenance window" + status: + state: "active" '401': $ref: '#/components/responses/Unauthorized' '403': @@ -10505,6 +10565,17 @@ paths: type: string format: date-time example: "2026-05-12T12:16:36Z" + examples: + ExtendEndTime: + summary: Extend the silence by 3 more hours + description: | + Alertmanager treats an update as "create a new silence with + the same matchers and start_at, then drop the old one", so + the response carries a new `silence_id`. The activity + timeline records this as a `silence_updated` event. 
+ value: + comment: "extended for maintenance window" + end_at: "2026-05-12T12:16:36Z" responses: '200': description: Silence updated successfully @@ -10525,6 +10596,18 @@ paths: silence_id: type: string example: "f1e1c2a4-7e57-4b1a-aaa0-2b96c8b5a3aa" + examples: + Updated: + summary: New silence id after update + description: | + The `silence_id` differs from the one passed in the path: + Alertmanager replaced the silence under the hood while + preserving its matchers and start time. + value: + code: 200 + message: "silence updated successfully" + data: + silence_id: "f1e1c2a4-7e57-4b1a-aaa0-2b96c8b5a3aa" '400': $ref: '#/components/responses/BadRequest' '401': @@ -10572,6 +10655,16 @@ paths: message: type: string example: "silence disabled successfully" + examples: + Disabled: + summary: Silence removed + description: | + The silence is deleted from Alertmanager but stays + referenced in the alert's activity timeline as an + `unsilenced` event (`GET /alerts/activity/{fingerprint}`). + value: + code: 200 + message: "silence disabled successfully" '400': $ref: '#/components/responses/BadRequest' '401':