diff --git a/charts/sophora-cluster-common/Chart.yaml b/charts/sophora-cluster-common/Chart.yaml index 60ed408f..3594a65f 100644 --- a/charts/sophora-cluster-common/Chart.yaml +++ b/charts/sophora-cluster-common/Chart.yaml @@ -2,11 +2,15 @@ apiVersion: v2 name: sophora-cluster-common description: A Helm chart containing some common resources useful for Sophora cloud setups type: application -version: 1.4.4 +version: 1.5.0 annotations: artifacthub.io/changes: | + - kind: added + description: "added prometheus alert rule configuration options" + - kind: added + description: "added prometheus alert rule SophoraStagingServerNotInSync" - kind: changed - description: "updated comment format in values.yaml for properties" + description: "The prometheus alert rule SophoraReplicaServerNotInSync no longer includes staging servers" appVersion: "4" sources: diff --git a/charts/sophora-cluster-common/alerting-runbook.md b/charts/sophora-cluster-common/alerting-runbook.md index b30ad491..c6d962f8 100644 --- a/charts/sophora-cluster-common/alerting-runbook.md +++ b/charts/sophora-cluster-common/alerting-runbook.md @@ -22,18 +22,17 @@ replication will happen to other running servers, if there are any. * Try to restart the server, if it is running but unresponsive * Restore the server from a working backup -### SophoraServerNotInSync +### SophoraReplicaServerNotInSync **Severity:** high -**Summary:** The Sophora server is not in sync. This is concluded from comparing the server's *SourceTime* with the -SourceTime of the primary server. The SourceTime is the timestamp of the latest event that occured on the primary -server. -Usually the SourceTimes of the servers should not diverge too much and stay equal when compared over a short time frame. +**Summary:** A Sophora replica server is not in sync. This alert applies only to replica servers, not to staging servers. It is triggered by comparing the +replica server's *SourceTime* with the SourceTime of the primary server. 
The SourceTime is the timestamp of the latest event that occurred on the primary server. +Usually, the SourceTimes of replica servers should not diverge too much from the primary server and should stay equal when compared over a short time frame. **Remediation steps:** -* Check if the primary server logged a message containing "ReplicationMaster stopped" or "StagingMaster stopped". If +* Check if the primary server logged a message containing "ReplicationMaster stopped". If yes: The primary server needs to be restarted. If "ReplicationMaster stopped" is logged, this needs to happen **without electing another server to the primary**. The last part is absolutely critical to preventing data loss. Depending on the version of the Server Helm @@ -56,6 +55,22 @@ Usually the SourceTimes of the servers should not diverge too much and stay equa * Check the server's and the primary server's logs for errors or warnings * Restart the server +### SophoraStagingServerNotInSync + +**Severity:** high + +**Summary:** A Sophora staging server is not in sync. This alert applies only to staging servers, not to replica servers. +It is triggered when the server's state is `SYNCHRONIZATION_DELAYED` or `QUEUE_TOO_LONG`. + +**Remediation steps:** + +* Check if the primary server logged a message containing "StagingMaster stopped". If yes: The primary server needs to be restarted. 
+* Check if there is a large replication queue or a large amount of activity on the server, which could result in a short delay +* Check whether the not-in-sync staging server is in an erroneous state and stopped receiving staging messages +* Check whether network connection issues between the staging servers exist +* Check the staging servers' logs for errors or warnings +* Restart the affected staging server + ### MultiplePrimarySophoraServers **Severity:** critical diff --git a/charts/sophora-cluster-common/templates/alerts/prometheusrule.yaml b/charts/sophora-cluster-common/templates/alerts/prometheusrule.yaml index b6728138..c332e9e8 100644 --- a/charts/sophora-cluster-common/templates/alerts/prometheusrule.yaml +++ b/charts/sophora-cluster-common/templates/alerts/prometheusrule.yaml @@ -11,7 +11,7 @@ spec: rules: {{- if $defaultRulesEnabled }} - alert: NoPrimarySophoraServer - for: 1m + for: {{ .Values.prometheusRules.config.NoPrimarySophoraServer.for }} expr: 'absent(sophora_server_replication_mode{namespace="{{ .Release.Namespace }}"} == 1)' labels: severity: critical @@ -20,18 +20,28 @@ spec: summary: The Sophora Cluster has no primary. description: No primary server elected in the cluster. 
runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-cluster-common/alerting-runbook.md' - - alert: SophoraServerNotInSync - for: 2m - expr: 'max(sophora_server_source_time and sophora_server_is_primary_server{namespace="{{ .Release.Namespace }}"} == 1) - ignoring(pod) group_right max by (pod) (sophora_server_source_time and sophora_server_state{namespace="{{ .Release.Namespace }}"} == 2) > 60000' + - alert: SophoraReplicaServerNotInSync + for: {{ .Values.prometheusRules.config.SophoraReplicaServerNotInSync.for }} + expr: 'max(sophora_server_source_time{namespace="{{ .Release.Namespace }}"} and sophora_server_is_primary_server{namespace="{{ .Release.Namespace }}"} == 1) - ignoring(pod) group_right max by (pod) (sophora_server_source_time{namespace="{{ .Release.Namespace }}"} and sophora_server_state{namespace="{{ .Release.Namespace }}"} == 2 and sophora_server_replication_mode{namespace="{{ .Release.Namespace }}"} == 2) > 60000' labels: severity: high namespace: "{{ .Release.Namespace }}" annotations: - summary: Server is not in sync + summary: Replica is not in sync + description: The server "{{`{{ $labels.pod }}`}}" is not in sync. + runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-cluster-common/alerting-runbook.md' + - alert: SophoraStagingServerNotInSync + for: {{ .Values.prometheusRules.config.SophoraStagingServerNotInSync.for }} + expr: 'sophora_server_source_time{namespace="{{ .Release.Namespace }}"} and on(pod) (sophora_server_state{namespace="{{ .Release.Namespace }}"} == 3 or sophora_server_state{namespace="{{ .Release.Namespace }}"} == 4) and on(pod) sophora_server_replication_mode{namespace="{{ .Release.Namespace }}"} == 3 > 0' + labels: + severity: high + namespace: "{{ .Release.Namespace }}" + annotations: + summary: Staging is not in sync description: The server "{{`{{ $labels.pod }}`}}" is not in sync. 
runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-cluster-common/alerting-runbook.md' - alert: MultiplePrimarySophoraServers - for: 1m + for: {{ .Values.prometheusRules.config.MultiplePrimarySophoraServers.for }} expr: 'count(sophora_server_replication_mode{namespace="{{ .Release.Namespace }}"} == 1) > 1' labels: severity: critical diff --git a/charts/sophora-cluster-common/values.yaml b/charts/sophora-cluster-common/values.yaml index f9b48952..692f2d1c 100644 --- a/charts/sophora-cluster-common/values.yaml +++ b/charts/sophora-cluster-common/values.yaml @@ -91,6 +91,15 @@ prometheusRules: enabled: false # prometheusRules.defaultRulesEnabled Whether the default rules should be installed defaultRulesEnabled: true + config: + NoPrimarySophoraServer: + for: 1m + SophoraReplicaServerNotInSync: + for: 2m + SophoraStagingServerNotInSync: + for: 10m + MultiplePrimarySophoraServers: + for: 1m # prometheusRules.rules allows to add custom rules rules: []