subshell · tom-schoener · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/charts/sophora-cluster-common/Chart.yaml b/charts/sophora-cluster-common/Chart.yaml
@@ -2,11 +2,15 @@ apiVersion: v2
 name: sophora-cluster-common
 description: A Helm chart containing some common resources useful for Sophora cloud setups
 type: application
-version: 1.4.4
+version: 1.5.0
 annotations:
   artifacthub.io/changes: |
+    - kind: added
+      description: "added prometheus alert rule configuration options"
+    - kind: added
+      description: "added prometheus alert rule SophoraStagingServerNotInSync"
     - kind: changed
-      description: "updated comment format in values.yaml for properties"
+      description: "The prometheus alert rule SophoraReplicaServerNotInSync no longer includes staging servers"
 
 appVersion: "4"
 sources:

diff --git a/charts/sophora-cluster-common/alerting-runbook.md b/charts/sophora-cluster-common/alerting-runbook.md
@@ -22,18 +22,17 @@ replication will happen to other running servers, if there are any.
 * Try to restart the server, if it is running but unresponsive
 * Restore the server from a working backup
 
-### SophoraServerNotInSync
+### SophoraReplicaServerNotInSync
 
 **Severity:** high
 
-**Summary:** The Sophora server is not in sync. This is concluded from comparing the server's *SourceTime* with the
-SourceTime of the primary server. The SourceTime is the timestamp of the latest event that occured on the primary
-server.
-Usually the SourceTimes of the servers should not diverge too much and stay equal when compared over a short time frame.
+**Summary:** A Sophora replica server is not in sync. This alert applies only to replica servers, not to staging servers. It is triggered by comparing the 
+replica server's *SourceTime* with the SourceTime of the primary server. The SourceTime is the timestamp of the latest event that occurred on the primary server.
+Usually, the SourceTimes of replica servers should not diverge too much from the primary server and should stay equal when compared over a short time frame.
 
 **Remediation steps:**
 
-* Check if the primary server logged a message containing "ReplicationMaster stopped" or "StagingMaster stopped". If
+* Check if the primary server logged a message containing "ReplicationMaster stopped". If
   yes: The primary server needs to be
   restarted. If "ReplicationMaster stopped" is logged, this needs to happen **without electing another server to the
   primary**. The last part is absolutely critical to preventing data loss. Depending on the version of the Server Helm
@@ -56,6 +55,22 @@ Usually the SourceTimes of the servers should not diverge too much and stay equa
 * Check the server's and the primary server's logs for errors or warnings
 * Restart the server
 
+### SophoraStagingServerNotInSync
+
+**Severity:** high
+
+**Summary:** A Sophora staging server is not in sync. This alert applies only to staging servers, not to replica servers.
+It is triggered when the server's state is `SYNCHRONIZATION_DELAYED` or `QUEUE_TOO_LONG`.
+
+**Remediation steps:**
+
+* Check if the primary server logged a message containing "StagingMaster stopped". If yes: The primary server needs to be restarted.
+* Check if there is a large replication queue or a large amount of activity on the server, which could result in a short delay
+* Check whether the not-in-sync staging server is in an erroneous state and stopped receiving staging messages
+* Check whether network connection issues between the staging servers exist
+* Check the staging servers' logs for errors or warnings
+* Restart the affected staging server
+
 ### MultiplePrimarySophoraServers
 
 **Severity:** critical

diff --git a/charts/sophora-cluster-common/templates/alerts/prometheusrule.yaml b/charts/sophora-cluster-common/templates/alerts/prometheusrule.yaml
@@ -11,7 +11,7 @@ spec:
       rules:
         {{- if $defaultRulesEnabled }}
         - alert: NoPrimarySophoraServer
-          for: 1m
+          for: {{ .Values.prometheusRules.config.NoPrimarySophoraServer.for }}
           expr: 'absent(sophora_server_replication_mode{namespace="{{ .Release.Namespace }}"} == 1)'
           labels:
             severity: critical
@@ -20,18 +20,28 @@ spec:
             summary: The Sophora Cluster has no primary.
             description: No primary server elected in the cluster.
             runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-cluster-common/alerting-runbook.md'
-        - alert: SophoraServerNotInSync
-          for: 2m
-          expr: 'max(sophora_server_source_time and sophora_server_is_primary_server{namespace="{{ .Release.Namespace }}"} == 1) - ignoring(pod) group_right max by (pod) (sophora_server_source_time and sophora_server_state{namespace="{{ .Release.Namespace }}"} == 2) > 60000'
+        - alert: SophoraReplicaServerNotInSync
+          for: {{ .Values.prometheusRules.config.SophoraReplicaServerNotInSync.for }}
+          expr: 'max(sophora_server_source_time{namespace="{{ .Release.Namespace }}"} and sophora_server_is_primary_server{namespace="{{ .Release.Namespace }}"} == 1) - ignoring(pod) group_right max by (pod) (sophora_server_source_time{namespace="{{ .Release.Namespace }}"} and sophora_server_state{namespace="{{ .Release.Namespace }}"} == 2 and sophora_server_replication_mode{namespace="{{ .Release.Namespace }}"} == 2) > 60000'
           labels:
             severity: high
             namespace: "{{ .Release.Namespace }}"
           annotations:
-            summary: Server is not in sync
+            summary: Replica is not in sync
+            description: The server  "{{`{{ $labels.pod }}`}}" is not in sync.
+            runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-cluster-common/alerting-runbook.md'
+        - alert: SophoraStagingServerNotInSync
+          for: {{ .Values.prometheusRules.config.SophoraStagingServerNotInSync.for }}
+          expr: 'sophora_server_source_time{namespace="{{ .Release.Namespace }}"} and on(pod) (sophora_server_state{namespace="{{ .Release.Namespace }}"} == 3 or sophora_server_state{namespace="{{ .Release.Namespace }}"} == 4) and on(pod) sophora_server_replication_mode{namespace="{{ .Release.Namespace }}"} == 3'
+          labels:
+            severity: high
+            namespace: "{{ .Release.Namespace }}"
+          annotations:
+            summary: Staging is not in sync
             description: The server  "{{`{{ $labels.pod }}`}}" is not in sync.
             runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-cluster-common/alerting-runbook.md'
         - alert: MultiplePrimarySophoraServers
-          for: 1m
+          for: {{ .Values.prometheusRules.config.MultiplePrimarySophoraServers.for }}
           expr: 'count(sophora_server_replication_mode{namespace="{{ .Release.Namespace }}"} == 1) > 1'
           labels:
             severity: critical

diff --git a/charts/sophora-cluster-common/values.yaml b/charts/sophora-cluster-common/values.yaml
@@ -91,6 +91,15 @@ prometheusRules:
   enabled: false
   # prometheusRules.defaultRulesEnabled Whether the default rules should be installed
   defaultRulesEnabled: true
+  config:
+    NoPrimarySophoraServer:
+      for: 1m
+    SophoraReplicaServerNotInSync:
+      for: 2m
+    SophoraStagingServerNotInSync:
+      for: 10m
+    MultiplePrimarySophoraServers:
+      for: 1m
   # prometheusRules.rules allows to add custom rules
   rules: []