Skip to content

Commit bf0e0cd

Browse files
committed
[PLAT-1563] Fix instance down alerts + make sure instance restart alert is not fired on universe operations
Summary: In case master is missing from a particular node, the current alert query treats it as down. Need to create new metrics which indicate whether master is expected on a particular instance or not, and include them in the alert query. Also, need to make sure we're not raising instance restart alerts during universe operations, like upgrade. Test Plan: 1. Restart master, wait for health check, make sure instance restart alert is raised. 2. Wait for 30 minutes, make sure alert is cleared. 3. Restart tserver, wait for health check, make sure instance restart alert is raised. 4. Wait for 30 minutes, make sure alert is cleared. 5. Upgrade DB version. Make sure no unexpected alerts are raised. 6. Create universe with 3 nodes and rf = 1. Make sure no instance down alert is raised. 7. Stop one of tservers for 15 minutes. Make sure instance down alert is raised. Reviewers: spotachev Reviewed By: spotachev Subscribers: jenkins-bot, sanketh, yugaware Differential Revision: https://phabricator.dev.yugabyte.com/D12895
1 parent 7f0d1d7 commit bf0e0cd

File tree

15 files changed

+518
-222
lines changed

15 files changed

+518
-222
lines changed

managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java

Lines changed: 228 additions & 166 deletions
Large diffs are not rendered by default.

managed/src/main/java/com/yugabyte/yw/common/alerts/AlertConfigurationService.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,9 @@ public AlertConfigurationTemplate createConfigurationTemplate(
584584
.setDefaultConfiguration(configuration)
585585
.setThresholdMinValue(template.getThresholdMinValue())
586586
.setThresholdMaxValue(template.getThresholdMaxValue())
587-
.setThresholdInteger(template.getDefaultThresholdUnit().isInteger());
587+
.setThresholdInteger(template.getDefaultThresholdUnit().isInteger())
588+
.setThresholdReadOnly(template.isThresholdReadOnly())
589+
.setThresholdUnitName(template.getThresholdUnitName());
588590
}
589591

590592
private AlertDefinition createEmptyDefinition(AlertConfiguration configuration) {

managed/src/main/java/com/yugabyte/yw/common/alerts/impl/AlertConfigurationTemplate.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,10 @@ public class AlertConfigurationTemplate {
3434

3535
@ApiModelProperty(value = "Is alert threshold integer or floating point", accessMode = READ_ONLY)
3636
private boolean thresholdInteger;
37+
38+
@ApiModelProperty(value = "Is alert threshold read-only or configurable", accessMode = READ_ONLY)
39+
private boolean thresholdReadOnly;
40+
41+
@ApiModelProperty(value = "Threshold unit name", accessMode = READ_ONLY)
42+
private String thresholdUnitName;
3743
}

managed/src/main/java/com/yugabyte/yw/common/metrics/UniverseMetricProvider.java

Lines changed: 101 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
*/
1010
package com.yugabyte.yw.common.metrics;
1111

12+
import static com.yugabyte.yw.common.metrics.MetricService.STATUS_OK;
1213
import static com.yugabyte.yw.common.metrics.MetricService.buildMetricTemplate;
1314

1415
import com.google.common.collect.ImmutableList;
@@ -17,43 +18,134 @@
1718
import com.yugabyte.yw.models.Metric;
1819
import com.yugabyte.yw.models.Universe;
1920
import com.yugabyte.yw.models.filters.MetricFilter;
21+
import com.yugabyte.yw.models.helpers.KnownAlertLabels;
22+
import com.yugabyte.yw.models.helpers.NodeDetails;
2023
import com.yugabyte.yw.models.helpers.PlatformMetrics;
2124
import java.util.ArrayList;
2225
import java.util.Collections;
2326
import java.util.List;
27+
import lombok.extern.slf4j.Slf4j;
2428

2529
@Singleton
30+
@Slf4j
2631
public class UniverseMetricProvider implements MetricsProvider {
2732

2833
private static final List<PlatformMetrics> UNIVERSE_METRICS =
2934
ImmutableList.of(
3035
PlatformMetrics.UNIVERSE_EXISTS,
3136
PlatformMetrics.UNIVERSE_PAUSED,
3237
PlatformMetrics.UNIVERSE_UPDATE_IN_PROGRESS,
33-
PlatformMetrics.UNIVERSE_BACKUP_IN_PROGRESS);
38+
PlatformMetrics.UNIVERSE_BACKUP_IN_PROGRESS,
39+
PlatformMetrics.UNIVERSE_NODE_FUNCTION);
3440

3541
@Override
3642
public List<Metric> getMetrics() throws Exception {
3743
List<Metric> metrics = new ArrayList<>();
3844
for (Customer customer : Customer.getAll()) {
3945
for (Universe universe : Universe.getAllWithoutResources(customer)) {
4046
metrics.add(
41-
buildMetricTemplate(PlatformMetrics.UNIVERSE_EXISTS, customer, universe)
42-
.setValue(MetricService.STATUS_OK));
47+
createUniverseMetric(customer, universe, PlatformMetrics.UNIVERSE_EXISTS, STATUS_OK));
4348
metrics.add(
44-
buildMetricTemplate(PlatformMetrics.UNIVERSE_PAUSED, customer, universe)
45-
.setValue(statusValue(universe.getUniverseDetails().universePaused)));
49+
createUniverseMetric(
50+
customer,
51+
universe,
52+
PlatformMetrics.UNIVERSE_PAUSED,
53+
statusValue(universe.getUniverseDetails().universePaused)));
4654
metrics.add(
47-
buildMetricTemplate(PlatformMetrics.UNIVERSE_UPDATE_IN_PROGRESS, customer, universe)
48-
.setValue(statusValue(universe.getUniverseDetails().updateInProgress)));
55+
createUniverseMetric(
56+
customer,
57+
universe,
58+
PlatformMetrics.UNIVERSE_UPDATE_IN_PROGRESS,
59+
statusValue(universe.getUniverseDetails().updateInProgress)));
4960
metrics.add(
50-
buildMetricTemplate(PlatformMetrics.UNIVERSE_BACKUP_IN_PROGRESS, customer, universe)
51-
.setValue(statusValue(universe.getUniverseDetails().backupInProgress)));
61+
createUniverseMetric(
62+
customer,
63+
universe,
64+
PlatformMetrics.UNIVERSE_BACKUP_IN_PROGRESS,
65+
statusValue(universe.getUniverseDetails().backupInProgress)));
66+
67+
if (universe.getUniverseDetails().nodeDetailsSet != null) {
68+
for (NodeDetails nodeDetails : universe.getUniverseDetails().nodeDetailsSet) {
69+
if (nodeDetails.cloudInfo == null || nodeDetails.cloudInfo.private_ip == null) {
70+
log.warn(
71+
"Universe {} does not seem to be created correctly"
72+
+ " - skipping per-node metrics",
73+
universe.getUniverseUUID());
74+
break;
75+
}
76+
77+
String ipAddress = nodeDetails.cloudInfo.private_ip;
78+
createNodeMetric(
79+
customer,
80+
universe,
81+
PlatformMetrics.UNIVERSE_NODE_FUNCTION,
82+
ipAddress,
83+
nodeDetails.masterHttpPort,
84+
"master_export",
85+
statusValue(nodeDetails.isMaster));
86+
createNodeMetric(
87+
customer,
88+
universe,
89+
PlatformMetrics.UNIVERSE_NODE_FUNCTION,
90+
ipAddress,
91+
nodeDetails.tserverHttpPort,
92+
"tserver_export",
93+
statusValue(nodeDetails.isTserver));
94+
createNodeMetric(
95+
customer,
96+
universe,
97+
PlatformMetrics.UNIVERSE_NODE_FUNCTION,
98+
ipAddress,
99+
nodeDetails.ysqlServerHttpPort,
100+
"ysql_export",
101+
statusValue(nodeDetails.isYsqlServer));
102+
createNodeMetric(
103+
customer,
104+
universe,
105+
PlatformMetrics.UNIVERSE_NODE_FUNCTION,
106+
ipAddress,
107+
nodeDetails.yqlServerHttpPort,
108+
"cql_export",
109+
statusValue(nodeDetails.isYqlServer));
110+
createNodeMetric(
111+
customer,
112+
universe,
113+
PlatformMetrics.UNIVERSE_NODE_FUNCTION,
114+
ipAddress,
115+
nodeDetails.redisServerHttpPort,
116+
"redis_export",
117+
statusValue(nodeDetails.isRedisServer));
118+
}
119+
}
52120
}
53121
}
54122
return metrics;
55123
}
56124

125+
private Metric createUniverseMetric(
126+
Customer customer, Universe universe, PlatformMetrics metric, double value) {
127+
String nodePrefix = universe.getUniverseDetails().nodePrefix;
128+
return buildMetricTemplate(metric, customer, universe)
129+
.setLabel(KnownAlertLabels.NODE_PREFIX, nodePrefix)
130+
.setValue(value);
131+
}
132+
133+
private Metric createNodeMetric(
134+
Customer customer,
135+
Universe universe,
136+
PlatformMetrics metric,
137+
String ipAddress,
138+
int port,
139+
String exportType,
140+
double value) {
141+
String nodePrefix = universe.getUniverseDetails().nodePrefix;
142+
return buildMetricTemplate(metric, customer, universe)
143+
.setKeyLabel(KnownAlertLabels.NODE_PREFIX, nodePrefix)
144+
.setKeyLabel(KnownAlertLabels.INSTANCE, ipAddress + ":" + port)
145+
.setLabel(KnownAlertLabels.EXPORT_TYPE, exportType)
146+
.setValue(value);
147+
}
148+
57149
@Override
58150
public List<MetricFilter> getMetricsToRemove() throws Exception {
59151
return Collections.singletonList(MetricFilter.builder().metrics(UNIVERSE_METRICS).build());

managed/src/main/java/com/yugabyte/yw/models/AlertConfigurationThreshold.java

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
import static io.swagger.annotations.ApiModelProperty.AccessMode.READ_WRITE;
1414

15+
import com.yugabyte.yw.models.common.Condition;
1516
import io.swagger.annotations.ApiModel;
1617
import io.swagger.annotations.ApiModelProperty;
1718
import lombok.Data;
@@ -27,21 +28,6 @@
2728
"Alert configuration threshold. Conditions can be either greater than a specified value, or less than a specified value.")
2829
public class AlertConfigurationThreshold {
2930

30-
public enum Condition {
31-
GREATER_THAN(">"),
32-
LESS_THAN("<");
33-
34-
private final String value;
35-
36-
Condition(String value) {
37-
this.value = value;
38-
}
39-
40-
public String getValue() {
41-
return value;
42-
}
43-
}
44-
4531
@ApiModelProperty(
4632
value = "Threshold condition (greater than, or less than)",
4733
allowableValues = ">, <",
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
/*
2+
* Copyright 2021 YugaByte, Inc. and Contributors
3+
*
4+
* Licensed under the Polyform Free Trial License 1.0.0 (the "License"); you
5+
* may not use this file except in compliance with the License. You
6+
* may obtain a copy of the License at
7+
*
8+
* http://github.com/YugaByte/yugabyte-db/blob/master/licenses/POLYFORM-FREE-TRIAL-LICENSE-1.0.0.txt
9+
*/
10+
package com.yugabyte.yw.models.common;
11+
12+
/** Threshold comparison condition, each constant paired with its operator symbol. */
public enum Condition {
  /** Greater-than comparison, rendered as {@code ">"}. */
  GREATER_THAN(">"),
  /** Less-than comparison, rendered as {@code "<"}. */
  LESS_THAN("<");

  /** Operator symbol for this condition. */
  private final String symbol;

  Condition(String symbol) {
    this.symbol = symbol;
  }

  /**
   * Returns the operator symbol of this condition.
   *
   * @return {@code ">"} or {@code "<"}
   */
  public String getValue() {
    return this.symbol;
  }
}

managed/src/main/java/com/yugabyte/yw/models/common/Unit.java

Lines changed: 62 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,34 +9,56 @@
99
*/
1010
package com.yugabyte.yw.models.common;
1111

12+
import static com.yugabyte.yw.models.common.Condition.GREATER_THAN;
13+
import static com.yugabyte.yw.models.common.Condition.LESS_THAN;
14+
15+
import lombok.Data;
16+
import lombok.experimental.Accessors;
17+
1218
public enum Unit {
13-
STATUS(Measure.STATUS, "", "", 0, 1, true),
14-
COUNT(Measure.COUNT, "", "", 0, Double.MAX_VALUE, true),
15-
PERCENT(Measure.PERCENTAGE, "%", "pct", 0, 100, false),
16-
MILLISECOND(Measure.TIME, "ms", "ms", 0, Double.MAX_VALUE, true),
17-
SECOND(Measure.TIME, "sec", "sec", 0, Double.MAX_VALUE, true),
18-
DAY(Measure.TIME, "day", "day", 0, Double.MAX_VALUE, true);
19+
STATUS(
20+
new UnitBuilder()
21+
.measure(Measure.STATUS)
22+
.maxValue(1)
23+
.integer(true)
24+
.thresholdReadOnly(true)
25+
.thresholdCondition(LESS_THAN)),
26+
COUNT(new UnitBuilder().measure(Measure.COUNT).integer(true)),
27+
PERCENT(
28+
new UnitBuilder()
29+
.measure(Measure.PERCENTAGE)
30+
.displayName("%")
31+
.metricName("pct")
32+
.maxValue(100)),
33+
MILLISECOND(
34+
new UnitBuilder().measure(Measure.TIME).displayName("ms").metricName("ms").integer(true)),
35+
SECOND(
36+
new UnitBuilder().measure(Measure.TIME).displayName("sec").metricName("sec").integer(true)),
37+
DAY(
38+
new UnitBuilder()
39+
.measure(Measure.TIME)
40+
.displayName("day(s)")
41+
.metricName("day")
42+
.integer(true));
1943

2044
private final Measure measure;
2145
private final String displayName;
2246
private final String metricName;
2347
private final double minValue;
2448
private final double maxValue;
2549
private final boolean integer;
50+
private final boolean thresholdReadOnly;
51+
private final Condition thresholdCondition;
2652

27-
Unit(
28-
Measure measure,
29-
String displayName,
30-
String metricName,
31-
double minValue,
32-
double maxValue,
33-
boolean integer) {
34-
this.measure = measure;
35-
this.displayName = displayName;
36-
this.metricName = metricName;
37-
this.minValue = minValue;
38-
this.maxValue = maxValue;
39-
this.integer = integer;
53+
Unit(UnitBuilder builder) {
54+
this.measure = builder.measure();
55+
this.displayName = builder.displayName();
56+
this.metricName = builder.metricName();
57+
this.minValue = builder.minValue();
58+
this.maxValue = builder.maxValue();
59+
this.integer = builder.integer();
60+
this.thresholdReadOnly = builder.thresholdReadOnly();
61+
this.thresholdCondition = builder.thresholdCondition();
4062
}
4163

4264
public Measure getMeasure() {
@@ -62,4 +84,25 @@ public double getMaxValue() {
6284
public boolean isInteger() {
6385
return integer;
6486
}
87+
88+
public boolean isThresholdReadOnly() {
89+
return thresholdReadOnly;
90+
}
91+
92+
public Condition getThresholdCondition() {
93+
return thresholdCondition;
94+
}
95+
96+
@Data
97+
@Accessors(chain = true, fluent = true)
98+
private static class UnitBuilder {
99+
private Measure measure;
100+
private String displayName = "";
101+
private String metricName = "";
102+
private double minValue = 0;
103+
private double maxValue = Double.MAX_VALUE;
104+
private boolean integer = false;
105+
private boolean thresholdReadOnly = false;
106+
private Condition thresholdCondition = GREATER_THAN;
107+
}
65108
}

managed/src/main/java/com/yugabyte/yw/models/helpers/KnownAlertLabels.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@ public enum KnownAlertLabels {
2525
SEVERITY,
2626
THRESHOLD,
2727
ERROR_MESSAGE,
28-
NODE_NAME;
28+
NODE_NAME,
29+
NODE_PREFIX,
30+
INSTANCE,
31+
EXPORT_TYPE;
2932

3033
public String labelName() {
3134
return name().toLowerCase();

managed/src/main/java/com/yugabyte/yw/models/helpers/PlatformMetrics.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,8 @@ public enum PlatformMetrics {
8383
UNIVERSE_EXISTS("Flag, indicating that universe exists", Unit.STATUS),
8484
UNIVERSE_PAUSED("Flag, indicating that universe is paused", Unit.STATUS),
8585
UNIVERSE_UPDATE_IN_PROGRESS("Flag, indicating that universe update is in progress", Unit.STATUS),
86-
UNIVERSE_BACKUP_IN_PROGRESS("Flag, indicating that universe backup is in progress", Unit.STATUS);
86+
UNIVERSE_BACKUP_IN_PROGRESS("Flag, indicating that universe backup is in progress", Unit.STATUS),
87+
UNIVERSE_NODE_FUNCTION("Flag, indicating expected node functions", Unit.STATUS);
8788

8889
private final String help;
8990
private final Unit unit;
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
-- Copyright (c) YugaByte, Inc.
2+
3+
-- No need to fix anything for h2.

0 commit comments

Comments
 (0)