From 690623cb685660917dfb43942ebfbcc73a326b8a Mon Sep 17 00:00:00 2001 From: Matt Anson Date: Sat, 22 Feb 2025 09:34:13 +0000 Subject: [PATCH 01/16] Configuration for the StackHPC fork of Redfish Exporter --- .../grafana/dashboards/openstack/redfish.json | 2465 +++++------------ .../prometheus.yml.d/60-redfish.yml | 30 +- etc/kayobe/seed.yml | 4 +- 3 files changed, 716 insertions(+), 1783 deletions(-) diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json index 92001f8421..e4d959a140 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json @@ -20,9 +20,8 @@ "fiscalYearStartMonth": 0, "gnetId": 12403, "graphTooltip": 0, - "id": 28, + "id": 30, "links": [], - "liveNow": false, "panels": [ { "datasource": { @@ -60,6 +59,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -71,15 +71,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "count(up{job=~\"redfish-exporter.*\"} == 1)", + "expr": "count(up{job=\"redfish-exporter\"} == 1)", "format": "table", "hide": false, "instant": true, @@ -140,6 +139,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -151,7 +151,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -175,11 +175,9 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "", "fieldConfig": { "defaults": { "mappings": [], - "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ @@ -208,6 +206,7 @@ "graphMode": 
"area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -219,7 +218,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -228,14 +227,11 @@ }, "expr": "count(redfish_system_power_state != 1)", "format": "table", - "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, "instant": true, "interval": "", - "legendFormat": "__auto", - "refId": "A", - "useBackend": false + "legendFormat": "", + "refId": "A" } ], "title": "Powered Off", @@ -249,7 +245,6 @@ "fieldConfig": { "defaults": { "mappings": [], - "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ @@ -278,6 +273,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -289,7 +285,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -313,11 +309,9 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "", "fieldConfig": { "defaults": { "mappings": [], - "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ @@ -346,6 +340,7 @@ "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "mean" @@ -357,7 +352,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -419,8 +414,9 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -434,7 +430,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -464,26 
+460,28 @@ "overrides": [] }, "gridPos": { - "h": 15, - "w": 5, + "h": 11, + "w": 7, "x": 0, "y": 4 }, "id": 36, "options": { "legend": { - "calcs": [], + "calcs": [ + "mean", + "max" + ], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -515,8 +513,9 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -530,7 +529,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -560,26 +559,29 @@ "overrides": [] }, "gridPos": { - "h": 15, + "h": 11, "w": 6, - "x": 5, + "x": 7, "y": 4 }, "id": 44, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -602,13 +604,43 @@ }, "fieldConfig": { "defaults": { + "color": { + "mode": "palette-classic" + }, "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "inspect": false + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + 
"stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, + "links": [], "mappings": [], "thresholds": { "mode": "absolute", @@ -622,504 +654,60 @@ "value": 80 } ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Time" - }, - "properties": [ - { - "id": "displayName", - "value": "Time" - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "instance" - }, - "properties": [ - { - "id": "displayName", - "value": "BMC" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "", - "url": "https://${__cell}" - } - ] - }, - { - "id": "custom.align", - "value": "left" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "displayName", - "value": "Power state" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.cellOptions", - "value": { - "type": "color-background" - } - }, - { - "id": "custom.align" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "#73BF69", - "value": null - }, - { - "color": "#73BF69", - "value": 0 - }, - { - "color": "#C4162A", - "value": 1 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "__name__" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "env" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { 
- "id": "byName", - "options": "job" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "resource" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "system_id" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "server" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "", - "url": "https://${__cell_3}" - } - ] - }, - { - "id": "custom.align" - } - ] }, - { - "matcher": { - "id": "byName", - "options": "hostname" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - } - ] + "unit": "short" + }, + "overrides": [] }, "gridPos": { - "h": 15, - "w": 5, - "x": 11, + "h": 11, + "w": 6, + "x": 13, "y": 4 }, - "id": 38, + "id": 42, "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" + "legend": { + "calcs": [ + "lastNotNull", + "max" ], - "show": false + "displayMode": "table", + "placement": "bottom", + "showLegend": true }, - "showHeader": true + "tooltip": { + "mode": "multi", + "sort": "none" + } }, - "pluginVersion": "11.0.0", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": 
"sort_desc(redfish_system_power_state)", - "format": "table", - "instant": true, + "expr": "count(redfish_system_power_state == 1) by (env)", + "hide": false, "interval": "", - "legendFormat": "", + "legendFormat": "Powered up {{ env }}", "refId": "A" - } - ], - "title": "Power states", - "transformations": [ - { - "id": "merge", - "options": { - "reducers": [] - } - } - ], - "type": "table" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/__name__|chassis_id|Time|env|job|resource|instance/" - }, - "properties": [ - { - "id": "displayName", - "value": "Time" - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "displayName", - "value": "Status" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.cellOptions", - "value": { - "type": "color-background" - } - }, - { - "id": "custom.align" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 1 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 2 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "server" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "", - "url": "https://$__cell_4" - } - ] - }, - { - "id": "custom.align" - } - ] 
- } - ] - }, - "gridPos": { - "h": 15, - "w": 8, - "x": 16, - "y": 4 - }, - "id": 33, - "interval": "", - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" - ], - "show": false }, - "showHeader": true - }, - "pluginVersion": "11.0.0", - "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "sort(redfish_chassis_health)", - "format": "table", - "hide": false, - "instant": true, + "expr": "count(redfish_system_power_state == 2) by (env) * -1", + "hide": true, "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Chassis status", - "transformations": [ - { - "id": "merge", - "options": { - "reducers": [] - } + "legendFormat": "Powered down {{ env }}", + "refId": "B" } ], - "type": "table" + "title": "Powered ON by Rack", + "type": "timeseries" }, { "datasource": { @@ -1138,8 +726,9 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1153,7 +742,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -1165,7 +754,6 @@ }, "links": [], "mappings": [], - "min": 16, "thresholds": { "mode": "absolute", "steps": [ @@ -1179,56 +767,66 @@ } ] }, - "unit": "celsius" + "unit": "short" }, "overrides": [] }, "gridPos": { "h": 11, "w": 5, - "x": 0, - "y": 19 + "x": 19, + "y": 4 }, - "id": 39, - "interval": "5m", + "id": 43, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { "type": "prometheus", "uid": 
"${datasource}" }, - "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor_id=~\".*InletTemp\"} or redfish_chassis_temperature_celsius{sensor=~\".*Inlet.*\"}) by (env)", - "hide": false, + "expr": "count(redfish_system_power_state == 1) by (env)", + "hide": true, "interval": "", - "legendFormat": "max inlet {{ env }}", - "range": true, + "legendFormat": "Powered up {{ env }}", "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "count(redfish_system_power_state == 2) by (env)", + "hide": false, + "interval": "", + "legendFormat": "Powered down {{ env }}", + "refId": "B" } ], - "title": "Max Inlet Temp", + "title": "Powered OFF by Rack", "type": "timeseries" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, - "description": "", "fieldConfig": { "defaults": { "color": { @@ -1241,8 +839,9 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1256,7 +855,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -1268,7 +867,7 @@ }, "links": [], "mappings": [], - "min": 40, + "min": 16, "thresholds": { "mode": "absolute", "steps": [ @@ -1288,26 +887,29 @@ }, "gridPos": { "h": 11, - "w": 6, - "x": 5, - "y": 19 + "w": 7, + "x": 0, + "y": 15 }, - "id": 40, + "id": 39, "interval": "5m", "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -1315,22 +917,24 @@ "uid": "${datasource}" }, "editorMode": 
"code", - "expr": "max(redfish_chassis_temperature_celsius{sensor_id=~\".*CPU1Temp\"} or redfish_chassis_temperature_celsius{sensor=~\".*CPU1.*\"}) by (env)", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp\"}) by (env)", "hide": false, "interval": "", - "legendFormat": "{{ env }}", + "legendFormat": "max inlet {{ env }}", "range": true, "refId": "A" } ], - "title": "Max CPU1 Temp", + "title": "Max Inlet Temp", "type": "timeseries" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, + "description": "", "fieldConfig": { "defaults": { "color": { @@ -1343,8 +947,9 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1358,7 +963,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -1370,7 +975,7 @@ }, "links": [], "mappings": [], - "min": 40, + "min": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -1386,30 +991,74 @@ }, "unit": "celsius" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + }, + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsNull", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] }, "gridPos": { "h": 11, - "w": 5, - "x": 11, - "y": 19 + "w": 8, + "x": 7, + "y": 15 }, - "id": 41, + "id": 40, "interval": "5m", "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": 
"table", "placement": "bottom", "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -1417,7 +1066,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor_id=~\".*CPU2Temp\"} or redfish_chassis_temperature_celsius{sensor=~\"CPU2.*\"}) by (env)", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\"}) by (env)", "hide": false, "interval": "", "legendFormat": "{{ env }}", @@ -1425,11 +1074,12 @@ "refId": "A" } ], - "title": "Max CPU2 Temp", + "title": "Max CPU1 Temp", "type": "timeseries" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -1445,8 +1095,9 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -1460,7 +1111,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -1472,6 +1123,7 @@ }, "links": [], "mappings": [], + "min": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -1485,140 +1137,76 @@ } ] }, - "unit": "short" + "unit": "celsius" }, - "overrides": [] - }, - "gridPos": { - "h": 11, - "w": 4, - "x": 16, - "y": 19 - }, - "id": 42, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "maxHeight": 600, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "10.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "count(redfish_system_power_state == 1) by (env)", - "hide": false, - "interval": "", - "legendFormat": "Powered up {{ env }}", - "refId": "A" - }, - { - "datasource": { - "type": 
"prometheus", - "uid": "${datasource}" - }, - "expr": "count(redfish_system_power_state == 2) by (env) * -1", - "hide": true, - "interval": "", - "legendFormat": "Powered down {{ env }}", - "refId": "B" - } - ], - "title": "Powered ON by Rack", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, + "properties": [ { - "color": "red", - "value": 80 + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } } ] }, - "unit": "short" - }, - "overrides": [] + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsNull", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] }, "gridPos": { "h": 11, - "w": 4, - "x": 20, - "y": 19 + "w": 9, + "x": 15, + "y": 15 }, - "id": 43, + "id": 41, + "interval": "5m", "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + 
"displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -1626,26 +1214,15 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_system_power_state == 1) by (env)", - "hide": true, + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*CPU.2.*Temp\"}) by (env)", + "hide": false, "interval": "", - "legendFormat": "Powered up {{ env }}", + "legendFormat": "{{ env }}", "range": true, "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "count(redfish_system_power_state == 2) by (env)", - "hide": false, - "interval": "", - "legendFormat": "Powered down {{ env }}", - "refId": "B" } ], - "title": "Powered OFF by Rack", + "title": "Max CPU2 Temp", "type": "timeseries" }, { @@ -1659,6 +1236,7 @@ }, "dataFormat": "timeseries", "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -1679,9 +1257,9 @@ }, "gridPos": { "h": 10, - "w": 7, + "w": 5, "x": 0, - "y": 30 + "y": 26 }, "heatmap": {}, "hideZeroBuckets": false, @@ -1719,7 +1297,6 @@ }, "showValue": "never", "tooltip": { - "maxHeight": 600, "mode": "single", "showColorScale": false, "yHistogram": true @@ -1730,7 +1307,7 @@ "unit": "short" } }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "reverseYBuckets": false, "targets": [ { @@ -1739,7 +1316,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor_id=~\".*InletTemp\"} or redfish_chassis_temperature_celsius{sensor=~\".*Inlet.*\"}", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp\"}", "hide": false, "interval": "", "intervalFactor": 1, @@ -1775,6 +1352,7 @@ }, "dataFormat": "timeseries", "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -1796,14 +1374,14 @@ }, 
"gridPos": { "h": 10, - "w": 5, - "x": 7, - "y": 30 + "w": 6, + "x": 5, + "y": 26 }, "heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, - "id": 51, + "id": 48, "interval": "1m", "legend": { "show": false @@ -1836,18 +1414,19 @@ }, "showValue": "never", "tooltip": { - "maxHeight": 600, "mode": "single", "showColorScale": false, "yHistogram": true }, "yAxis": { "axisPlacement": "left", + "max": "95", + "min": "25", "reverse": false, "unit": "short" } }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "reverseYBuckets": false, "targets": [ { @@ -1855,16 +1434,17 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "max(redfish_chassis_fan_rpm_percentage) by (server) > 0", + "editorMode": "code", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\"} != 0", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "", + "legendFormat": "{{ env }}", "range": true, "refId": "A" } ], - "title": "Max server fan speed", + "title": "CPU1 Temp", "tooltip": { "show": true, "showHistogram": true @@ -1876,6 +1456,8 @@ "yAxis": { "format": "short", "logBase": 1, + "max": "95", + "min": "25", "show": true }, "yBucketBound": "auto" @@ -1891,6 +1473,7 @@ }, "dataFormat": "timeseries", "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -1913,13 +1496,13 @@ "gridPos": { "h": 10, "w": 6, - "x": 12, - "y": 30 + "x": 11, + "y": 26 }, "heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, - "id": 48, + "id": 50, "interval": "1m", "legend": { "show": false @@ -1952,7 +1535,6 @@ }, "showValue": "never", "tooltip": { - "maxHeight": 600, "mode": "single", "showColorScale": false, "yHistogram": true @@ -1965,7 +1547,7 @@ "unit": "short" } }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "reverseYBuckets": false, "targets": [ { @@ -1974,7 +1556,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "(redfish_chassis_temperature_celsius{sensor_id=~\".*CPU1Temp\"} or 
redfish_chassis_temperature_celsius{sensor=~\"CPU1.*\"}) != 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*2.*Temp\"} != 0", "hide": false, "interval": "", "intervalFactor": 1, @@ -1983,7 +1565,7 @@ "refId": "A" } ], - "title": "CPU1 Temp", + "title": "CPU2 Temp", "tooltip": { "show": true, "showHistogram": true @@ -2033,14 +1615,14 @@ }, "gridPos": { "h": 10, - "w": 6, + "w": 5, "x": 18, - "y": 30 + "y": 26 }, "heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, - "id": 50, + "id": 51, "interval": "1m", "legend": { "show": false @@ -2073,20 +1655,17 @@ }, "showValue": "never", "tooltip": { - "maxHeight": 600, "mode": "single", "showColorScale": false, "yHistogram": true }, "yAxis": { "axisPlacement": "left", - "max": "95", - "min": "25", "reverse": false, "unit": "short" } }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "reverseYBuckets": false, "targets": [ { @@ -2094,17 +1673,15 @@ "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "(redfish_chassis_temperature_celsius{sensor_id=~\".*CPU2Temp\"} or redfish_chassis_temperature_celsius{sensor=~\"CPU2.*\"}) != 0", + "expr": "max(redfish_chassis_fan_rpm_percentage) by (server) > 0", "hide": false, "interval": "", "intervalFactor": 1, - "legendFormat": "{{ env }}", - "range": true, + "legendFormat": "", "refId": "A" } ], - "title": "CPU2 Temp", + "title": "Max server fan speed", "tooltip": { "show": true, "showHistogram": true @@ -2116,14 +1693,13 @@ "yAxis": { "format": "short", "logBase": 1, - "max": "95", - "min": "25", "show": true }, "yBucketBound": "auto" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -2139,6 +1715,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -2185,9 +1762,9 @@ }, "gridPos": { "h": 10, - "w": 13, + "w": 12, "x": 0, - "y": 40 + "y": 36 }, "id": 47, "interval": 
"5m", @@ -2203,12 +1780,11 @@ "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -2216,7 +1792,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor_id=~\".*InletTemp\"} or redfish_chassis_temperature_celsius{sensor=~\"CPU2.*\"}", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp\"}", "hide": false, "interval": "", "legendFormat": "{{ env }} {{ server }}", @@ -2229,6 +1805,7 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -2245,6 +1822,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -2291,9 +1869,9 @@ }, "gridPos": { "h": 10, - "w": 11, - "x": 13, - "y": 40 + "w": 12, + "x": 12, + "y": 36 }, "id": 49, "interval": "1m", @@ -2308,12 +1886,11 @@ "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - "pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -2321,7 +1898,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "(redfish_chassis_temperature_celsius{sensor_id=~\".*CPU1Temp\"} or redfish_chassis_temperature_celsius{sensor=~\"CPU1.*\"}) != 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\"} != 0", "hide": false, "interval": "", "legendFormat": "{{ env }} {{ server }}", @@ -2332,252 +1909,97 @@ "title": "Max CPU1 Temp", "type": "timeseries" }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 46 + }, + "id": 29, + "panels": [], + "repeat": "server", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "$server", + "type": "row" 
+ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, "mappings": [], + "max": 500, + "min": 0, "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { "color": "green", "value": null - }, - { - "color": "red", - "value": 80 } ] - } + }, + "unit": "watt" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Time" - }, - "properties": [ - { - "id": "displayName", - "value": "Time" - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "displayName", - "value": "Count" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "custom.cellOptions", - "value": { - "type": "color-background" - } - }, - { - "id": "custom.align" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 0 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 1 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "instance" - }, - "properties": [ - { - "id": "displayName", - "value": "BMC" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "", - "url": "https://${__cell:raw}" - } - ] - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "__name__" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "job" - }, - "properties": [ - { - "id": "unit", - 
"value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - } - ] + "overrides": [] }, "gridPos": { - "h": 7, - "w": 24, + "h": 4, + "w": 8, "x": 0, - "y": 50 + "y": 47 }, - "id": 34, + "id": 19, "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" ], - "show": false + "fields": "", + "values": false }, - "showHeader": true + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "sort(redfish_logservices_entry_count{name=\"SEL Log Service\", severity!=\"OK\"})", - "format": "table", - "instant": true, + "editorMode": "code", + "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\", job=\"redfish-exporter\"}", + "hide": false, "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "title": "Errors in event log", - "transformations": [ - { - "id": "merge", - "options": { - "reducers": [] - } - } - ], - "type": "table" - }, - { - "collapsed": false, - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 57 - }, - "id": 29, - "panels": [], - "repeat": "server", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, + "legendFormat": "{{power_voltage}}", + "range": true, "refId": "A" } ], - "title": "$server", - "type": "row" + "title": "Power consumption", + "type": "gauge" }, { "datasource": { @@ -2586,8 +2008,8 @@ }, "fieldConfig": { "defaults": { + "displayName": "Disks", "mappings": [], - "max": 500, "min": 0, "thresholds": { "mode": "absolute", @@ -2601,18 +2023,17 @@ "value": 80 } ] - }, - 
"unit": "watt" + } }, "overrides": [] }, "gridPos": { "h": 4, - "w": 8, - "x": 0, - "y": 58 + "w": 2, + "x": 8, + "y": 47 }, - "id": 19, + "id": 24, "options": { "minVizHeight": 75, "minVizWidth": 75, @@ -2628,446 +2049,23 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\"}", - "hide": false, + "expr": "count(redfish_system_storage_drive_state{server=~\"$server\"} != 1) or vector(0)", + "format": "table", + "instant": true, "interval": "", - "legendFormat": "{{type}}", + "legendFormat": "", "refId": "A" } ], - "title": "Power consumption", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 3, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "line+area" - } - }, - "decimals": 1, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "transparent", - "value": 10000 - } - ] - }, - "unit": "rpm" - }, - "overrides": [ - { - "matcher": { - "id": "byValue", - "options": { - "op": "gte", - "reducer": "allIsZero", - "value": 0 - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": true, - "tooltip": 
true, - "viz": false - } - } - ] - }, - { - "matcher": { - "id": "byValue", - "options": { - "op": "gte", - "reducer": "allIsNull", - "value": 0 - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": true, - "tooltip": true, - "viz": false - } - } - ] - } - ] - }, - "gridPos": { - "h": 14, - "w": 8, - "x": 8, - "y": 58 - }, - "id": 4, - "interval": "", - "options": { - "legend": { - "calcs": [ - "max" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "maxHeight": 600, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "10.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "redfish_chassis_fan_rpm_percentage{server=~\"$server\"}", - "format": "time_series", - "instant": false, - "interval": "", - "legendFormat": "{{fan}}", - "refId": "A" - } - ], - "title": "Fans", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "decimals": 0, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 16, - 
"y": 58 - }, - "id": 13, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "maxHeight": 600, - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "10.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\"}", - "interval": "", - "legendFormat": "{{resource}}", - "refId": "A" - } - ], - "title": "Power consumption", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "1": { - "text": "Up" - }, - "2": { - "text": "Down" - } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#d44a3a", - "value": null - }, - { - "color": "#299c46", - "value": 0 - }, - { - "color": "#299c46", - "value": 2 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 0, - "y": 62 - }, - "id": 6, - "maxDataPoints": 100, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "redfish_system_power_state{server=~\"$server\"}", - "format": "time_series", - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Power Status", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, 
- "mappings": [ - { - "options": { - "1": { - "text": "OK" - }, - "2": { - "text": "WARNING" - }, - "3": { - "text": "CRITICAL" - } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#299c46", - "value": null - }, - { - "color": "#299c46", - "value": 1 - }, - { - "color": "#d44a3a", - "value": 2 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 2, - "y": 62 - }, - "id": 7, - "maxDataPoints": 100, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "vertical", - "reduceOptions": { - "calcs": [ - "max" - ], - "fields": "/^Value$/", - "values": false - }, - "showPercentChange": false, - "textMode": "value", - "wideLayout": true - }, - "pluginVersion": "11.0.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "redfish_chassis_health{server=~\"$server\"}", - "format": "table", - "instant": true, - "interval": "", - "legendFormat": "{{chassis_id}}", - "range": false, - "refId": "A" - } - ], - "title": "General Health", - "type": "stat" + "title": "Disk with errors", + "type": "gauge" }, { "datasource": { @@ -3076,103 +2074,76 @@ }, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "1": { - "text": "Healthy" - }, - "2": { - "text": "Warning" - }, - "3": { - "text": "Critical" - } - }, - "type": "value" - } - ], + "displayName": "Controllers", + "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ { - "color": "#d44a3a", + "color": "green", "value": null }, { - "color": "#299c46", - "value": 1 - }, - { - "color": "#299c46", - "value": 2 + "color": "red", + "value": 80 } ] - }, - "unit": "none" + } }, "overrides": [] }, "gridPos": { - "h": 2, + "h": 4, "w": 2, - "x": 6, - "y": 62 + "x": 10, + "y": 47 }, - "id": 8, - "maxDataPoints": 100, + 
"id": 25, "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", "reduceOptions": { "calcs": [ - "lastNotNull" + "mean" ], "fields": "", "values": false }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "redfish_manager_health_state{server=~\"$server\"}", - "format": "time_series", + "expr": "count(redfish_system_pcie_device_health_state{server=~\"$server\"} != 1) or vector(0)", + "format": "table", "instant": true, "interval": "", "legendFormat": "", "refId": "A" } ], - "title": "BMC Health", - "type": "stat" + "title": "PCI-E with errors", + "type": "gauge" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "inspect": false - }, + "displayName": "Sensor", "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -3187,224 +2158,39 @@ ] } }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Time" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "__name__" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "instance" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - 
"value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "job" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "env" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "displayName", - "value": "Status" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.cellOptions", - "value": { - "type": "color-background" - } - }, - { - "id": "custom.align" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(245, 54, 54, 0.9)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 0 - }, - { - "color": "rgba(50, 172, 45, 0.97)", - "value": 1 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "resource" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.hidden", - "value": true - }, - { - "id": "custom.align" - } - ] - } - ] + "overrides": [] }, "gridPos": { - "h": 12, - "w": 8, - "x": 0, - "y": 64 + "h": 4, + "w": 2, + "x": 12, + "y": 47 }, - "id": 2, - "interval": "", + "id": 26, "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "fields": "", - "reducer": [ - "sum" + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" ], - "show": false + "fields": "", + "values": false }, - "showHeader": true + "showThresholdLabels": false, + 
"showThresholdMarkers": true, + "sizing": "auto" }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "redfish_system_storage_drive_state{server=~\"$server\"}", + "editorMode": "code", + "expr": "count(redfish_chassis_temperature_sensor_state{server=~\"$server\"} > 2) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -3412,16 +2198,8 @@ "refId": "A" } ], - "title": "Disk states / health", - "transformations": [ - { - "id": "merge", - "options": { - "reducers": [] - } - } - ], - "type": "table" + "title": "Sensors with errors", + "type": "gauge" }, { "datasource": { @@ -3430,7 +2208,7 @@ }, "fieldConfig": { "defaults": { - "displayName": "Disks", + "displayName": "Power Supply", "mappings": [], "min": 0, "thresholds": { @@ -3452,10 +2230,10 @@ "gridPos": { "h": 4, "w": 2, - "x": 8, - "y": 72 + "x": 14, + "y": 47 }, - "id": 24, + "id": 27, "options": { "minVizHeight": 75, "minVizWidth": 75, @@ -3471,14 +2249,14 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_storage_drive_state{server=~\"$server\"} != 1) or vector(0)", + "expr": "count(redfish_chassis_power_powersupply_health{server=~\"$server\"} > 1) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -3486,18 +2264,29 @@ "refId": "A" } ], - "title": "Disk with errors", + "title": "PS with errors", "type": "gauge" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "displayName": "Controllers", + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "color-text" + }, + "inspect": false + }, "mappings": [], + "max": 80, "min": 0, "thresholds": { "mode": "absolute", @@ -3508,52 +2297,69 @@ }, { 
"color": "red", - "value": 80 + "value": 70 } ] - } + }, + "unit": "degree" }, "overrides": [] }, "gridPos": { - "h": 4, - "w": 2, - "x": 10, - "y": 72 + "h": 18, + "w": 8, + "x": 16, + "y": 47 }, - "id": 25, + "id": 17, "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "mean" - ], + "cellHeight": "sm", + "footer": { + "countRows": false, "fields": "", - "values": false + "reducer": [ + "sum" + ], + "show": false }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" + "showHeader": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_pcie_device_health_state{server=~\"$server\"} != 1) or vector(0)", + "editorMode": "code", + "exemplar": false, + "expr": "sum by (sensor) (sum_over_time(redfish_chassis_temperature_celsius{server=~\"oscephpor01\"}[15m])) / sum by (sensor) (count_over_time(redfish_chassis_temperature_celsius{server=~\"oscephpor01\"}[15m])) > 0", "format": "table", "instant": true, "interval": "", - "legendFormat": "", + "legendFormat": "{{sensor}}", + "range": false, "refId": "A" } ], - "title": "PCI-E with errors", - "type": "gauge" + "title": "Temperatures", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": { + "Value": "Temperature", + "sensor": "Sensor" + } + } + } + ], + "type": "table" }, { "datasource": { @@ -3562,130 +2368,186 @@ }, "fieldConfig": { "defaults": { - "displayName": "Fans", - "mappings": [], - "min": 0, + "mappings": [ + { + "options": { + "1": { + "text": "Up" + } + }, + "type": "value" + }, + { + "options": { + "2": { + "text": "Down" + } + }, + "type": "value" + } + ], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "#d44a3a", "value": null }, { - "color": "red", - 
"value": 80 + "color": "#299c46", + "value": 0 + }, + { + "color": "#299c46", + "value": 2 } ] - } + }, + "unit": "none" }, "overrides": [] }, "gridPos": { "h": 4, - "w": 2, - "x": 12, - "y": 72 + "w": 3, + "x": 0, + "y": 51 }, - "id": 26, + "id": 6, + "maxDataPoints": 100, "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ - "mean" + "lastNotNull" ], "fields": "", "values": false }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_chassis_temperature_sensor_state{server=~\"$server\"} != 1) or vector(0)", - "format": "table", + "expr": "redfish_system_power_state{server=~\"$server\"}", + "format": "time_series", "instant": true, "interval": "", "legendFormat": "", "refId": "A" } ], - "title": "Sensors with errors", - "type": "gauge" + "title": "Power Status", + "type": "stat" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "displayName": "Power Supply", - "mappings": [], - "min": 0, + "mappings": [ + { + "options": { + "1": { + "text": "OK" + } + }, + "type": "value" + }, + { + "options": { + "2": { + "text": "WARNING" + } + }, + "type": "value" + }, + { + "options": { + "3": { + "text": "CRITICAL" + } + }, + "type": "value" + } + ], "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "#299c46", "value": null }, { - "color": "red", - "value": 80 + "color": "#299c46", + "value": 1 + }, + { + "color": "#d44a3a", + "value": 2 } ] - } + }, + "unit": "none" }, "overrides": [] }, "gridPos": { "h": 4, 
- "w": 2, - "x": 14, - "y": 72 + "w": 5, + "x": 3, + "y": 51 }, - "id": 27, + "id": 7, + "maxDataPoints": 100, "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ - "mean" + "lastNotNull" ], "fields": "", "values": false }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_chassis_power_powersupply_health{server=~\"$server\"} > 1) or vector(0)", - "format": "table", + "editorMode": "code", + "expr": "redfish_chassis_health{server=~\"$server\", job=\"redfish-exporter\"}", + "format": "time_series", "instant": true, "interval": "", - "legendFormat": "", + "legendFormat": "Chassis {{chassis_id}}", "refId": "A" } ], - "title": "PS with errors", - "type": "gauge" + "title": "General Health", + "type": "stat" }, { "datasource": { @@ -3694,6 +2556,42 @@ }, "fieldConfig": { "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, "mappings": [], "max": 80, "min": 0, @@ -3704,9 +2602,13 @@ 
"color": "green", "value": null }, + { + "color": "orange", + "value": 50 + }, { "color": "red", - "value": 80 + "value": 70 } ] }, @@ -3715,28 +2617,29 @@ "overrides": [] }, "gridPos": { - "h": 4, - "w": 24, - "x": 0, - "y": 76 + "h": 7, + "w": 8, + "x": 8, + "y": 51 }, - "id": 17, + "id": 45, "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "vertical", - "reduceOptions": { + "legend": { "calcs": [ - "mean" + "lastNotNull", + "max", + "min" ], - "fields": "", - "values": false + "displayMode": "table", + "placement": "bottom", + "showLegend": true }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" + "tooltip": { + "mode": "multi", + "sort": "none" + } }, - "pluginVersion": "11.0.0", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { @@ -3750,10 +2653,11 @@ } ], "title": "Temperatures", - "type": "gauge" + "type": "timeseries" }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -3769,8 +2673,9 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, - "drawStyle": "points", - "fillOpacity": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -3784,7 +2689,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -3794,6 +2699,7 @@ "mode": "off" } }, + "decimals": 0, "links": [], "mappings": [], "thresholds": { @@ -3809,48 +2715,53 @@ } ] }, - "unit": "degree" + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 10, - "w": 24, - "x": 0, - "y": 80 + "h": 7, + "w": 8, + "x": 8, + "y": 58 }, - "id": 45, + "id": 13, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "multi", "sort": "none" } }, - 
"pluginVersion": "10.2.2", + "pluginVersion": "7.1.5", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "redfish_chassis_temperature_celsius{server=~\"$server\"}", + "editorMode": "code", + "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\", job=\"redfish-exporter\"}", "interval": "", - "legendFormat": "{{sensor}}", + "legendFormat": "{{power_voltage}}", + "range": true, "refId": "A" } ], - "title": "Temperatures", + "title": "Power consumption", "type": "timeseries" } ], - "refresh": "", + "refresh": false, "schemaVersion": 39, "tags": [], "templating": { @@ -3877,10 +2788,9 @@ }, { "current": { - "isNone": true, - "selected": false, - "text": "None", - "value": "" + "selected": true, + "text": "oscomppor04", + "value": "oscomppor04" }, "datasource": { "type": "prometheus", @@ -3906,10 +2816,9 @@ ] }, "time": { - "from": "now-30m", - "to": "now-5m" + "from": "now-15m", + "to": "now-1m" }, - "timeRangeUpdatedDuringEditOrView": false, "timepicker": { "nowDelay": "1m", "refresh_intervals": [ @@ -3928,7 +2837,7 @@ "timezone": "", "title": "Redfish exporter", "uid": "b02mElQGX", - "version": 1, + "version": 2, "weekStart": "" } {% endraw %} diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml index 6f234e5a04..7f929c4502 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml +++ b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml @@ -2,10 +2,34 @@ --- {% if seed_redfish_exporter_container_enabled | bool %} scrape_configs: - - job_name: redfish-exporter-seed + - job_name: redfish-exporter + params: + collectlogs: ['false'] metrics_path: /redfish - scrape_timeout: 120s - scrape_interval: {{ [8 * groups['redfish_exporter_targets'] | length, 120] | max }}s + scrape_timeout: 300s + scrape_interval: {{ [8 * groups['redfish_exporter_targets'] | length, 300] | max }}s + 
relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: "{{ lookup('vars', admin_oc_net_name ~ '_ips')[groups.seed.0] }}:9610" + static_configs: +{% for host in groups.get('redfish_exporter_targets', []) %} + - targets: + - '{{ hostvars[host]["redfish_exporter_target_address"] }}' + labels: + server: '{{ host }}' + env: "{{ kayobe_environment | default('openstack') }}" + group: "{{ hostvars[host]['redfish_exporter_scrape_group'] | default('overcloud') }}" +{% endfor %} + - job_name: redfish-exporter-collectlog + params: + collectlogs: ['true'] + metrics_path: /redfish + scrape_timeout: 1200s + scrape_interval: 3600s relabel_configs: - source_labels: [__address__] target_label: __param_target diff --git a/etc/kayobe/seed.yml b/etc/kayobe/seed.yml index c76b82f8a4..fd8fd298e8 100644 --- a/etc/kayobe/seed.yml +++ b/etc/kayobe/seed.yml @@ -145,9 +145,9 @@ seed_redfish_exporter_container: image: ghcr.io/stackhpc/redfish-exporter pre: "{{ kayobe_config_path }}/containers/redfish_exporter/pre.yml" post: "{{ kayobe_config_path }}/containers/redfish_exporter/post.yml" - tag: "v1.0.2" + tag: "v2.0.0-stackhpc" network_mode: host - command: ./main --config.file /redfish_exporter.yml + command: redfish_exporter --config.file /redfish_exporter.yml volumes: "/opt/kayobe/containers/redfish_exporter/redfish_exporter.yml:/redfish_exporter.yml:ro" restart_policy: unless-stopped From 0459eab3d616c0c9a29b6729e50bee4ed17673f0 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 28 Feb 2025 18:09:32 +0000 Subject: [PATCH 02/16] Update redfish dashboard - Compatibility with Dell servers - Added health round up panels - Add variables for different groups e.g. overcloud vs compute --- .../grafana/dashboards/openstack/redfish.json | 791 ++++++++++++++++-- 1 file changed, 727 insertions(+), 64 deletions(-) diff --git
a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json index e4d959a140..e91a08a10f 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json @@ -1,4 +1,3 @@ -{% raw %} { "annotations": { "list": [ @@ -20,11 +19,12 @@ "fiscalYearStartMonth": 0, "gnetId": 12403, "graphTooltip": 0, - "id": 30, + "id": 85, "links": [], "panels": [ { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -78,6 +78,7 @@ "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", "expr": "count(up{job=\"redfish-exporter\"} == 1)", "format": "table", "hide": false, @@ -105,6 +106,7 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -134,6 +136,7 @@ "y": 0 }, "id": 54, + "interval": "30m", "options": { "colorMode": "value", "graphMode": "area", @@ -158,7 +161,8 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_power_state == 1)", + "editorMode": "code", + "expr": "count(redfish_system_power_state{group=\"$group\"} == 1)", "format": "table", "hide": false, "instant": true, @@ -172,6 +176,7 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -225,7 +230,8 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_power_state != 1)", + "editorMode": "code", + "expr": "count(redfish_system_power_state{group=\"$group\"} != 1)", "format": "table", "hide": false, "instant": true, @@ -239,6 +245,7 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -292,7 +299,8 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_chassis_health != 1)", + "editorMode": "code", + "expr": "count(redfish_chassis_health{group=\"$group\"} != 1)", "format": "table", "hide": false, "instant": true, @@ 
-306,6 +314,7 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -359,7 +368,8 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_logservices_entry_count{name=\"SEL Log Service\", severity!=\"OK\"} != 0)", + "editorMode": "code", + "expr": "count(redfish_logservices_entry_count{name=\"SEL Log Service\", severity!=\"OK\", group=\"$group\"} != 0)", "format": "table", "hide": false, "instant": true, @@ -371,6 +381,570 @@ "title": "Nodes with SEL Logs", "type": "stat" }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "1": { + "color": "dark-green", + "index": 1, + "text": "On" + }, + "2": { + "color": "dark-red", + "index": 0, + "text": "Off" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 2 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "instance" + }, + "properties": [ + { + "id": "displayName", + "value": "BMC" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.hidden", + "value": true + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://${__data.fields.instance}" + } + ] + }, + { + "id": "custom.align", + "value": "left" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Power state" + }, + { + "id": 
"unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + }, + { + "id": "custom.align" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "#73BF69", + "value": null + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "env" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "resource" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "system_id" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "server" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://${__data.fields.instance}" + } + ] + }, + { + "id": "custom.align" + } + ] + }, + { + "matcher": { + "id": 
"byName", + "options": "hostname" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 3 + }, + "id": 59, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Power state" + } + ] + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sort_desc(redfish_system_power_state{group=\"$group\"})", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Power states", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + } + ], + "type": "table" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "1": { + "index": 1, + "text": "Healthy" + }, + "2": { + "index": 2, + "text": "Unknown" + }, + "3": { + "index": 0, + "text": "Unhealthy" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/__name__|chassis_id|Time|env|job|resource|instance/" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + 
"value": "Status" + }, + { + "id": "unit", + "value": "" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + }, + { + "id": "custom.align" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "#EAB839", + "value": 2 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 3 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "server" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://${__data.fields.instance}" + } + ] + }, + { + "id": "custom.align" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 3 + }, + "id": 61, + "interval": "", + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Status" + } + ] + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sort(redfish_chassis_health{group=\"$group\"})", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Chassis status", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + } + ], + "type": "table" + }, { "collapsed": false, "datasource": { @@ -381,7 +955,7 @@ "h": 1, "w": 24, "x": 0, - "y": 3 + "y": 11 }, "id": 31, "panels": [], @@ -399,6 +973,7 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -463,7 +1038,7 @@ "h": 11, "w": 7, "x": 0, - "y": 4 + "y": 12 }, "id": 36, "options": { @@ -488,8 +1063,10 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": 
"sum(redfish_chassis_power_average_consumed_watts)", + "editorMode": "code", + "expr": "sum(redfish_chassis_power_average_consumed_watts{group=\"$group\"})", "legendFormat": "Rack power consumption", + "range": true, "refId": "A" } ], @@ -498,6 +1075,7 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -562,7 +1140,7 @@ "h": 11, "w": 6, "x": 7, - "y": 4 + "y": 12 }, "id": 44, "options": { @@ -588,9 +1166,11 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "sum(redfish_chassis_power_average_consumed_watts) by (env)", + "editorMode": "code", + "expr": "sum(redfish_chassis_power_average_consumed_watts{group=\"$group\"}) by (env)", "interval": "", "legendFormat": "{{ env }}", + "range": true, "refId": "A" } ], @@ -599,6 +1179,7 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -663,7 +1244,7 @@ "h": 11, "w": 6, "x": 13, - "y": 4 + "y": 12 }, "id": 42, "options": { @@ -688,10 +1269,12 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_power_state == 1) by (env)", + "editorMode": "code", + "expr": "count(redfish_system_power_state{group=\"$group\"} == 1) by (env)", "hide": false, "interval": "", "legendFormat": "Powered up {{ env }}", + "range": true, "refId": "A" }, { @@ -699,10 +1282,12 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_power_state == 2) by (env) * -1", + "editorMode": "code", + "expr": "count(redfish_system_power_state{group=\"$group\"} == 2) by (env) * -1", "hide": true, "interval": "", "legendFormat": "Powered down {{ env }}", + "range": true, "refId": "B" } ], @@ -711,6 +1296,7 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -775,7 +1361,7 @@ "h": 11, "w": 5, "x": 19, - "y": 4 + "y": 12 }, "id": 43, "options": { @@ -800,10 +1386,12 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_power_state == 1) by 
(env)", + "editorMode": "code", + "expr": "count(redfish_system_power_state{group=\"$group\"} == 1) by (env)", "hide": true, "interval": "", "legendFormat": "Powered up {{ env }}", + "range": true, "refId": "A" }, { @@ -811,10 +1399,12 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_power_state == 2) by (env)", + "editorMode": "code", + "expr": "count(redfish_system_power_state{group=\"$group\"} == 2) by (env)", "hide": false, "interval": "", "legendFormat": "Powered down {{ env }}", + "range": true, "refId": "B" } ], @@ -889,7 +1479,7 @@ "h": 11, "w": 7, "x": 0, - "y": 15 + "y": 23 }, "id": 39, "interval": "5m", @@ -917,7 +1507,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp\"}) by (env)", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp|.*Inlet.*\", group=\"$group\"}) by (env)", "hide": false, "interval": "", "legendFormat": "max inlet {{ env }}", @@ -1038,7 +1628,7 @@ "h": 11, "w": 8, "x": 7, - "y": 15 + "y": 23 }, "id": 40, "interval": "5m", @@ -1066,7 +1656,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\"}) by (env)", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\"}) by (env)", "hide": false, "interval": "", "legendFormat": "{{ env }}", @@ -1186,7 +1776,7 @@ "h": 11, "w": 9, "x": 15, - "y": 15 + "y": 23 }, "id": 41, "interval": "5m", @@ -1214,7 +1804,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*CPU.2.*Temp\"}) by (env)", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\"CPU2 Temp|.*CPU.2.*Temp\", group=\"$group\"}) by (env)", "hide": false, "interval": "", "legendFormat": "{{ env }}", @@ -1259,7 +1849,7 @@ "h": 10, "w": 5, "x": 0, - "y": 26 + "y": 34 }, "heatmap": {}, "hideZeroBuckets": false, @@ -1316,7 
+1906,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp\"}", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp|.*Inlet.*\", group=\"$group\"}", "hide": false, "interval": "", "intervalFactor": 1, @@ -1376,7 +1966,7 @@ "h": 10, "w": 6, "x": 5, - "y": 26 + "y": 34 }, "heatmap": {}, "hideZeroBuckets": false, @@ -1435,7 +2025,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\"} != 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\"} != 0", "hide": false, "interval": "", "intervalFactor": 1, @@ -1497,7 +2087,7 @@ "h": 10, "w": 6, "x": 11, - "y": 26 + "y": 34 }, "heatmap": {}, "hideZeroBuckets": false, @@ -1556,7 +2146,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*2.*Temp\"} != 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*2.*Temp\", group=\"$group\"} != 0", "hide": false, "interval": "", "intervalFactor": 1, @@ -1594,6 +2184,7 @@ }, "dataFormat": "timeseries", "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -1617,7 +2208,7 @@ "h": 10, "w": 5, "x": 18, - "y": 26 + "y": 34 }, "heatmap": {}, "hideZeroBuckets": false, @@ -1673,11 +2264,13 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "max(redfish_chassis_fan_rpm_percentage) by (server) > 0", + "editorMode": "code", + "expr": "max(redfish_chassis_fan_rpm_percentage{group=\"$group\"}) by (server) > 0", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "", + "range": true, "refId": "A" } ], @@ -1764,7 +2357,7 @@ "h": 10, "w": 12, "x": 0, - "y": 36 + "y": 44 }, "id": 47, "interval": "5m", @@ -1792,7 +2385,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp\"}", + "expr": 
"redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp\"} or redfish_chassis_temperature_celsius{sensor_id=~\".*InletTemp\"} or redfish_chassis_temperature_celsius{sensor=~\".*Inlet.*\"}", "hide": false, "interval": "", "legendFormat": "{{ env }} {{ server }}", @@ -1871,7 +2464,7 @@ "h": 10, "w": 12, "x": 12, - "y": 36 + "y": 44 }, "id": 49, "interval": "1m", @@ -1898,7 +2491,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\"} != 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\"} != 0", "hide": false, "interval": "", "legendFormat": "{{ env }} {{ server }}", @@ -1919,7 +2512,7 @@ "h": 1, "w": 24, "x": 0, - "y": 46 + "y": 54 }, "id": 29, "panels": [], @@ -1964,7 +2557,7 @@ "h": 4, "w": 8, "x": 0, - "y": 47 + "y": 55 }, "id": 19, "options": { @@ -1990,7 +2583,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\", job=\"redfish-exporter\"}", + "expr": "redfish_chassis_power_average_consumed_watts{server=\"$server\"}", "hide": false, "interval": "", "legendFormat": "{{power_voltage}}", @@ -2031,7 +2624,7 @@ "h": 4, "w": 2, "x": 8, - "y": 47 + "y": 55 }, "id": 24, "options": { @@ -2097,7 +2690,7 @@ "h": 4, "w": 2, "x": 10, - "y": 47 + "y": 55 }, "id": 25, "options": { @@ -2164,7 +2757,7 @@ "h": 4, "w": 2, "x": 12, - "y": 47 + "y": 55 }, "id": 26, "options": { @@ -2231,7 +2824,7 @@ "h": 4, "w": 2, "x": 14, - "y": 47 + "y": 55 }, "id": 27, "options": { @@ -2309,7 +2902,7 @@ "h": 18, "w": 8, "x": 16, - "y": 47 + "y": 55 }, "id": 17, "options": { @@ -2333,7 +2926,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum by (sensor) (sum_over_time(redfish_chassis_temperature_celsius{server=~\"oscephpor01\"}[15m])) / sum by (sensor) (count_over_time(redfish_chassis_temperature_celsius{server=~\"oscephpor01\"}[15m])) > 0", + "expr": "sum by (sensor) 
(sum_over_time(redfish_chassis_temperature_celsius{server=\"$server\"}[15m])) / sum by (sensor) (count_over_time(redfish_chassis_temperature_celsius{server=\"$server\"}[15m])) > 0", "format": "table", "instant": true, "interval": "", @@ -2363,11 +2956,15 @@ }, { "datasource": { + "default": false, "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "mappings": [ { "options": { @@ -2411,15 +3008,15 @@ "h": 4, "w": 3, "x": 0, - "y": 51 + "y": 59 }, "id": 6, "maxDataPoints": 100, "options": { - "colorMode": "background", - "graphMode": "none", + "colorMode": "value", + "graphMode": "area", "justifyMode": "auto", - "orientation": "horizontal", + "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ @@ -2439,7 +3036,8 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "redfish_system_power_state{server=~\"$server\"}", + "editorMode": "code", + "expr": "redfish_system_power_state{server=\"$server\"}", "format": "time_series", "instant": true, "interval": "", @@ -2509,7 +3107,7 @@ "h": 4, "w": 5, "x": 3, - "y": 51 + "y": 59 }, "id": 7, "maxDataPoints": 100, @@ -2538,10 +3136,10 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_health{server=~\"$server\", job=\"redfish-exporter\"}", + "expr": "redfish_chassis_health{server=\"$server\"}", "format": "time_series", "instant": true, - "interval": "", + "interval": "30m", "legendFormat": "Chassis {{chassis_id}}", "refId": "A" } @@ -2620,7 +3218,7 @@ "h": 7, "w": 8, "x": 8, - "y": 51 + "y": 59 }, "id": 45, "options": { @@ -2723,7 +3321,7 @@ "h": 7, "w": 8, "x": 8, - "y": 58 + "y": 66 }, "id": 13, "options": { @@ -2750,7 +3348,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\", job=\"redfish-exporter\"}", + "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\"}", "interval": "", 
"legendFormat": "{{power_voltage}}", "range": true, @@ -2761,7 +3359,7 @@ "type": "timeseries" } ], - "refresh": false, + "refresh": "", "schemaVersion": 39, "tags": [], "templating": { @@ -2789,22 +3387,88 @@ { "current": { "selected": true, - "text": "oscomppor04", - "value": "oscomppor04" + "text": "production", + "value": "production" }, "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(redfish_exporter_collector_duration_seconds, server)", + "definition": "label_values(redfish_exporter_collector_duration_seconds,env)", + "hide": 0, + "includeAll": false, + "label": "env", + "multi": false, + "name": "env", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(redfish_exporter_collector_duration_seconds,env)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": true, + "text": "compute", + "value": "compute" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(redfish_exporter_collector_duration_seconds{env=\"$env\"},group)", + "hide": 0, + "includeAll": false, + "label": "group", + "multi": false, + "name": "group", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(redfish_exporter_collector_duration_seconds{env=\"$env\"},group)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "svn1-dr07-u10", + "value": "svn1-dr07-u10" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(redfish_exporter_collector_duration_seconds{group=\"$group\", env=\"$env\"},server)", "hide": 
0, "includeAll": false, "label": "server", "multi": false, "name": "server", "options": [], - "query": "label_values(redfish_exporter_collector_duration_seconds, server)", - "refresh": 1, + "query": { + "qryType": 1, + "query": "label_values(redfish_exporter_collector_duration_seconds{group=\"$group\", env=\"$env\"},server)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2816,7 +3480,7 @@ ] }, "time": { - "from": "now-15m", + "from": "now-12h", "to": "now-1m" }, "timepicker": { @@ -2836,8 +3500,7 @@ }, "timezone": "", "title": "Redfish exporter", - "uid": "b02mElQGX", + "uid": "redfish", "version": 2, "weekStart": "" } -{% endraw %} From 7fcabe20c873ded2149ea08a370c0ddb6f227b80 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 28 Feb 2025 18:12:19 +0000 Subject: [PATCH 03/16] Increase query lookback delta Prometheus will regard metrics collected over this period as stale; as such, they won't show up in Grafana.
--- etc/kayobe/kolla/globals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index c36b659b5c..262c54815b 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -26,7 +26,7 @@ kolla_image_tags: # Monitoring and alerting related settings opensearch_heap_size: 8g -prometheus_cmdline_extras: "--storage.tsdb.retention.time=30d" +prometheus_cmdline_extras: "--storage.tsdb.retention.time=30d --query.lookback-delta=15m" # Additional command line flags for node exporter to enable texfile collector for disk metrics and create textfile docker volume prometheus_node_exporter_extra_volumes: From d4b2607c8913984995d78323100a1e37758eede1 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 28 Feb 2025 18:25:38 +0000 Subject: [PATCH 04/16] Add raw tags --- .../kolla/config/grafana/dashboards/openstack/redfish.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json index e91a08a10f..9aa7b61f00 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json @@ -1,3 +1,4 @@ +{% raw %} { "annotations": { "list": [ @@ -3504,3 +3505,4 @@ "version": 2, "weekStart": "" } +{% endraw %} From b86b28ddb4a17613bd4cbae04f57f5ff6793d394 Mon Sep 17 00:00:00 2001 From: Matt Anson Date: Fri, 7 Mar 2025 15:52:02 +0000 Subject: [PATCH 05/16] Re-tweak Redfish dashboard for Lenovo --- .../grafana/dashboards/openstack/redfish.json | 84 +++++++++---------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json index 9aa7b61f00..7d9d6b4316 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json +++ 
b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json @@ -20,12 +20,12 @@ "fiscalYearStartMonth": 0, "gnetId": 12403, "graphTooltip": 0, - "id": 85, + "id": 30, "links": [], "panels": [ { "datasource": { - "default": false, + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -102,7 +102,7 @@ "refId": "B" } ], - "title": "iDRAC Up", + "title": "Redfish Up", "type": "stat" }, { @@ -163,7 +163,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_system_power_state{group=\"$group\"} == 1)", + "expr": "count(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"} == 1)", "format": "table", "hide": false, "instant": true, @@ -232,7 +232,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_system_power_state{group=\"$group\"} != 1)", + "expr": "count(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"} != 1)", "format": "table", "hide": false, "instant": true, @@ -301,7 +301,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_chassis_health{group=\"$group\"} != 1)", + "expr": "count(redfish_chassis_health{group=\"$group\", job=\"redfish-exporter\" } != 1)", "format": "table", "hide": false, "instant": true, @@ -730,7 +730,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sort_desc(redfish_system_power_state{group=\"$group\"})", + "expr": "sort_desc(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"})", "format": "table", "instant": true, "interval": "", @@ -926,7 +926,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sort(redfish_chassis_health{group=\"$group\"})", + "expr": "sort(redfish_chassis_health{group=\"$group\", job=\"redfish-exporter\"})", "format": "table", "hide": false, "instant": true, @@ -1065,8 +1065,8 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(redfish_chassis_power_average_consumed_watts{group=\"$group\"})", - "legendFormat": "Rack power 
consumption", + "expr": "sum(redfish_chassis_power_average_consumed_watts{group=\"$group\", job=\"redfish-exporter\"})", + "legendFormat": "Total power consumption", "range": true, "refId": "A" } @@ -1168,7 +1168,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(redfish_chassis_power_average_consumed_watts{group=\"$group\"}) by (env)", + "expr": "sum(redfish_chassis_power_average_consumed_watts{group=\"$group\", job=\"redfish-exporter\"}) by (env)", "interval": "", "legendFormat": "{{ env }}", "range": true, @@ -1271,7 +1271,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_system_power_state{group=\"$group\"} == 1) by (env)", + "expr": "count(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"} == 1) by (env)", "hide": false, "interval": "", "legendFormat": "Powered up {{ env }}", @@ -1284,7 +1284,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_system_power_state{group=\"$group\"} == 2) by (env) * -1", + "expr": "count(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"} == 2) by (env) * -1", "hide": true, "interval": "", "legendFormat": "Powered down {{ env }}", @@ -1292,7 +1292,7 @@ "refId": "B" } ], - "title": "Powered ON by Rack", + "title": "Powered ON", "type": "timeseries" }, { @@ -1388,7 +1388,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_system_power_state{group=\"$group\"} == 1) by (env)", + "expr": "count(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"} == 1) by (env)", "hide": true, "interval": "", "legendFormat": "Powered up {{ env }}", @@ -1401,7 +1401,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_system_power_state{group=\"$group\"} == 2) by (env)", + "expr": "count(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"} == 2) by (env)", "hide": false, "interval": "", "legendFormat": "Powered down {{ env }}", @@ -1409,7 +1409,7 @@ "refId": 
"B" } ], - "title": "Powered OFF by Rack", + "title": "Powered OFF", "type": "timeseries" }, { @@ -1508,7 +1508,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp|.*Inlet.*\", group=\"$group\"}) by (env)", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp|.*Inlet.*\", group=\"$group\", job=\"redfish-exporter\"}) by (env)", "hide": false, "interval": "", "legendFormat": "max inlet {{ env }}", @@ -1657,7 +1657,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\"}) by (env)", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\", job=\"redfish-exporter\"}) by (env)", "hide": false, "interval": "", "legendFormat": "{{ env }}", @@ -1805,7 +1805,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor=~\"CPU2 Temp|.*CPU.2.*Temp\", group=\"$group\"}) by (env)", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\"CPU2 Temp|.*CPU.2.*Temp\", group=\"$group\", job=\"redfish-exporter\"}) by (env)", "hide": false, "interval": "", "legendFormat": "{{ env }}", @@ -1907,7 +1907,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp|.*Inlet.*\", group=\"$group\"}", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp|.*Inlet.*\", group=\"$group\", job=\"redfish-exporter\"}", "hide": false, "interval": "", "intervalFactor": 1, @@ -2026,7 +2026,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\"} != 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\", job=\"redfish-exporter\"} != 0", "hide": false, "interval": "", "intervalFactor": 1, @@ -2147,7 +2147,7 @@ 
"uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*2.*Temp\", group=\"$group\"} != 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*2.*Temp\", group=\"$group\", job=\"redfish-exporter\"} != 0", "hide": false, "interval": "", "intervalFactor": 1, @@ -2266,7 +2266,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_fan_rpm_percentage{group=\"$group\"}) by (server) > 0", + "expr": "max(redfish_chassis_fan_rpm_percentage{group=\"$group\", job=\"redfish-exporter\"}) by (server) > 0", "hide": false, "interval": "", "intervalFactor": 1, @@ -2386,7 +2386,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp\"} or redfish_chassis_temperature_celsius{sensor_id=~\".*InletTemp\"} or redfish_chassis_temperature_celsius{sensor=~\".*Inlet.*\"}", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp\", job=\"redfish-exporter\"} or redfish_chassis_temperature_celsius{sensor_id=~\".*InletTemp\", job=\"redfish-exporter\"} or redfish_chassis_temperature_celsius{sensor=~\".*Inlet.*\", job=\"redfish-exporter\"}", "hide": false, "interval": "", "legendFormat": "{{ env }} {{ server }}", @@ -2492,7 +2492,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\"} != 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\", job=\"redfish-exporter\"} != 0", "hide": false, "interval": "", "legendFormat": "{{ env }} {{ server }}", @@ -2584,7 +2584,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_power_average_consumed_watts{server=\"$server\"}", + "expr": "redfish_chassis_power_average_consumed_watts{server=\"$server\", job=\"redfish-exporter\"}", "hide": false, "interval": "", "legendFormat": "{{power_voltage}}", @@ -2614,7 +2614,7 @@ }, { 
"color": "red", - "value": 80 + "value": 1 } ] } @@ -2650,7 +2650,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_storage_drive_state{server=~\"$server\"} != 1) or vector(0)", + "expr": "count(redfish_system_storage_drive_state{server=~\"$server\", job=\"redfish-exporter\"} != 1) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -2658,7 +2658,7 @@ "refId": "A" } ], - "title": "Disk with errors", + "title": "Disks with errors", "type": "gauge" }, { @@ -2680,7 +2680,7 @@ }, { "color": "red", - "value": 80 + "value": 1 } ] } @@ -2716,7 +2716,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_pcie_device_health_state{server=~\"$server\"} != 1) or vector(0)", + "expr": "count(redfish_system_pcie_device_health_state{server=~\"$server\", job=\"redfish-exporter\"} != 1) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -2747,7 +2747,7 @@ }, { "color": "red", - "value": 80 + "value": 1 } ] } @@ -2784,7 +2784,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_chassis_temperature_sensor_state{server=~\"$server\"} > 2) or vector(0)", + "expr": "count(redfish_chassis_temperature_sensor_state{server=~\"$server\", job=\"redfish-exporter\"} > 2) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -2814,7 +2814,7 @@ }, { "color": "red", - "value": 80 + "value": 1 } ] } @@ -2850,7 +2850,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_chassis_power_powersupply_health{server=~\"$server\"} > 1) or vector(0)", + "expr": "count(redfish_chassis_power_powersupply_health{server=~\"$server\", job=\"redfish-exporter\"} > 1) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -2927,7 +2927,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum by (sensor) (sum_over_time(redfish_chassis_temperature_celsius{server=\"$server\"}[15m])) / sum by (sensor) 
(count_over_time(redfish_chassis_temperature_celsius{server=\"$server\"}[15m])) > 0", + "expr": "sum by (sensor) (sum_over_time(redfish_chassis_temperature_celsius{server=\"$server\", job=\"redfish-exporter\"}[15m])) / sum by (sensor) (count_over_time(redfish_chassis_temperature_celsius{server=\"$server\", job=\"redfish-exporter\"}[15m])) > 0", "format": "table", "instant": true, "interval": "", @@ -3038,7 +3038,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_system_power_state{server=\"$server\"}", + "expr": "redfish_system_power_state{server=\"$server\", job=\"redfish-exporter\"}", "format": "time_series", "instant": true, "interval": "", @@ -3137,7 +3137,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_health{server=\"$server\"}", + "expr": "redfish_chassis_health{server=\"$server\", job=\"redfish-exporter\"}", "format": "time_series", "instant": true, "interval": "30m", @@ -3245,7 +3245,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "redfish_chassis_temperature_celsius{server=~\"$server\"}", + "expr": "redfish_chassis_temperature_celsius{server=~\"$server\", job=\"redfish-exporter\"}", "interval": "", "legendFormat": "{{sensor}}", "refId": "A" @@ -3349,7 +3349,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\"}", + "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\", job=\"redfish-exporter\"}", "interval": "", "legendFormat": "{{power_voltage}}", "range": true, @@ -3481,7 +3481,7 @@ ] }, "time": { - "from": "now-12h", + "from": "now-15m", "to": "now-1m" }, "timepicker": { From 15e2cba17778d944648bd0d5dbd70843c68ce062 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:48:10 +0000 Subject: [PATCH 06/16] Restore old UUID of the redfish dashboard The redfish dashboard was updated to use a new UUID, however this caused errors in the 
Grafana logs and Grafana didn't update the dashboard. Reverting this to the old UUID fixes the issue. --- .../kolla/config/grafana/dashboards/openstack/redfish.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json index 7d9d6b4316..81cca89930 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json @@ -3501,8 +3501,8 @@ }, "timezone": "", "title": "Redfish exporter", - "uid": "redfish", - "version": 2, + "uid": "b02mElQGX", + "version": 1, "weekStart": "" } {% endraw %} From a36b66768aaaa9317d0b46888102e892580237b3 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Mon, 10 Mar 2025 16:04:39 +0000 Subject: [PATCH 07/16] Make query lookback delta setting smarter --- .../prometheus/prometheus.yml.d/60-redfish.yml | 2 +- .../prometheus/prometheus.yml.d/70-oscapacity.yml | 2 +- etc/kayobe/stackhpc-monitoring.yml | 10 ++++++++++ ...mps-redfish-exporter-to-v2-11032fb9dde36283.yaml | 13 +++++++++++++ 4 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml index 7f929c4502..ebec44edc1 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml +++ b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml @@ -7,7 +7,7 @@ scrape_configs: collectlogs: ['false'] metrics_path: /redfish scrape_timeout: 300s - scrape_interval: {{ [8 * groups['redfish_exporter_targets'] | length, 300] | max }}s + scrape_interval: {{ redfish_exporter_scrape_interval }}s relabel_configs: - source_labels: [__address__] target_label: __param_target diff --git 
a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml index afed8d9159..6bfc49de7c 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml +++ b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml @@ -7,7 +7,7 @@ scrape_configs: static_configs: - targets: - '{{ kolla_internal_fqdn | put_address_in_context('url') }}:9090' - scrape_interval: 15m + scrape_interval: "{{ stackhpc_os_capacity_scrape_interval }}s" scrape_timeout: 10m {% if kolla_enable_tls_internal | bool %} scheme: https diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index 5eee4b19cf..3323151183 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -1,5 +1,9 @@ --- # StackHPC monitoring configuration +############################################################################### +# Prometheus server configuration + +prometheus_query_lookback_delta: "{{ [redfish_exporter_scrape_interval, stackhpc_os_capacity_scrape_interval, 300] | max }}s" ############################################################################### # Alert configuration @@ -45,6 +49,9 @@ stackhpc_os_capacity_openstack_cacert: "" stackhpc_os_capacity_openstack_verify: true # Redfish exporter +# How often to scrape the os capacity exporter in seconds. +stackhpc_os_capacity_scrape_interval: 900 + # Whether the redfish exporter is enabled. stackhpc_enable_redfish_exporter: false @@ -55,6 +62,9 @@ redfish_exporter_default_password: "{{ ipmi_password }}" # The address of the BMC that is used to query redfish metrics. redfish_exporter_target_address: "{{ ipmi_address }}" +# How often to scrape the BMCs in seconds. 
+redfish_exporter_scrape_interval: "{{ [8 * groups['redfish_exporter_targets'] | length, 300] | max }}" + ############################################################################### # Whether the RADOS gateway usage exporter is enabled. diff --git a/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml b/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml new file mode 100644 index 0000000000..9898b9e4eb --- /dev/null +++ b/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml @@ -0,0 +1,13 @@ +--- +features: + - Upgrades the redfish exporter container image to the v2.x series. + - Adds support for lenovo hardware to the redfish exporter dashboard. + - | + Adds the ``redfish_exporter_scrape_interval``, and + ``stackhpc_os_capacity_scrape_interval`` configuration variables. +fixes: + - | + Sets the prometheus server side option ``query.lookback-delta`` to + the largest scrape interval so that redfish exporter metrics are not + marked stale before the next scrape. + - Fixes various issues with the redfish exporter dashboard. 
From 68b8ea1fecf308abd4c0929c8d88268002786672 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Mon, 10 Mar 2025 16:09:58 +0000 Subject: [PATCH 08/16] Actually set lookback delta --- etc/kayobe/kolla/globals.yml | 2 +- etc/kayobe/stackhpc-monitoring.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index 262c54815b..87339d35a3 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -26,7 +26,7 @@ kolla_image_tags: # Monitoring and alerting related settings opensearch_heap_size: 8g -prometheus_cmdline_extras: "--storage.tsdb.retention.time=30d --query.lookback-delta=15m" +prometheus_cmdline_extras: "--storage.tsdb.retention.time=30d --query.lookback-delta={{ stackhpc_prometheus_query_lookback_delta }}" # Additional command line flags for node exporter to enable texfile collector for disk metrics and create textfile docker volume prometheus_node_exporter_extra_volumes: diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index 3323151183..927f57ce97 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -3,7 +3,7 @@ ############################################################################### # Prometheus server configuration -prometheus_query_lookback_delta: "{{ [redfish_exporter_scrape_interval, stackhpc_os_capacity_scrape_interval, 300] | max }}s" +stackhpc_prometheus_query_lookback_delta: "{{ [redfish_exporter_scrape_interval, stackhpc_os_capacity_scrape_interval, 300] | max }}s" ############################################################################### # Alert configuration From cf085b918dbf15f1f4b9be52cbc3e931054c9017 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Mon, 10 Mar 2025 17:50:27 +0000 Subject: [PATCH 09/16] Adds openstack exporter scrape interval --- .../kolla/config/prometheus/prometheus.rules | 2 +- etc/kayobe/kolla/globals.yml | 2 ++ 
etc/kayobe/stackhpc-monitoring.yml | 14 +++++++++++--- ...ps-redfish-exporter-to-v2-11032fb9dde36283.yaml | 6 ++++-- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.rules b/etc/kayobe/kolla/config/prometheus/prometheus.rules index 20e1b303a4..44d2898400 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.rules +++ b/etc/kayobe/kolla/config/prometheus/prometheus.rules @@ -7,7 +7,7 @@ groups: rules: - alert: PrometheusTargetMissing - expr: up{job!="redfish-exporter-seed"} == 0 + expr: up{job!="redfish-exporter-seed", job!="redfish-exporter-collectlog"} == 0 for: 5m labels: severity: critical diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index 87339d35a3..04be96f0eb 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -56,3 +56,5 @@ prometheus_blackbox_exporter_endpoints_kayobe: - endpoints: - "pulp:http_2xx:{{ pulp_url }}/pulp/api/v3/status/" enabled: "{{ seed_pulp_container_enabled | bool }}" + +prometheus_openstack_exporter_interval: "{{ stackhpc_prometheus_openstack_exporter_interval }}s" diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index 927f57ce97..994320c132 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -3,7 +3,12 @@ ############################################################################### # Prometheus server configuration -stackhpc_prometheus_query_lookback_delta: "{{ [redfish_exporter_scrape_interval, stackhpc_os_capacity_scrape_interval, 300] | max }}s" +# How far prometheus will look back in time to find a metric. 
+stackhpc_prometheus_query_lookback_delta: >- + {{ [stackhpc_redfish_exporter_scrape_interval | int, + stackhpc_os_capacity_scrape_interval | int, + stackhpc_prometheus_openstack_exporter_interval | int, + 300] | max + 30 }}s ############################################################################### # Alert configuration @@ -55,6 +60,9 @@ stackhpc_os_capacity_scrape_interval: 900 # Whether the redfish exporter is enabled. stackhpc_enable_redfish_exporter: false +# How often to scrape the BMCs in seconds. +stackhpc_redfish_exporter_scrape_interval: "{{ [8 * groups['redfish_exporter_targets'] | length, 300] | max }}" + # Credentials redfish_exporter_default_username: "{{ ipmi_username }}" redfish_exporter_default_password: "{{ ipmi_password }}" @@ -62,8 +70,8 @@ redfish_exporter_default_password: "{{ ipmi_password }}" # The address of the BMC that is used to query redfish metrics. redfish_exporter_target_address: "{{ ipmi_address }}" -# How often to scrape the BMCs in seconds. -redfish_exporter_scrape_interval: "{{ [8 * groups['redfish_exporter_targets'] | length, 300] | max }}" +# How often to scrape OpenStack Exporter in seconds. +stackhpc_prometheus_openstack_exporter_interval: 300 ############################################################################### diff --git a/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml b/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml index 9898b9e4eb..cf4af3d538 100644 --- a/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml +++ b/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml @@ -3,8 +3,10 @@ features: - Upgrades the redfish exporter container image to the v2.x series. - Adds support for lenovo hardware to the redfish exporter dashboard. - | - Adds the ``redfish_exporter_scrape_interval``, and - ``stackhpc_os_capacity_scrape_interval`` configuration variables. 
+ Adds the ``stackhpc_redfish_exporter_scrape_interval``, + ``stackhpc_os_capacity_scrape_interval``, and + ``stackhpc_prometheus_openstack_exporter_interval`` + configuration variables. fixes: - | Sets the prometheus server side option ``query.lookback-delta`` to From 27dfdc4f74b5becf417acf5d6544bdf89728fa3b Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Tue, 11 Mar 2025 17:17:10 +0000 Subject: [PATCH 10/16] Add some raw tags --- .../kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml index 6bfc49de7c..145eac9e92 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml +++ b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml @@ -7,7 +7,7 @@ scrape_configs: static_configs: - targets: - '{{ kolla_internal_fqdn | put_address_in_context('url') }}:9090' - scrape_interval: "{{ stackhpc_os_capacity_scrape_interval }}s" + scrape_interval: {% endraw %}"{{ stackhpc_os_capacity_scrape_interval }}s"{% raw %} scrape_timeout: 10m {% if kolla_enable_tls_internal | bool %} scheme: https From 8f3ac48a9aed0c9ebb0099b6497f361ad7fc1867 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Tue, 11 Mar 2025 17:19:39 +0000 Subject: [PATCH 11/16] Slightly less offensive variant on raw tags --- .../config/prometheus/prometheus.yml.d/70-oscapacity.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml index 145eac9e92..7c868732da 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml +++ b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml @@ -7,8 +7,10 @@ scrape_configs: static_configs: - targets: - '{{ kolla_internal_fqdn |
put_address_in_context('url') }}:9090' - scrape_interval: {% endraw %}"{{ stackhpc_os_capacity_scrape_interval }}s"{% raw %} +{% endraw %} + scrape_interval: "{{ stackhpc_os_capacity_scrape_interval }}s" scrape_timeout: 10m +{% raw %} {% if kolla_enable_tls_internal | bool %} scheme: https {% endif %} From a73dd627f58b1c2d8e911df021e2a1d1905c993c Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Wed, 12 Mar 2025 14:52:09 +0000 Subject: [PATCH 12/16] Refresh release note --- .../bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml b/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml index cf4af3d538..3372f052bf 100644 --- a/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml +++ b/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml @@ -10,6 +10,10 @@ features: fixes: - | Sets the prometheus server side option ``query.lookback-delta`` to - the largest scrape interval so that redfish exporter metrics are not - marked stale before the next scrape. + the largest scrape interval so that metrics from exporters + with large scrape intervals are not marked stale before the next scrape. - Fixes various issues with the redfish exporter dashboard. +upgrade: + - | + Increases default ``os_capacity_scrape_interval`` to ``5m``. If you already customise + this, please move to the new ``stackhpc_os_capacity_scrape_interval`` variable.
From ba7934ae05d10d4042362727139f64b4fd37f925 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Wed, 12 Mar 2025 16:16:37 +0000 Subject: [PATCH 13/16] Update dashboard --- .../grafana/dashboards/openstack/redfish.json | 828 ++++++++++-------- 1 file changed, 454 insertions(+), 374 deletions(-) diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json index 81cca89930..6ba67ad27f 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/redfish.json @@ -20,12 +20,38 @@ "fiscalYearStartMonth": 0, "gnetId": 12403, "graphTooltip": 0, - "id": 30, + "id": 91, "links": [], "panels": [ + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 31, + "panels": [], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Overview", + "type": "row" + }, { "datasource": { - "default": false, + "default": false, "type": "prometheus", "uid": "${datasource}" }, @@ -52,7 +78,7 @@ "h": 3, "w": 4, "x": 0, - "y": 0 + "y": 1 }, "id": 53, "options": { @@ -80,7 +106,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(up{job=\"redfish-exporter\"} == 1)", + "expr": "count(redfish_system_power_state{group=\"$group\", job!=\"redfish-exporter-collectlog\"})", "format": "table", "hide": false, "instant": true, @@ -134,7 +160,7 @@ "h": 3, "w": 4, "x": 4, - "y": 0 + "y": 1 }, "id": 54, "interval": "30m", @@ -163,7 +189,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"} == 1)", + "expr": "count(redfish_system_power_state{group=\"$group\", job!=\"redfish-exporter-collectlog\"} == 1)", "format": "table", "hide": false, "instant": true, @@ -204,7 +230,7 @@ "h": 3, 
"w": 4, "x": 8, - "y": 0 + "y": 1 }, "id": 55, "options": { @@ -232,7 +258,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"} != 1)", + "expr": "count(redfish_system_power_state{group=\"$group\", job!=\"redfish-exporter-collectlog\"} != 1)", "format": "table", "hide": false, "instant": true, @@ -262,7 +288,7 @@ }, { "color": "red", - "value": 80 + "value": 1 } ] } @@ -273,7 +299,7 @@ "h": 3, "w": 4, "x": 12, - "y": 0 + "y": 1 }, "id": 56, "options": { @@ -301,7 +327,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_chassis_health{group=\"$group\", job=\"redfish-exporter\" } != 1)", + "expr": "count(redfish_chassis_health{group=\"$group\", job!=\"redfish-exporter-collectlog\" } != 1)", "format": "table", "hide": false, "instant": true, @@ -313,75 +339,6 @@ "title": "Unhealthy Nodes", "type": "stat" }, - { - "datasource": { - "default": false, - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 16, - "y": 0 - }, - "id": 57, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "", - "values": true - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "editorMode": "code", - "expr": "count(redfish_logservices_entry_count{name=\"SEL Log Service\", severity!=\"OK\", group=\"$group\"} != 0)", - "format": "table", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": 
"", - "refId": "A" - } - ], - "title": "Nodes with SEL Logs", - "type": "stat" - }, { "datasource": { "default": false, @@ -694,6 +651,18 @@ "id": "custom.align" } ] + }, + { + "matcher": { + "id": "byName", + "options": "Power state" + }, + "properties": [ + { + "id": "custom.width", + "value": 197 + } + ] } ] }, @@ -701,7 +670,7 @@ "h": 8, "w": 12, "x": 0, - "y": 3 + "y": 4 }, "id": 59, "options": { @@ -730,7 +699,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sort_desc(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"})", + "expr": "sort_desc(redfish_system_power_state{group=\"$group\", job!=\"redfish-exporter-collectlog\"})", "format": "table", "instant": true, "interval": "", @@ -788,7 +757,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -846,7 +816,8 @@ "mode": "absolute", "steps": [ { - "color": "rgba(50, 172, 45, 0.97)" + "color": "rgba(50, 172, 45, 0.97)", + "value": null }, { "color": "#EAB839", @@ -896,7 +867,7 @@ "h": 8, "w": 12, "x": 12, - "y": 3 + "y": 4 }, "id": 61, "interval": "", @@ -926,7 +897,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sort(redfish_chassis_health{group=\"$group\", job=\"redfish-exporter\"})", + "expr": "sort(redfish_chassis_health{group=\"$group\", job!=\"redfish-exporter-collectlog\"})", "format": "table", "hide": false, "instant": true, @@ -946,221 +917,174 @@ ], "type": "table" }, - { - "collapsed": false, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 11 - }, - "id": 31, - "panels": [], - "targets": [ - { - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "refId": "A" - } - ], - "title": "Overview", - "type": "row" - }, { "datasource": { "default": false, "type": "prometheus", "uid": "${datasource}" }, + "description": "", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - 
"axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "align": "auto", + "cellOptions": { + "type": "auto" }, - "thresholdsStyle": { - "mode": "off" - } + "inspect": false }, - "links": [], - "mappings": [], + "mappings": [ + { + "options": { + "1": { + "index": 1, + "text": "Healthy" + }, + "2": { + "index": 2, + "text": "Warning" + }, + "3": { + "index": 0, + "text": "Critical" + } + }, + "type": "value" + } + ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/__name__|chassis_id|Time|env|job|resource|instance/" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" }, { - "color": "red", - "value": 80 + "id": "custom.hidden", + "value": true + }, + { + "id": "custom.align" } ] }, - "unit": "watt" - }, - "overrides": [] - }, - "gridPos": { - "h": 11, - "w": 7, - "x": 0, - "y": 12 - }, - "id": 36, - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "pluginVersion": "7.1.5", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "editorMode": "code", - "expr": "sum(redfish_chassis_power_average_consumed_watts{group=\"$group\", job=\"redfish-exporter\"})", - "legendFormat": "Total power consumption", - "range": true, - "refId": "A" - } - ], - "title": 
"Power Consumption", - "type": "timeseries" - }, - { - "datasource": { - "default": false, - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + { + "matcher": { + "id": "byName", + "options": "Value" }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ + "properties": [ { - "color": "green", - "value": null + "id": "displayName", + "value": "Status" }, { - "color": "red", - "value": 80 + "id": "unit", + "value": "" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + }, + { + "id": "custom.align" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "#EAB839", + "value": 2 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 3 + } + ] + } } ] }, - "unit": "watt" - }, - "overrides": [] + { + "matcher": { + "id": "byName", + "options": "server" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "", + "url": "https://${__data.fields.instance}" + } + ] + }, + { + "id": "custom.align" + } + ] + } + ] }, "gridPos": { "h": 11, - "w": 6, - "x": 7, + "w": 12, + "x": 0, "y": 12 }, - 
"id": 44, + "id": 62, + "interval": "", "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "show": false }, - "tooltip": { - "mode": "multi", - "sort": "none" - } + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Status" + } + ] }, - "pluginVersion": "7.1.5", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1168,15 +1092,45 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(redfish_chassis_power_average_consumed_watts{group=\"$group\", job=\"redfish-exporter\"}) by (env)", + "expr": "max(last_over_time({__name__=~\"redfish_chassis_log_entry_severity_state|redfish_manager_log_entry_severity_state\", group=\"$group\"}[1h])) by (group, server, log_service_id, instance)", + "format": "table", + "hide": false, + "instant": true, "interval": "", - "legendFormat": "{{ env }}", - "range": true, + "legendFormat": "", "refId": "A" } ], - "title": "Power Consumption", - "type": "timeseries" + "title": "Log severity summary", + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "includeByName": {}, + "indexByName": { + "Time": 0, + "Value": 4, + "group": 1, + "instance": 5, + "log_service_id": 3, + "server": 2 + }, + "renameByName": { + "Value": "Status", + "log_service_id": "Log Service", + "server": "Server" + } + } + } + ], + "type": "table" }, { "datasource": { @@ -1237,20 +1191,21 @@ } ] }, - "unit": "short" + "unit": "watt" }, "overrides": [] }, "gridPos": { "h": 11, "w": 6, - "x": 13, + "x": 12, "y": 12 }, - "id": 42, + "id": 44, "options": { "legend": { "calcs": [ + "mean", "lastNotNull", "max" ], @@ -1271,28 +1226,14 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": 
"count(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"} == 1) by (env)", - "hide": false, + "expr": "sum(redfish_chassis_power_average_consumed_watts{group=\"$group\", job!=\"redfish-exporter-collectlog\"}) by (env)", "interval": "", - "legendFormat": "Powered up {{ env }}", + "legendFormat": "{{ env }}", "range": true, "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "editorMode": "code", - "expr": "count(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"} == 2) by (env) * -1", - "hide": true, - "interval": "", - "legendFormat": "Powered down {{ env }}", - "range": true, - "refId": "B" } ], - "title": "Powered ON", + "title": "Power consumption", "type": "timeseries" }, { @@ -1360,11 +1301,11 @@ }, "gridPos": { "h": 11, - "w": 5, - "x": 19, + "w": 6, + "x": 18, "y": 12 }, - "id": 43, + "id": 42, "options": { "legend": { "calcs": [ @@ -1388,8 +1329,8 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"} == 1) by (env)", - "hide": true, + "expr": "count(redfish_system_power_state{group=\"$group\", job!=\"redfish-exporter-collectlog\"} == 1) by (env)", + "hide": false, "interval": "", "legendFormat": "Powered up {{ env }}", "range": true, @@ -1401,15 +1342,15 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_system_power_state{group=\"$group\", job=\"redfish-exporter\"} == 2) by (env)", - "hide": false, + "expr": "count(redfish_system_power_state{group=\"$group\", job!=\"redfish-exporter-collectlog\"} == 2) by (env) * -1", + "hide": true, "interval": "", "legendFormat": "Powered down {{ env }}", "range": true, "refId": "B" } ], - "title": "Powered OFF", + "title": "Powered status", "type": "timeseries" }, { @@ -1508,7 +1449,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp|.*Inlet.*\", 
group=\"$group\", job=\"redfish-exporter\"}) by (env)", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp|.*Inlet.*\", group=\"$group\", job!=\"redfish-exporter-collectlog\"}) by (env)", "hide": false, "interval": "", "legendFormat": "max inlet {{ env }}", @@ -1657,7 +1598,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\", job=\"redfish-exporter\"}) by (env)", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\", job!=\"redfish-exporter-collectlog\"}) by (env)", "hide": false, "interval": "", "legendFormat": "{{ env }}", @@ -1805,7 +1746,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_temperature_celsius{sensor=~\"CPU2 Temp|.*CPU.2.*Temp\", group=\"$group\", job=\"redfish-exporter\"}) by (env)", + "expr": "max(redfish_chassis_temperature_celsius{sensor=~\"CPU2 Temp|.*CPU.2.*Temp\", group=\"$group\", job!=\"redfish-exporter-collectlog\"}) by (env)", "hide": false, "interval": "", "legendFormat": "{{ env }}", @@ -1848,7 +1789,7 @@ }, "gridPos": { "h": 10, - "w": 5, + "w": 6, "x": 0, "y": 34 }, @@ -1907,7 +1848,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp|.*Inlet.*\", group=\"$group\", job=\"redfish-exporter\"}", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp|.*Inlet.*\", group=\"$group\", job!=\"redfish-exporter-collectlog\"}", "hide": false, "interval": "", "intervalFactor": 1, @@ -1966,7 +1907,7 @@ "gridPos": { "h": 10, "w": 6, - "x": 5, + "x": 6, "y": 34 }, "heatmap": {}, @@ -2026,7 +1967,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\", job=\"redfish-exporter\"} != 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\", 
job!=\"redfish-exporter-collectlog\"} != 0", "hide": false, "interval": "", "intervalFactor": 1, @@ -2087,7 +2028,7 @@ "gridPos": { "h": 10, "w": 6, - "x": 11, + "x": 12, "y": 34 }, "heatmap": {}, @@ -2147,7 +2088,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*2.*Temp\", group=\"$group\", job=\"redfish-exporter\"} != 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*2.*Temp\", group=\"$group\", job!=\"redfish-exporter-collectlog\"} != 0", "hide": false, "interval": "", "intervalFactor": 1, @@ -2207,7 +2148,7 @@ }, "gridPos": { "h": 10, - "w": 5, + "w": 6, "x": 18, "y": 34 }, @@ -2266,7 +2207,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "max(redfish_chassis_fan_rpm_percentage{group=\"$group\", job=\"redfish-exporter\"}) by (server) > 0", + "expr": "max(redfish_chassis_fan_rpm_percentage{group=\"$group\", job!=\"redfish-exporter-collectlog\"}) by (server) > 0", "hide": false, "interval": "", "intervalFactor": 1, @@ -2341,8 +2282,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2386,7 +2326,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp\", job=\"redfish-exporter\"} or redfish_chassis_temperature_celsius{sensor_id=~\".*InletTemp\", job=\"redfish-exporter\"} or redfish_chassis_temperature_celsius{sensor=~\".*Inlet.*\", job=\"redfish-exporter\"}", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*Ambient.*Temp\", job!=\"redfish-exporter-collectlog\"} or redfish_chassis_temperature_celsius{sensor_id=~\".*InletTemp\", job!=\"redfish-exporter-collectlog\"} or redfish_chassis_temperature_celsius{sensor=~\".*Inlet.*\", job!=\"redfish-exporter-collectlog\"}", "hide": false, "interval": "", "legendFormat": "{{ env }} {{ server }}", @@ -2448,8 +2388,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": 
null + "color": "green" }, { "color": "red", @@ -2492,7 +2431,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\", job=\"redfish-exporter\"} != 0", + "expr": "redfish_chassis_temperature_celsius{sensor=~\".*CPU.*1.*Temp\", group=\"$group\", job!=\"redfish-exporter-collectlog\"} != 0", "hide": false, "interval": "", "legendFormat": "{{ env }} {{ server }}", @@ -2545,8 +2484,7 @@ "mode": "percentage", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -2584,7 +2522,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_power_average_consumed_watts{server=\"$server\", job=\"redfish-exporter\"}", + "expr": "redfish_chassis_power_average_consumed_watts{server=\"$server\", job!=\"redfish-exporter-collectlog\"}", "hide": false, "interval": "", "legendFormat": "{{power_voltage}}", @@ -2609,8 +2547,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2650,7 +2587,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_storage_drive_state{server=~\"$server\", job=\"redfish-exporter\"} != 1) or vector(0)", + "expr": "count(redfish_system_storage_drive_state{server=~\"$server\", job!=\"redfish-exporter-collectlog\"} != 1) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -2675,8 +2612,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2716,7 +2652,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_system_pcie_device_health_state{server=~\"$server\", job=\"redfish-exporter\"} != 1) or vector(0)", + "expr": "count(redfish_system_pcie_device_health_state{server=~\"$server\", job!=\"redfish-exporter-collectlog\"} != 1) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -2742,8 +2678,7 @@ "mode": "absolute", "steps": 
[ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2784,7 +2719,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(redfish_chassis_temperature_sensor_state{server=~\"$server\", job=\"redfish-exporter\"} > 2) or vector(0)", + "expr": "count(redfish_chassis_temperature_sensor_state{server=~\"$server\", job!=\"redfish-exporter-collectlog\"} > 2) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -2809,8 +2744,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2850,7 +2784,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "count(redfish_chassis_power_powersupply_health{server=~\"$server\", job=\"redfish-exporter\"} > 1) or vector(0)", + "expr": "count(redfish_chassis_power_powersupply_health{server=~\"$server\", job!=\"redfish-exporter-collectlog\"} > 1) or vector(0)", "format": "table", "instant": true, "interval": "", @@ -2875,7 +2809,7 @@ "custom": { "align": "auto", "cellOptions": { - "type": "color-text" + "type": "auto" }, "inspect": false }, @@ -2886,18 +2820,32 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", - "value": 70 + "value": 80 } ] }, "unit": "degree" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Temperature" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "color-background" + } + } + ] + } + ] }, "gridPos": { "h": 18, @@ -2916,7 +2864,13 @@ ], "show": false }, - "showHeader": true + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Temperature" + } + ] }, "pluginVersion": "11.2.0", "targets": [ @@ -2927,7 +2881,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum by (sensor) (sum_over_time(redfish_chassis_temperature_celsius{server=\"$server\", job=\"redfish-exporter\"}[15m])) / sum by (sensor) 
(count_over_time(redfish_chassis_temperature_celsius{server=\"$server\", job=\"redfish-exporter\"}[15m])) > 0", + "expr": "sum by (sensor) (sum_over_time(redfish_chassis_temperature_celsius{server=\"$server\", job!=\"redfish-exporter-collectlog\"}[15m])) / sum by (sensor) (count_over_time(redfish_chassis_temperature_celsius{server=\"$server\", job!=\"redfish-exporter-collectlog\"}[15m])) > 0", "format": "table", "instant": true, "interval": "", @@ -2988,8 +2942,7 @@ "mode": "absolute", "steps": [ { - "color": "#d44a3a", - "value": null + "color": "#d44a3a" }, { "color": "#299c46", @@ -3038,7 +2991,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_system_power_state{server=\"$server\", job=\"redfish-exporter\"}", + "expr": "redfish_system_power_state{server=\"$server\", job!=\"redfish-exporter-collectlog\"}", "format": "time_series", "instant": true, "interval": "", @@ -3051,31 +3004,24 @@ }, { "datasource": { - "default": false, "type": "prometheus", "uid": "${datasource}" }, + "description": "", "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "mappings": [ { "options": { "1": { "text": "OK" - } - }, - "type": "value" - }, - { - "options": { + }, "2": { "text": "WARNING" - } - }, - "type": "value" - }, - { - "options": { + }, "3": { "text": "CRITICAL" } @@ -3087,8 +3033,7 @@ "mode": "absolute", "steps": [ { - "color": "#299c46", - "value": null + "color": "#299c46" }, { "color": "#299c46", @@ -3110,23 +3055,23 @@ "x": 3, "y": 59 }, - "id": 7, + "id": 66, "maxDataPoints": 100, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", - "orientation": "horizontal", + "orientation": "vertical", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ - "lastNotNull" + "max" ], - "fields": "", + "fields": "/^Value$/", "values": false }, "showPercentChange": false, - "textMode": "auto", + "textMode": "value", "wideLayout": true }, "pluginVersion": "11.2.0", @@ -3137,11 +3082,13 @@ "uid": 
"${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_health{server=\"$server\", job=\"redfish-exporter\"}", - "format": "time_series", + "exemplar": false, + "expr": "redfish_chassis_health{server=~\"$server\"}", + "format": "table", "instant": true, - "interval": "30m", - "legendFormat": "Chassis {{chassis_id}}", + "interval": "", + "legendFormat": "{{chassis_id}}", + "range": false, "refId": "A" } ], @@ -3198,8 +3145,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "orange", @@ -3245,7 +3191,7 @@ "type": "prometheus", "uid": "${datasource}" }, - "expr": "redfish_chassis_temperature_celsius{server=~\"$server\", job=\"redfish-exporter\"}", + "expr": "redfish_chassis_temperature_celsius{server=~\"$server\", job!=\"redfish-exporter-collectlog\"}", "interval": "", "legendFormat": "{{sensor}}", "refId": "A" @@ -3254,6 +3200,141 @@ "title": "Temperatures", "type": "timeseries" }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "color-background" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46" + }, + { + "color": "#299c46", + "value": 1 + }, + { + "color": "#d44a3a", + "value": 2 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "1": { + "color": "dark-green", + "index": 0, + "text": "OK" + }, + "2": { + "color": "dark-yellow", + "index": 1, + "text": "WARNING" + }, + "3": { + "color": "dark-red", + "index": 2, + "text": "CRITICAL" + } + }, + "type": "value" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 0, + "y": 63 + }, + "id": 7, + "maxDataPoints": 100, + 
"options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": true, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "redfish_chassis_health{server=\"$server\", job!=\"redfish-exporter-collectlog\"}", + "format": "table", + "instant": true, + "interval": "30m", + "legendFormat": "Chassis {{chassis_id}}", + "refId": "A" + } + ], + "title": "Health by Chassis", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "env": true, + "group": true, + "instance": true, + "job": true, + "resource": true, + "server": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": {} + } + } + ], + "type": "table" + }, { "datasource": { "default": false, @@ -3305,8 +3386,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3349,7 +3429,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\", job=\"redfish-exporter\"}", + "expr": "redfish_chassis_power_average_consumed_watts{server=~\"$server\", job!=\"redfish-exporter-collectlog\"}", "interval": "", "legendFormat": "{{power_voltage}}", "range": true, @@ -3387,7 +3467,7 @@ }, { "current": { - "selected": true, + "selected": false, "text": "production", "value": "production" }, @@ -3418,7 +3498,7 @@ }, { "current": { - "selected": true, + "selected": false, "text": "compute", "value": "compute" }, @@ -3481,8 +3561,8 @@ ] }, "time": { - "from": "now-15m", - "to": "now-1m" + "from": "now-30m", + "to": "now-5m" }, "timepicker": { "nowDelay": "1m", @@ -3501,8 +3581,8 @@ }, "timezone": "", "title": "Redfish exporter", - "uid": "b02mElQGX", - "version": 1, + "uid": "redfish", + 
"version": 3, "weekStart": "" } {% endraw %} From 4809f3f6307fc77220ecb4f7db8cc84e9db78539 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Thu, 1 May 2025 09:52:44 +0100 Subject: [PATCH 14/16] Apply suggestions from code review Co-authored-by: Alex-Welsh <112560678+Alex-Welsh@users.noreply.github.com> --- .../kolla/config/prometheus/prometheus.yml.d/60-redfish.yml | 2 +- .../notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml index ebec44edc1..09994e2d20 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml +++ b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml @@ -7,7 +7,7 @@ scrape_configs: collectlogs: ['false'] metrics_path: /redfish scrape_timeout: 300s - scrape_interval: {{ redfish_exporter_scrape_interval }}s + scrape_interval: {{ stackhpc_redfish_exporter_scrape_interval }}s relabel_configs: - source_labels: [__address__] target_label: __param_target diff --git a/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml b/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml index 3372f052bf..0f949eb65c 100644 --- a/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml +++ b/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml @@ -10,8 +10,8 @@ features: fixes: - | Sets the prometheus server side option ``query.lookback-delta`` to - the largest scrape interval so that metrics are not from exporters - with large scrape intervals are not marked stale before the next scrape. + the largest scrape interval so that metrics from exporters with large + scrape intervals are not marked stale before the next scrape. - Fixes various issues with the redfish exporter dashboard. 
upgrade: - | From 110afe4fd1805f623ac3920973a2733f7ee66b3a Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Thu, 1 May 2025 09:53:14 +0100 Subject: [PATCH 15/16] Bump to redfish exporter v2.1.1-stackhpc --- etc/kayobe/seed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/seed.yml b/etc/kayobe/seed.yml index fd8fd298e8..4542c9bbc6 100644 --- a/etc/kayobe/seed.yml +++ b/etc/kayobe/seed.yml @@ -145,7 +145,7 @@ seed_redfish_exporter_container: image: ghcr.io/stackhpc/redfish-exporter pre: "{{ kayobe_config_path }}/containers/redfish_exporter/pre.yml" post: "{{ kayobe_config_path }}/containers/redfish_exporter/post.yml" - tag: "v2.0.0-stackhpc" + tag: "v2.1.1-stackhpc" network_mode: host command: redfish_exporter --config.file /redfish_exporter.yml volumes: "/opt/kayobe/containers/redfish_exporter/redfish_exporter.yml:/redfish_exporter.yml:ro" From 7c5935b9f0486922af22ec306b03300b367a0a16 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Thu, 1 May 2025 09:57:47 +0100 Subject: [PATCH 16/16] Temporarily remove redfish log collection This isn't working yet (takes far too long to scrape); let's split it out into a separate patch. Also removed the query lookback delta config, as this was working around the much longer scrape times.
--- .../kolla/config/prometheus/prometheus.rules | 2 +- .../prometheus.yml.d/60-redfish.yml | 22 ------------------- etc/kayobe/kolla/globals.yml | 2 +- etc/kayobe/stackhpc-monitoring.yml | 9 -------- ...dfish-exporter-to-v2-11032fb9dde36283.yaml | 4 ---- 5 files changed, 2 insertions(+), 37 deletions(-) diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.rules b/etc/kayobe/kolla/config/prometheus/prometheus.rules index 44d2898400..20e1b303a4 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.rules +++ b/etc/kayobe/kolla/config/prometheus/prometheus.rules @@ -7,7 +7,7 @@ groups: rules: - alert: PrometheusTargetMissing - expr: up{job!="redfish-exporter-seed", job!="redfish-exporter-collectlog"} == 0 + expr: up{job!="redfish-exporter-seed"} == 0 for: 5m labels: severity: critical diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml index 09994e2d20..54a8ae2e55 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml +++ b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/60-redfish.yml @@ -23,27 +23,5 @@ scrape_configs: server: '{{ host }}' env: "{{ kayobe_environment | default('openstack') }}" group: "{{ hostvars[host]['redfish_exporter_scrape_group'] | default('overcloud') }}" -{% endfor %} - - job_name: redfish-exporter-collectlog - params: - collectlogs: ['true'] - metrics_path: /redfish - scrape_timeout: 1200s - scrape_interval: 3600s - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: "{{ lookup('vars', admin_oc_net_name ~ '_ips')[groups.seed.0] }}:9610" - static_configs: -{% for host in groups.get('redfish_exporter_targets', []) %} - - targets: - - '{{ hostvars[host]["redfish_exporter_target_address"] }}' - labels: - server: '{{ host }}' - env: "{{ kayobe_environment | default('openstack') 
}}" - group: "{{ hostvars[host]['redfish_exporter_scrape_group'] | default('overcloud') }}" {% endfor %} {% endif %} diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index 04be96f0eb..85778eaac3 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -26,7 +26,7 @@ kolla_image_tags: # Monitoring and alerting related settings opensearch_heap_size: 8g -prometheus_cmdline_extras: "--storage.tsdb.retention.time=30d --query.lookback-delta={{ stackhpc_prometheus_query_lookback_delta }}" +prometheus_cmdline_extras: "--storage.tsdb.retention.time=30d" # Additional command line flags for node exporter to enable texfile collector for disk metrics and create textfile docker volume prometheus_node_exporter_extra_volumes: diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index 994320c132..831486d10c 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -1,14 +1,5 @@ --- # StackHPC monitoring configuration -############################################################################### -# Prometheus server configuration - -# How far prometheus will look back in time to find a metric. -stackhpc_prometheus_query_lookback_delta: >- - {{ [stackhpc_redfish_exporter_scrape_interval | int, - stackhpc_os_capacity_scrape_interval | int, - stackhpc_prometheus_openstack_exporter_interval | int, - 300] | max + 30 }}s ############################################################################### # Alert configuration diff --git a/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml b/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml index 0f949eb65c..ddf1d58eb5 100644 --- a/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml +++ b/releasenotes/notes/bumps-redfish-exporter-to-v2-11032fb9dde36283.yaml @@ -8,10 +8,6 @@ features: ``stackhpc_prometheus_openstack_exporter_interval`` configuration variables. 
fixes: - - | - Sets the prometheus server side option ``query.lookback-delta`` to - the largest scrape interval so that metrics from exporters with large - scrape intervals are not marked stale before the next scrape. - Fixes various issues with the redfish exporter dashboard. upgrade: - |