From d628abc0e4bc33da6776ae758853ac223eab4878 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Fri, 12 Dec 2025 10:42:31 +0000 Subject: [PATCH] Add support for NVMe drives in smartmon.py Adds support for collecting SMART metrics from NVMe drives with the use of pysmart and smartctl JSON output. Includes updates to the deployment playbooks, tests, and dashboards. --- .../ansible/deployment/get-nvme-drives.yml | 25 +- .../ansible/deployment/smartmon-tools.yml | 25 +- .../ansible/scripts/generate_fixtures.py | 52 ++++ etc/kayobe/ansible/scripts/nvmemon.sh | 150 ---------- etc/kayobe/ansible/scripts/smartmon.py | 264 ++++++++++++++++- etc/kayobe/ansible/scripts/test_smartmon.py | 203 ++++++++++--- .../scripts/tests/Dell_ENT_NVMe_CM6.json | 26 -- .../scripts/tests/INTEL_SSDPE2KX010T8.json | 133 +++++++++ etc/kayobe/ansible/scripts/tests/nvme.json | 24 -- .../openstack/hardware_overview.json | 231 ++++++--------- .../grafana/dashboards/openstack/nvme.json | 271 +++++++++++++----- 11 files changed, 933 insertions(+), 471 deletions(-) delete mode 100644 etc/kayobe/ansible/scripts/nvmemon.sh delete mode 100644 etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json create mode 100644 etc/kayobe/ansible/scripts/tests/INTEL_SSDPE2KX010T8.json delete mode 100644 etc/kayobe/ansible/scripts/tests/nvme.json diff --git a/etc/kayobe/ansible/deployment/get-nvme-drives.yml b/etc/kayobe/ansible/deployment/get-nvme-drives.yml index 1d2404d805..9671c0c208 100644 --- a/etc/kayobe/ansible/deployment/get-nvme-drives.yml +++ b/etc/kayobe/ansible/deployment/get-nvme-drives.yml @@ -3,16 +3,31 @@ hosts: overcloud gather_facts: no tasks: - - name: Retrieve NVMe device information - ansible.builtin.command: "nvme list -o json" - register: nvme_list + - name: Scan for NVMe devices with smartctl + ansible.builtin.command: "smartctl --scan -j" + register: smartctl_scan changed_when: false become: true + - name: Extract NVMe device paths + ansible.builtin.set_fact: + nvme_devices: "{{ smartctl_scan.stdout | from_json | json_query('devices[?type==`nvme`].info_name') | default([]) }}" + changed_when: false + + - name: Retrieve NVMe device information via smartctl + ansible.builtin.command: "smartctl -i -j {{ item }}" + register: smartctl_info + loop: "{{ nvme_devices }}" + loop_control: + label: "{{ item }}" + changed_when: false + become: true + when: nvme_devices | length > 0 + - name: Parse NVMe device model names ansible.builtin.set_fact: - nvme_models: "{{ nvme_models | default([]) + [item.ModelNumber] }}" - loop: "{{ nvme_list.stdout | from_json | json_query('Devices[].{ModelNumber: ModelNumber}') }}" + nvme_models: "{{ nvme_models | default([]) + [item.model_name] }}" + loop: "{{ smartctl_info.results | default([]) | map(attribute='stdout') | map('from_json') | selectattr('model_name', 'defined') | list }}" changed_when: false - name: Set unique NVMe models as host facts diff --git a/etc/kayobe/ansible/deployment/smartmon-tools.yml b/etc/kayobe/ansible/deployment/smartmon-tools.yml index 6ec35ae0be..354aae31ae 100644 --- a/etc/kayobe/ansible/deployment/smartmon-tools.yml +++ b/etc/kayobe/ansible/deployment/smartmon-tools.yml @@ -2,11 +2,10 @@ - name: Install and set up SMART monitoring tools hosts: overcloud tasks: - - name: Ensure smartmontools, jq, nvme-cli and cron/cronie are installed + - name: Ensure smartmontools, jq, and cron/cronie are installed ansible.builtin.package: name: - smartmontools - - nvme-cli - jq - "{{ 'cron' if ansible_facts['distribution'] == 'Ubuntu' else 'cronie' }}" state: present @@ -54,7 +53,7 @@ enabled: true become: true - - name: Copy smartmon.py and nvmemon.sh from scripts folder + - name: Copy smartmon.py from scripts folder ansible.builtin.copy: src: "{{ lookup('env', 'KAYOBE_CONFIG_PATH') }}/ansible/scripts/{{ item }}" dest: /usr/local/bin/{{ item }} @@ -63,7 +62,6 @@ mode: "0700" loop: - smartmon.py - - nvmemon.sh become: true - name: Set PATH Variable for cron @@ -84,17 +82,6 @@ mv -f /var/lib/docker/volumes/textfile/_data/smartmon.prom.temp /var/lib/docker/volumes/textfile/_data/smartmon.prom become: true - - name: Schedule cronjob to run nvmemon.sh every 5 minutes and save output to file - ansible.builtin.cron: - name: SMART metrics for drive monitoring using nvmemon.sh - user: root - minute: "*/5" - job: >- - umask 0022 && /usr/local/bin/nvmemon.sh > - /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp && - mv -f /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp /var/lib/docker/volumes/textfile/_data/nvmemon.prom - become: true - - name: Remove old cronjobs if present ansible.builtin.cron: name: SMART metrics for drive monitoring using {{ item }} @@ -104,11 +91,15 @@ loop: - smartmon - nvmemon + - nvmemon.sh - - name: Remove old smartmon.sh if present + - name: Remove old monitoring scripts if present ansible.builtin.file: - path: /usr/local/bin/smartmon.sh + path: /usr/local/bin/{{ item }} state: absent + loop: + - smartmon.sh + - nvmemon.sh become: true - name: Gather NVMe drives and generate dwpd ratings diff --git a/etc/kayobe/ansible/scripts/generate_fixtures.py b/etc/kayobe/ansible/scripts/generate_fixtures.py index 5f8f7cc641..589d3acee6 100644 --- a/etc/kayobe/ansible/scripts/generate_fixtures.py +++ b/etc/kayobe/ansible/scripts/generate_fixtures.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import json import re +import subprocess from pySMART import DeviceList SMARTMON_ATTRS = { @@ -63,6 +64,8 @@ "critical_comp_time", } +SMARTCTL_PATH = "/usr/sbin/smartctl" + DISK_INFO = { "name", "interface", @@ -84,6 +87,17 @@ def camel_to_snake(name): """ return re.sub(r'(? - -# Check if we are root -if [ "$EUID" -ne 0 ]; then - echo "${0##*/}: Please run as root!" >&2 - exit 1 -fi - -# Check if programs are installed -if ! command -v nvme >/dev/null 2>&1; then - echo "${0##*/}: nvme is not installed. Aborting." >&2 - exit 1 -fi - -if ! command -v jq >/dev/null 2>&1; then - echo "${0##*/}: jq is required but not installed. Aborting." >&2 - exit 1 -fi - -# Path to the DWPD ratings JSON file -dwpd_file="/opt/kayobe/etc/monitoring/dwpd_ratings.json" - -declare -A rated_dwpd - -load_dwpd_ratings() { - if [[ -f "$dwpd_file" ]]; then - # Read the JSON; if it fails, default to empty array - dwpd_json="$(cat "$dwpd_file" 2>/dev/null | jq '.' || echo '[]')" - - # We iterate over each array element in dwpd_json - while IFS= read -r line; do - key="$(echo "$line" | jq -r '.model_name')" - value="$(echo "$line" | jq -r '.rated_dwpd')" - - # Clean up trailing whitespace - key="${key%%[[:space:]]*}" - value="${value%%[[:space:]]*}" - - # If we have a valid key, store it in the dictionary - if [[ -n "$key" && "$key" != "null" ]]; then - rated_dwpd["$key"]="$value" - fi - done < <(echo "$dwpd_json" | jq -c '.[]') - else - echo "Warning: DWPD ratings file not found at '$dwpd_file'. Defaulting to rated_dwpd=1." >&2 - fi -} - - -load_dwpd_ratings - -output_format_awk="$( - cat <<'OUTPUTAWK' -BEGIN { v = "" } -v != $1 { - print "# HELP nvme_" $1 " SMART metric " $1; - if ($1 ~ /_total$/) - print "# TYPE nvme_" $1 " counter"; - else - print "# TYPE nvme_" $1 " gauge"; - v = $1 -} -{print "nvme_" $0} -OUTPUTAWK -)" - -format_output() { - sort | awk -F'{' "${output_format_awk}" -} - -# Get the nvme-cli version -nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')" -echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output - -# Get devices (DevicePath, PhysicalSize and ModelNumber) -device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath, PhysicalSize, ModelNumber, SerialNumber}')" - -# Convert device_info to an array -device_info_array=() -while IFS= read -r line; do - device_info_array+=("$line") -done <<< "$device_info" - -# Loop through the NVMe devices -for device_data in "${device_info_array[@]}"; do - device="$(echo "$device_data" | jq -r '.DevicePath')" - json_check="$(nvme smart-log -o json "${device}")" - disk="${device##*/}" - model_name="$(echo "$device_data" | jq -r '.ModelNumber')" - serial_number="$(echo "$device_data" | jq -r '.SerialNumber')" - - physical_size="$(echo "$device_data" | jq -r '.PhysicalSize')" - echo "physical_size_bytes{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${physical_size}" - - # The temperature value in JSON is in Kelvin, we want Celsius - value_temperature="$(echo "$json_check" | jq '.temperature - 273')" - echo "temperature_celsius{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_temperature}" - - # Get the rated DWPD from the dictionary or default to 1 if not found - value_rated_dwpd="${rated_dwpd[$model_name]:-1}" - echo "rated_dwpd{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_rated_dwpd}" - - value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')" - echo "available_spare_ratio{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_available_spare}" - - value_available_spare_threshold="$(echo "$json_check" | jq '.spare_thresh / 100')" - echo "available_spare_threshold_ratio{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_available_spare_threshold}" - - value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')" - echo "percentage_used_ratio{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_percentage_used}" - - value_critical_warning="$(echo "$json_check" | jq '.critical_warning')" - echo "critical_warning_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_critical_warning}" - - value_media_errors="$(echo "$json_check" | jq '.media_errors')" - echo "media_errors_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_media_errors}" - - value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')" - echo "num_err_log_entries_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_num_err_log_entries}" - - value_power_cycles="$(echo "$json_check" | jq '.power_cycles')" - echo "power_cycles_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_power_cycles}" - - value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')" - echo "power_on_hours_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_power_on_hours}" - - value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')" - echo "controller_busy_time_seconds{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_controller_busy_time}" - - value_data_units_written="$(echo "$json_check" | jq '.data_units_written')" - echo "data_units_written_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_data_units_written}" - - value_data_units_read="$(echo "$json_check" | jq '.data_units_read')" - echo "data_units_read_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_data_units_read}" - - value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')" - echo "host_read_commands_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_host_read_commands}" - - value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')" - echo "host_write_commands_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_host_write_commands}" -done | format_output diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py index 033ddbb94b..c14c717d29 100644 --- a/etc/kayobe/ansible/scripts/smartmon.py +++ b/etc/kayobe/ansible/scripts/smartmon.py @@ -69,8 +69,101 @@ "num_err_log_entries", "warning_temp_time", "critical_comp_time", + "nvme_total_capacity", + "nvme_unallocated_capacity", } +DATA_UNIT_BYTES = 512000 # NVMe data unit size (1000 * 512 bytes) +BYTES_PER_TB = 10 ** 12 +DWPD_RATINGS_PATH = "/opt/kayobe/etc/monitoring/dwpd_ratings.json" +DEFAULT_DWPD = 1.0 + + +def canonical_device_path(name): + """ + Ensure device name is an absolute /dev path for smartctl invocations. + """ + if not name: + return name + return name if name.startswith("/dev/") else f"/dev/{name}" + +def coerce_numeric(value): + """ + Best effort conversion of various value types (including pySMART attribute objects) + into a float. Returns None when conversion is not possible. + """ + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + for attr in ("value", "raw"): + try: + candidate = getattr(value, attr) + except AttributeError: + continue + if isinstance(candidate, (int, float)): + return float(candidate) + try: + return float(value) + except (TypeError, ValueError): + return None + + +def load_dwpd_ratings(path=DWPD_RATINGS_PATH): + """ + Load rated DWPD values from JSON file. + + The file is expected to contain either a list of objects with + 'model_name' and 'rated_dwpd' keys, or a dictionary containing such a list. + """ + mapping = {} + + try: + with open(path, "r", encoding="utf-8") as fh: + data = json.load(fh) + except FileNotFoundError: + return mapping + except (json.JSONDecodeError, OSError): + return mapping + + if isinstance(data, dict): + if isinstance(data.get("stackhpc_dwpd_ratings"), list): + data_iterable = data["stackhpc_dwpd_ratings"] + elif isinstance(data.get("dwpd_values"), list): + data_iterable = data["dwpd_values"] + else: + data_iterable = [] + elif isinstance(data, list): + data_iterable = data + else: + data_iterable = [] + + for entry in data_iterable: + if not isinstance(entry, dict): + continue + model_name = str(entry.get("model_name", "")).strip() + rated_value = coerce_numeric(entry.get("rated_dwpd")) + if not model_name: + continue + if rated_value is None: + continue + mapping[model_name.lower()] = rated_value + + return mapping + + +DWPD_RATINGS = load_dwpd_ratings() + + +def get_rated_dwpd(model_name): + """ + Look up DWPD rating for the given model name, defaulting to 1.0. + """ + if not model_name: + return DEFAULT_DWPD + lookup_key = model_name.lower().strip() + return DWPD_RATINGS.get(lookup_key, DEFAULT_DWPD) + def run_command(command, parse_json=False): """ Helper to run a subprocess command and optionally parse JSON output. @@ -80,6 +173,27 @@ def run_command(command, parse_json=False): return json.loads(result.stdout) return result.stdout.strip() + +def smartctl_json(disk_name, disk_type=None, *args): + """ + Execute smartctl with JSON output enabled and return the parsed response. + + Args: + disk_name (str): Device path (e.g. /dev/nvme0). + disk_type (str): Interface type passed to smartctl -d (optional). + *args: Additional smartctl arguments (e.g. "-x", "-n", "standby"). + + Returns: + dict: Parsed JSON response. + """ + cmd = [SMARTCTL_PATH] + cmd.extend(args) + if disk_type and disk_type.lower() != "nvme": + cmd.extend(["-d", disk_type]) + cmd.extend(["-j", disk_name]) + return run_command(cmd, parse_json=True) + + def camel_to_snake(name): """ Convert a CamelCase string to snake_case. @@ -132,6 +246,12 @@ def parse_device_info(device): f'smartmon_device_smart_healthy{{{metric_labels}}} {float(is_healthy)}' ) + # Explicitly collect top-level temperature if available (fixes SCSI temperature issue) + # pySMART exposes 'temperature' as a top-level property which we can use for SCSI, + # whereas device.if_attributes.temperature is often None for SCSI. + if device.temperature is not None: + metrics.append(f'smartmon_temperature{{{metric_labels}}} {float(device.temperature)}') + return metrics def parse_if_attributes(device): @@ -170,6 +290,139 @@ def parse_if_attributes(device): return metrics + +def collect_nvme_metrics(device): + """ + Collect NVMe specific metrics using smartctl JSON output. + + Args: + device (Device): pySMART Device instance. + + Returns: + List[str]: Prometheus formatted metric strings. + """ + metrics = [] + disk_name = device.name + disk_type = device.interface or "" + serial_number = (device.serial or "").lower() + labels = f'disk="{disk_name}",serial_number="{serial_number}",type="{disk_type}"' + model_name = (device.model or "").strip() + + attr_values = {} + if device.if_attributes: + for attr_name in dir(device.if_attributes): + if attr_name.startswith("_"): + continue + value = getattr(device.if_attributes, attr_name, None) + if callable(value): + continue + attr_values[camel_to_snake(attr_name)] = value + + smartctl_target = canonical_device_path(disk_name) + try: + nvme_json = smartctl_json(smartctl_target, disk_type, "-x") + except Exception: + nvme_json = {} + + if not model_name: + model_name = str(nvme_json.get("model_name", "")).strip() + + health_log = nvme_json.get("nvme_smart_health_information_log") + if not isinstance(health_log, dict): + health_log = {} + + user_capacity = nvme_json.get("user_capacity") + if not isinstance(user_capacity, dict): + user_capacity = {} + + namespaces = nvme_json.get("nvme_namespaces") + if not isinstance(namespaces, list): + namespaces = [] + + def numeric_value(*sources): + for source in sources: + value = coerce_numeric(source) + if value is not None: + return value + return None + + namespace_capacity = None + for namespace in namespaces: + if not isinstance(namespace, dict): + continue + namespace_capacity = numeric_value( + namespace.get("capacity", {}).get("bytes"), + namespace.get("size", {}).get("bytes"), + namespace.get("utilization", {}).get("bytes"), + ) + if namespace_capacity is not None: + break + + total_capacity = numeric_value( + attr_values.get("nvme_total_capacity"), + nvme_json.get("nvme_total_capacity"), + user_capacity.get("bytes"), + namespace_capacity, + ) + if total_capacity is not None: + metrics.append(f"smartmon_nvme_total_capacity_bytes{{{labels}}} {total_capacity}") + metrics.append(f"smartmon_physical_size_bytes{{{labels}}} {total_capacity}") + + rated_dwpd = get_rated_dwpd(model_name) + metrics.append(f"smartmon_nvme_rated_dwpd{{{labels}}} {rated_dwpd}") + + unallocated_capacity = numeric_value( + attr_values.get("nvme_unallocated_capacity"), + nvme_json.get("nvme_unallocated_capacity"), + ) + if unallocated_capacity is not None: + metrics.append(f"smartmon_nvme_unallocated_capacity_bytes{{{labels}}} {unallocated_capacity}") + + data_units_read_attr = "data_units_read" in attr_values + data_units_read = numeric_value( + attr_values.get("data_units_read"), + health_log.get("data_units_read"), + ) + if data_units_read is not None: + bytes_read = data_units_read * DATA_UNIT_BYTES + if not data_units_read_attr: + metrics.append(f"smartmon_data_units_read{{{labels}}} {data_units_read}") + metrics.append(f"smartmon_nvme_terabytes_read_total{{{labels}}} {bytes_read / BYTES_PER_TB}") + + data_units_written_attr = "data_units_written" in attr_values + data_units_written = numeric_value( + attr_values.get("data_units_written"), + health_log.get("data_units_written"), + ) + if data_units_written is not None: + bytes_written = data_units_written * DATA_UNIT_BYTES + if not data_units_written_attr: + metrics.append(f"smartmon_data_units_written{{{labels}}} {data_units_written}") + metrics.append(f"smartmon_nvme_terabytes_written_total{{{labels}}} {bytes_written / BYTES_PER_TB}") + + # Collect additional NVMe health log metrics that might be missed by pySMART + # due to naming mismatches + nvme_health_metrics = [ + "media_errors", + "num_err_log_entries", + "warning_temp_time", + "critical_comp_time", + "host_reads", + "host_writes", + ] + + for key in nvme_health_metrics: + # Check if we already got this from pySMART (may change in the future) + if key in attr_values: + continue + + val = numeric_value(health_log.get(key)) + if val is not None: + metrics.append(f"smartmon_{key}{{{labels}}} {val}") + + return metrics + + def write_metrics_to_textfile(metrics, output_path=None): """ Write metrics to a Prometheus textfile using prometheus_client. @@ -228,13 +481,15 @@ def main(output_path=None): disk_type = dev.interface or "" serial_number = (dev.serial or "").lower() + if not serial_number or not dev.assessment: + continue + run_timestamp = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) all_metrics.append(f'smartmon_smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}') active = 1 try: - cmd = [SMARTCTL_PATH, "-n", "standby", "-d", disk_type, "-j", disk_name] - standby_json = run_command(cmd, parse_json=True) + standby_json = smartctl_json(canonical_device_path(disk_name), disk_type, "-n", "standby") if standby_json.get("power_mode", "") == "standby": active = 0 except json.JSONDecodeError: @@ -250,6 +505,11 @@ def main(output_path=None): all_metrics.extend(parse_device_info(dev)) all_metrics.extend(parse_if_attributes(dev)) + disk_basename = os.path.basename(disk_name) + disk_type_normalized = (disk_type or "").lower() + is_nvme = disk_type_normalized == "nvme" or disk_basename.startswith("nvme") + if is_nvme: + all_metrics.extend(collect_nvme_metrics(dev)) write_metrics_to_textfile(all_metrics, output_path) diff --git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py index 4749808a5c..2ebd7c16b3 100644 --- a/etc/kayobe/ansible/scripts/test_smartmon.py +++ b/etc/kayobe/ansible/scripts/test_smartmon.py @@ -1,11 +1,49 @@ import glob import json import os -import unittest +import sys import tempfile +import types +import unittest import math -from time import sleep +CURRENT_DIR = os.path.dirname(__file__) +if CURRENT_DIR not in sys.path: + sys.path.insert(0, CURRENT_DIR) + +prometheus_stub = types.ModuleType("prometheus_client") + +class DummyCollectorRegistry: + pass + + +class DummyGauge: + def __init__(self, *args, **kwargs): + self._values = {} + + def labels(self, *args, **kwargs): + return self + + def set(self, value): + self._last_set = value + + +prometheus_stub.CollectorRegistry = DummyCollectorRegistry +prometheus_stub.Gauge = DummyGauge +prometheus_stub.write_to_textfile = lambda *args, **kwargs: None +sys.modules.setdefault("prometheus_client", prometheus_stub) + +pySMART_stub = types.ModuleType("pySMART") + +class DummyDeviceList: + def __init__(self, devices=None): + self.devices = devices or [] + +pySMART_stub.DeviceList = DummyDeviceList +sys.modules.setdefault("pySMART", pySMART_stub) + +# Import after stubbing so smartmon pulls in the lightweight stand-ins above. +import smartmon from unittest.mock import patch, MagicMock from smartmon import ( parse_device_info, @@ -13,7 +51,10 @@ main, SMARTMON_ATTRS, camel_to_snake, - write_metrics_to_textfile, + collect_nvme_metrics, + DATA_UNIT_BYTES, + BYTES_PER_TB, + DEFAULT_DWPD, ) def load_json_fixture(filename): @@ -28,14 +69,26 @@ def load_json_fixture(filename): class TestSmartMon(unittest.TestCase): @classmethod def setUpClass(cls): - # Collect all *.json files from ./tests/ + # Collect all JSON fixtures that include both device metadata and the smartctl JSON payload. data_folder = os.path.join(os.path.dirname(__file__), "tests") - cls.fixture_files = glob.glob(os.path.join(data_folder, "*.json")) + cls.fixture_files = [] + for path in glob.glob(os.path.join(data_folder, "*.json")): + with open(path, "r", encoding="utf-8") as fh: + try: + data = json.load(fh) + except json.JSONDecodeError: + continue + if isinstance(data, dict) and "device_info" in data and "smartctl" in data: + cls.fixture_files.append(path) + if not cls.fixture_files: + raise unittest.SkipTest("No SMART fixtures found") + cls.primary_fixture = os.path.basename(cls.fixture_files[0]) def create_mock_device_from_json(self, device_info, if_attributes=None): """ Given a 'device_info' dict and optional 'if_attributes', build - a MagicMock that mimics a pySMART Device object. + a MagicMock that mimics a pySMART Device object so the code under test + sees the same shape it would on a live host. """ device = MagicMock() device.name = device_info.get("name", "") @@ -202,15 +255,81 @@ def test_parse_if_attributes(self): with self.subTest(fixture=fixture_name): self._test_parse_if_attributes(fixture_name) + @patch("smartmon.smartctl_json") + @patch("smartmon.get_rated_dwpd") + def test_collect_nvme_metrics_includes_capacity_and_dwpd(self, mock_get_dwpd, mock_smartctl_json): + """ + Ensure collect_nvme_metrics emits NVMe capacity, DWPD, and TB counters. + """ + # Re-use the real fixture so the smartctl payload matches what the exporter + # will see in production. + data = load_json_fixture(self.primary_fixture) + device_info = data["device_info"] + smartctl_payload = data["smartctl"] + mock_smartctl_json.return_value = smartctl_payload + mock_get_dwpd.return_value = 2.5 + + device = self.create_mock_device_from_json(device_info, data.get("if_attributes")) + + metrics = collect_nvme_metrics(device) + disk_name = device_info["name"] + serial_number = device_info["serial"].lower() + disk_type = device_info["interface"] + labels = f'disk="{disk_name}",serial_number="{serial_number}",type="{disk_type}"' + + total_capacity = float(smartctl_payload["nvme_total_capacity"]) + expected_capacity = f"smartmon_nvme_total_capacity_bytes{{{labels}}} {total_capacity}" + expected_physical = f"smartmon_physical_size_bytes{{{labels}}} {total_capacity}" + expected_unallocated = f"smartmon_nvme_unallocated_capacity_bytes{{{labels}}} {float(smartctl_payload.get('nvme_unallocated_capacity', 0))}" + expected_rated = f"smartmon_nvme_rated_dwpd{{{labels}}} 2.5" + + self.assertIn(expected_capacity, metrics) + self.assertIn(expected_physical, metrics) + self.assertIn(expected_unallocated, metrics) + self.assertIn(expected_rated, metrics) + + health_log = smartctl_payload["nvme_smart_health_information_log"] + expected_tb_read = (health_log["data_units_read"] * DATA_UNIT_BYTES) / BYTES_PER_TB + expected_tb_written = (health_log["data_units_written"] * DATA_UNIT_BYTES) / BYTES_PER_TB + + self.assertTrue( + any( + line.startswith(f"smartmon_nvme_terabytes_read_total{{{labels}}}") and + math.isclose(float(line.split()[-1]), expected_tb_read, rel_tol=1e-9) + for line in metrics + ), + "Expected NVMe TB read metric not found or incorrect value.", + ) + self.assertTrue( + any( + line.startswith(f"smartmon_nvme_terabytes_written_total{{{labels}}}") and + math.isclose(float(line.split()[-1]), expected_tb_written, rel_tol=1e-9) + for line in metrics + ), + "Expected NVMe TB written metric not found or incorrect value.", + ) + + self.assertFalse( + any(line.startswith(f"smartmon_data_units_read{{{labels}}}") for line in metrics), + "collect_nvme_metrics should not emit raw data_units_read when already provided by pySMART.", + ) + @patch("smartmon.run_command") @patch("smartmon.DeviceList") - @patch("smartmon.write_metrics_to_textfile", wraps=write_metrics_to_textfile) + @patch("smartmon.write_metrics_to_textfile") def test_main(self, mock_write_metrics, mock_devicelist_class, mock_run_cmd): """ End-to-end test of main() for every JSON fixture in ./tests/. This ensures we can handle multiple disks (multiple fixture files). Checks metrics written to a temp file, and that write_metrics_to_textfile is called once. """ + def fake_write_metrics(metrics, output_path): + # Instead of writing Prometheus text format we simply dump the raw metric + # strings so assertions can compare them without the collector library. + with open(output_path, "w", encoding="utf-8") as fh: + fh.write("\n".join(metrics)) + + mock_write_metrics.side_effect = fake_write_metrics # Patch run_command to return a version & "active" power_mode def run_command_side_effect(cmd, parse_json=False): @@ -227,6 +346,7 @@ def run_command_side_effect(cmd, parse_json=False): with self.subTest(msg=f"Testing main() with {fixture_name}"): mock_write_metrics.reset_mock() data = load_json_fixture(fixture_name) + smartctl_payload = data.get("smartctl", {}) device_info = data["device_info"] if_attrs = data.get("if_attributes", {}) @@ -238,41 +358,46 @@ def run_command_side_effect(cmd, parse_json=False): mock_dev_list.devices = [device_mock] mock_devicelist_class.return_value = mock_dev_list - with tempfile.NamedTemporaryFile(mode="r+", delete_on_close=False) as tmpfile: - path= tmpfile.name - main(output_path=path) - tmpfile.close() + with patch("smartmon.smartctl_json", return_value=smartctl_payload), patch("smartmon.get_rated_dwpd", return_value=DEFAULT_DWPD): + with tempfile.NamedTemporaryFile(mode="r+", delete=False) as tmpfile: + path = tmpfile.name + main(output_path=path) + tmpfile.close() - # Ensure write_metrics_to_textfile was called once self.assertEqual(mock_write_metrics.call_count, 1) - with open(path, "r") as f: - # Read the metrics from the file - metrics_lines = [line.strip() for line in f.readlines() if line.strip() and not line.startswith('#')] - print(f"Metrics lines: {metrics_lines}") - - # Generate expected metrics using the parse functions - expected_metrics = [] - expected_metrics.extend(parse_device_info(device_mock)) - expected_metrics.extend(parse_if_attributes(device_mock)) - - # Check that all expected metrics are present in the file - for expected in expected_metrics: - exp_metric, exp_val_str = expected.rsplit(" ", 1) - exp_val = float(exp_val_str) - found = any( - (exp_metric in line) and - math.isclose(float(line.rsplit(" ", 1)[1]), exp_val) - for line in metrics_lines - ) - self.assertTrue(found, f"Expected metric '{expected}' not found") - - # Check that smartctl_version metric is present - version_found = any(line.startswith("smartmon_smartctl_version{") for line in metrics_lines) - self.assertTrue(version_found, "Expected 'smartmon_smartctl_version' metric not found in output file.") - - # Check that the output file is not empty - self.assertTrue(metrics_lines, "Metrics output file is empty.") + with open(path, "r", encoding="utf-8") as f: + metrics_lines = [ + line.strip() + for line in f.readlines() + if line.strip() and not line.startswith("#") + ] + + try: + expected_metrics = [] + expected_metrics.extend(parse_device_info(device_mock)) + expected_metrics.extend(parse_if_attributes(device_mock)) + + iface = (device_info.get("interface") or "").lower() + if iface == "nvme" or device_info.get("name", "").startswith("/dev/nvme"): + expected_metrics.extend(collect_nvme_metrics(device_mock)) + + for expected in expected_metrics: + exp_metric, exp_val_str = expected.rsplit(" ", 1) + exp_val = float(exp_val_str) + found = any( + (exp_metric in line) and + math.isclose(float(line.rsplit(" ", 1)[1]), exp_val, rel_tol=1e-9) + for line in metrics_lines + ) + self.assertTrue(found, f"Expected metric '{expected}' not found") + + version_found = any(line.startswith("smartmon_smartctl_version{") for line in metrics_lines) + self.assertTrue(version_found, "Expected 'smartmon_smartctl_version' metric not found in output file.") + self.assertTrue(metrics_lines, "Metrics output file is empty.") + finally: + if os.path.exists(path): + os.unlink(path) if __name__ == "__main__": unittest.main() diff --git a/etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json b/etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json deleted file mode 100644 index d867910ae1..0000000000 --- a/etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "device_info": { - "assessment": "PASS", - "firmware": "2.1.8", - "interface": "nvme", - "model": "Dell Ent NVMe CM6 RI 7.68TB", - "name": "nvme8", - "serial": "Y2Q0A0BPTCF8", - "smart_capable": true, - "smart_enabled": true, - "vendor": "Dell" - }, - "if_attributes": { - "availableSpare": 100, - "availableSpareThreshold": 10, - "controllerBusyTime": 2478, - "criticalWarning": 0, - "dataUnitsRead": 177817765, - "dataUnitsWritten": 127992843, - "percentageUsed": 1, - "powerCycles": 750, - "powerOnHours": 17427, - "temperature": 36, - "unsafeShutdowns": 37 - } -} diff --git a/etc/kayobe/ansible/scripts/tests/INTEL_SSDPE2KX010T8.json b/etc/kayobe/ansible/scripts/tests/INTEL_SSDPE2KX010T8.json new file mode 100644 index 0000000000..9ab1fd4334 --- /dev/null +++ b/etc/kayobe/ansible/scripts/tests/INTEL_SSDPE2KX010T8.json @@ -0,0 +1,133 @@ +{ + "device_info": { + "assessment": "PASS", + "firmware": "VDV10131", + "interface": "nvme", + "model": "INTEL SSDPE2KX010T8", + "name": "nvme1", + "serial": "BTLJ9183038W1P0FGN", + "smart_capable": true, + "smart_enabled": true, + "vendor": "INTEL" + }, + "if_attributes": { + "availableSpare": 100, + "availableSpareThreshold": 10, + "controllerBusyTime": 938, + "criticalWarning": 0, + "dataUnitsRead": 372386419, + "dataUnitsWritten": 172944945, + "percentageUsed": 8, + "powerCycles": 111, + "powerOnHours": 49618, + "temperature": 36, + "unsafeShutdowns": 95 + }, + "smartctl": { + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 2 + ], + "svn_revision": "5155", + "platform_info": "x86_64-linux-5.14.0-427.31.1.el9_4.x86_64", + "build_info": "(local build)", + "argv": [ + "smartctl", + "-x", + "-j", + "/dev/nvme1" + ], + "exit_status": 0 + }, + "device": { + "name": "/dev/nvme1", + "info_name": "/dev/nvme1", + "type": "nvme", + "protocol": "NVMe" + }, + "model_name": "INTEL SSDPE2KX010T8", + "serial_number": "BTLJ9183038W1P0FGN", + "firmware_version": "VDV10131", + "nvme_pci_vendor": { + "id": 32902, + "subsystem_id": 32902 + }, + "nvme_ieee_oui_identifier": 6083300, + "nvme_total_capacity": 1000204886016, + "nvme_unallocated_capacity": 0, + "nvme_controller_id": 0, + "nvme_version": { + "string": "1.2", + "value": 66048 + }, + "nvme_number_of_namespaces": 1, + "nvme_namespaces": [ + { + "id": 1, + "size": { + "blocks": 1953525168, + "bytes": 1000204886016 + }, + "capacity": { + "blocks": 1953525168, + "bytes": 1000204886016 + }, + "utilization": { + "blocks": 1953525168, + "bytes": 1000204886016 + }, + "formatted_lba_size": 512, + "eui64": { + "oui": 6083300, + "ext_id": 664757469440 + } + } + ], + "user_capacity": { + "blocks": 1953525168, + "bytes": 1000204886016 + }, + "logical_block_size": 512, + "local_time": { + "time_t": 1762460775, + "asctime": "Thu Nov 6 22:26:15 2025 SAST" + }, + "smart_status": { + "passed": true, + "nvme": { + "value": 0 + } + }, + "nvme_smart_health_information_log": { + "critical_warning": 0, + "temperature": 36, + "available_spare": 100, + "available_spare_threshold": 10, + "percentage_used": 8, + "data_units_read": 372386419, + "data_units_written": 172944945, + "host_reads": 2592438330, + "host_writes": 8381424983, + "controller_busy_time": 938, + "power_cycles": 111, + "power_on_hours": 49618, + "unsafe_shutdowns": 95, + "media_errors": 0, + "num_err_log_entries": 0, + "warning_temp_time": 0, + "critical_comp_time": 0 + }, + "temperature": { + "current": 36 + }, + "power_cycle_count": 111, + "power_on_time": { + "hours": 49618 + } + } +} diff --git a/etc/kayobe/ansible/scripts/tests/nvme.json b/etc/kayobe/ansible/scripts/tests/nvme.json deleted file mode 100644 index bbff19ec01..0000000000 --- a/etc/kayobe/ansible/scripts/tests/nvme.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "device_info": { - "name": "/dev/nvme0", - "interface": "nvme", - "vendor": "AcmeCorp", - "family": "Acme NVMe Family", - "model": "Acme NVMe 1TB", - "serial": "ABCD1234", - "firmware": "3.0.1", - "smart_capable": true, - "smart_enabled": true, - "assessment": "PASS" - }, - "if_attributes": { - "criticalWarning": 0, - "temperature": 36, - "availableSpare": 100, - "availableSpareThreshold": 10, - "percentageUsed": 0, - "dataUnitsRead": 117446405, - "dataUnitsWritten": 84630284, - "notInSmartmonAttrs": 999 - } -} diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json index b305502223..1e5b22e287 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json @@ -1,48 +1,5 @@ {% raw %} { - "__inputs": [ - { - "name": "datasource", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "11.4.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat", - "version": "" - }, - { - "type": "panel", - "id": "table", - "name": "Table", - "version": "" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], "annotations": { "list": [ { @@ -68,7 +25,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 66, "links": [], "panels": [ { @@ -100,8 +57,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -132,7 +88,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -170,8 +126,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -206,7 +161,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -243,8 +198,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -275,7 +229,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -317,8 +271,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -562,7 +515,7 @@ "showHeader": true, "sortBy": [] }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "$$hashKey": "object:40", @@ -704,8 +657,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -731,11 +683,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -789,8 +742,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -1024,7 +976,7 @@ } ] }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "$$hashKey": "object:40", @@ -1041,7 +993,7 @@ "displayValueWithAlias": "Never", "editorMode": "code", "exemplar": false, - "expr": "label_join(nvme_critical_warning_total{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "expr": "label_join(smartmon_critical_warning{instance=~\"$node\",type=\"nvme\"},\"unique_device\", \"-\", \"instance\", \"disk\")", "format": "table", "instant": true, "interval": "", @@ -1059,7 +1011,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "label_join(nvme_temperature_celsius{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "expr": "label_join(smartmon_temperature{instance=~\"$node\",type=\"nvme\"},\"unique_device\", \"-\", \"instance\", \"disk\")", "format": "table", "hide": false, "instant": true, @@ -1075,7 +1027,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "label_join(nvme_data_units_written_total{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\") * 512", + "expr": "label_join(smartmon_data_units_written{instance=~\"$node\",type=\"nvme\"},\"unique_device\", \"-\", \"instance\", \"disk\") * 512", "format": "table", "hide": false, "instant": true, @@ -1090,7 +1042,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "label_join(nvme_data_units_read_total{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\") * 512", + "expr": "label_join(smartmon_data_units_read{instance=~\"$node\",type=\"nvme\"},\"unique_device\", \"-\", \"instance\", \"disk\") * 512", "format": "table", "hide": false, "instant": true, @@ -1105,7 +1057,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "label_join(nvme_physical_size_bytes{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "expr": "label_join(smartmon_physical_size_bytes{instance=~\"$node\",type=\"nvme\"},\"unique_device\", \"-\", \"instance\", \"disk\")", "format": "table", "hide": false, "instant": true, @@ -1120,7 +1072,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "label_join(delta(nvme_data_units_written_total{instance=~\"$node\"}[24h])*512000,\"unique_device\", \"-\", \"instance\", \"device\")/label_join(nvme_physical_size_bytes{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "expr": "label_join(delta(smartmon_data_units_written{instance=~\"$node\",type=\"nvme\"}[24h])*512000,\"unique_device\", \"-\", \"instance\", \"disk\")/label_join(smartmon_physical_size_bytes{instance=~\"$node\",type=\"nvme\"},\"unique_device\", \"-\", \"instance\", \"disk\")", "format": "table", "hide": false, "instant": true, @@ -1135,7 +1087,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "label_join(nvme_rated_dwpd{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "expr": "label_join(smartmon_nvme_rated_dwpd{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"disk\")", "format": "table", "hide": false, "instant": true, @@ -1176,6 +1128,12 @@ "device 5": true, "device 6": true, "device 7": true, + "disk 2": true, + "disk 3": true, + "disk 4": true, + "disk 5": true, + "disk 6": true, + "disk 7": true, "instance 1": false, "instance 2": true, "instance 3": true, @@ -1209,17 +1167,24 @@ "serial_number 5": true, "serial_number 6": true, "serial_number 7": true, + "type 1": true, + "type 2": true, + "type 3": true, + "type 4": true, + "type 5": true, + "type 6": true, + "type 7": true, "unique_device": true }, "includeByName": {}, "indexByName": { - "Time 1": 11, - "Time 2": 15, - "Time 3": 23, - "Time 4": 27, - "Time 5": 32, - "Time 6": 38, - "Time 7": 53, + "Time 1": 10, + "Time 2": 14, + "Time 3": 19, + "Time 4": 21, + "Time 5": 24, + "Time 6": 28, + "Time 7": 36, "Value #Capacity": 6, "Value #DWPD": 8, "Value #Health": 2, @@ -1227,53 +1192,46 @@ "Value #TBR": 5, "Value #TBW": 4, "Value #Temp": 3, - "__name__ 1": 12, - "__name__ 2": 16, - "__name__ 3": 37, - "__name__ 4": 54, - "device 1": 1, - "device 2": 21, - "device 3": 24, - "device 4": 28, - "device 5": 33, - "device 6": 39, - "device 7": 55, - "instance 1": 0, - "instance 2": 17, - "instance 3": 14, - "instance 4": 29, - "instance 5": 34, - "instance 6": 40, - "instance 7": 56, - "job 1": 13, - "job 2": 18, - "job 3": 25, - "job 4": 30, - "job 5": 35, - "job 6": 41, - "job 7": 57, - "model 1": 9, - "model 2": 43, - "model 3": 45, - "model 4": 47, - "model 5": 49, - "model 6": 51, - "model 7": 58, - "original_device 1": 20, - "original_device 2": 22, - "original_device 3": 26, - "original_device 4": 31, - "original_device 5": 36, - "original_device 6": 42, - "original_device 7": 59, - "serial_number 1": 10, - "serial_number 2": 44, - "serial_number 3": 46, - "serial_number 4": 48, - "serial_number 5": 50, - "serial_number 6": 52, - "serial_number 7": 60, - "unique_device": 19 + "__name__ 1": 11, + "__name__ 2": 15, + "__name__ 3": 27, + "__name__ 4": 37, + "disk 1": 0, + "disk 2": 42, + "disk 3": 44, + "disk 4": 46, + "disk 5": 48, + "disk 6": 50, + "disk 7": 52, + "instance 1": 1, + "instance 2": 16, + "instance 3": 13, + "instance 4": 22, + "instance 5": 25, + "instance 6": 29, + "instance 7": 38, + "job 1": 12, + "job 2": 17, + "job 3": 20, + "job 4": 23, + "job 5": 26, + "job 6": 30, + "job 7": 39, + "serial_number 1": 9, + "serial_number 2": 31, + "serial_number 3": 32, + "serial_number 4": 33, + "serial_number 5": 34, + "serial_number 6": 35, + "serial_number 7": 40, + "type 1": 41, + "type 2": 43, + "type 3": 45, + "type 4": 47, + "type 5": 49, + "type 6": 51, + "type 7": 53, + "unique_device": 18 }, "renameByName": { "Time 1": "", @@ -1284,6 +1242,7 @@ "Value #TBW": "TBW", "__name__ 1": "", "device 1": "Device", + "disk 1": "Device", "instance 1": "Hostname", "model 1": "Model Name", "serial_number 1": "Serial Number" @@ -1343,8 +1302,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -1366,11 +1324,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -1378,8 +1337,8 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "delta(nvme_data_units_written_total{instance=~\"$node\"}[24h])*512000 / nvme_physical_size_bytes{instance=~\"$node\"}", - "legendFormat": "{{instance}} - {{device}}", + "expr": "delta(smartmon_data_units_written{instance=~\"$node\"}[24h])*512000 / smartmon_physical_size_bytes{instance=~\"$node\"}", + "legendFormat": "{{instance}} - {{disk}}", "range": true, "refId": "A" } @@ -1435,8 +1394,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1462,11 +1420,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -1475,10 +1434,10 @@ }, "editorMode": "code", "exemplar": false, - "expr": "avg_over_time(nvme_temperature_celsius{instance=~\"$node\"}[1h]) ", + "expr": "avg_over_time(smartmon_temperature{instance=~\"$node\"}[1h]) ", "instant": false, "interval": "", - "legendFormat": "{{instance}} - {{device}}", + "legendFormat": "{{instance}} - {{disk}}", "range": true, "refId": "A" } @@ -1487,8 +1446,9 @@ "type": "timeseries" } ], + "preload": false, "refresh": false, - "schemaVersion": 40, + "schemaVersion": 41, "tags": [], "templating": { "list": [ @@ -1543,7 +1503,6 @@ "timezone": "", "title": "Hardware Overview", "uid": "TCN51Y25P", - "version": 10, - "weekStart": "" + "version": 5 } {% endraw %} diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/nvme.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/nvme.json index 1669b02a06..a450ec2a44 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/nvme.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/nvme.json @@ -19,7 +19,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "id": 17197, + "id": 85, "links": [], "panels": [ { @@ -35,8 +35,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -68,7 +67,7 @@ "textMode": "name", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -76,9 +75,9 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "nvme_data_units_written_total{serial_number=~\"$serial_number\"}", + "expr": "smartmon_data_units_written{serial_number=~\"$serial_number\"}", "instant": true, - "legendFormat": "{{instance}} - {{device}} - {{serial_number}}", + "legendFormat": "{{instance}} - {{disk}} - {{serial_number}}", "refId": "A" } ], @@ -110,8 +109,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -143,14 +141,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_physical_size_bytes{serial_number=\"$serial_number\"}", + "expr": "smartmon_physical_size_bytes{serial_number=\"$serial_number\"}", "legendFormat": "Physical Size", "refId": "A" } @@ -172,8 +170,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "yellow", @@ -211,14 +208,14 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_temperature_celsius{serial_number=\"$serial_number\"}", + "expr": "smartmon_temperature{serial_number=\"$serial_number\"}", "legendFormat": "Temperature", "refId": "A" } @@ -238,8 +235,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -270,14 +266,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_rated_dwpd{serial_number=\"$serial_number\"}", + "expr": "smartmon_nvme_rated_dwpd{serial_number=\"$serial_number\"}", "legendFormat": "Rated DWPD", "refId": "A" } @@ -297,8 +293,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "orange", @@ -333,14 +328,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_power_on_hours_total{serial_number=\"$serial_number\"}", + "expr": "smartmon_power_on_hours{serial_number=\"$serial_number\"}", "legendFormat": "Power Hours", "refId": "A" } @@ -360,8 +355,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -392,14 +386,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_power_cycles_total{serial_number=\"$serial_number\"}", + "expr": "smartmon_power_cycles{serial_number=\"$serial_number\"}", "legendFormat": "Power Cycles", "refId": "A" } @@ -435,8 +429,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -471,14 +464,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_critical_warning_total{serial_number=\"$serial_number\"}", + "expr": "smartmon_critical_warning{serial_number=\"$serial_number\"}", "legendFormat": "Critical Warnings", "refId": "A" } @@ -500,8 +493,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "orange", @@ -539,15 +531,17 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_percentage_used_ratio{serial_number=\"$serial_number\"} * 100", + "editorMode": "code", + "expr": "smartmon_percentage_used{serial_number=\"$serial_number\"}", "legendFormat": "Percentage Used", + "range": true, "refId": "A" } ], @@ -566,8 +560,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -602,14 +595,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_num_err_log_entries_total{serial_number=\"$serial_number\"}", + "expr": "smartmon_num_err_log_entries{serial_number=\"$serial_number\"}", "legendFormat": "Error Log Entries", "refId": "A" } @@ -629,8 +622,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -665,14 +657,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_media_errors_total{serial_number=\"$serial_number\"}", + "expr": "smartmon_media_errors{serial_number=\"$serial_number\"}", "legendFormat": "Media Errors", "refId": "A" } @@ -694,8 +686,7 @@ "mode": "absolute", "steps": [ { - "color": "red", - "value": null + "color": "red" }, { "color": "yellow", @@ -733,15 +724,17 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_available_spare_ratio{serial_number=\"$serial_number\"} * 100", + "editorMode": "code", + "expr": "smartmon_available_spare{serial_number=\"$serial_number\"}", "legendFormat": "Available Spare", + "range": true, "refId": "A" } ], @@ -765,8 +758,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -798,15 +790,17 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_available_spare_threshold_ratio{serial_number=\"$serial_number\"} * 100", + "editorMode": "code", + "expr": "smartmon_available_spare_threshold{serial_number=\"$serial_number\"}", "legendFormat": "Spare Threshold", + "range": true, "refId": "A" } ], @@ -874,8 +868,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -902,11 +895,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -914,7 +908,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(nvme_data_units_read_total{serial_number=\"$serial_number\"}[5m])*512000", + "expr": "rate(smartmon_data_units_read{serial_number=\"$serial_number\"}[5m])*512000", "legendFormat": "Data Read", "range": true, "refId": "A" @@ -925,7 +919,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(nvme_data_units_written_total{serial_number=\"$serial_number\"}[5m])*512000", + "expr": "rate(smartmon_data_units_written{serial_number=\"$serial_number\"}[5m])*512000", "legendFormat": "Data Written", "range": true, "refId": "B" @@ -946,8 +940,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -979,7 +972,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -988,7 +981,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "nvme_data_units_written_total{serial_number=\"$serial_number\"} * 512000", + "expr": "smartmon_data_units_written{serial_number=\"$serial_number\"} * 512000", "instant": false, "legendFormat": "__auto", "range": true, @@ -1010,8 +1003,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -1043,7 +1035,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -1051,7 +1043,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "nvme_data_units_read_total{serial_number=\"$serial_number\"} * 512000", + "expr": "smartmon_data_units_read{serial_number=\"$serial_number\"} * 512000", "legendFormat": "__auto", "range": true, "refId": "A" @@ -1108,8 +1100,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1136,11 +1127,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -1148,7 +1140,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "irate(nvme_controller_busy_time_seconds{serial_number=\"$serial_number\"}[5m])", + "expr": "irate(smartmon_controller_busy_time{serial_number=\"$serial_number\"}[5m])", "legendFormat": "Controller Busy Time", "range": true, "refId": "A" @@ -1156,11 +1148,147 @@ ], "title": "Controller Busy Time", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 27 + }, + "id": 27, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1+security-01", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "delta(smartmon_data_units_written{serial_number=\"$serial_number\"}[24h])*512000 / smartmon_physical_size_bytes{serial_number=\"$serial_number\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "DWPD", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 27 + }, + "id": 28, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1+security-01", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "(smartmon_data_units_written{serial_number=\"$serial_number\"} * 512000)/smartmon_physical_size_bytes{serial_number=\"$serial_number\"}/(smartmon_power_on_hours{serial_number=\"$serial_number\"} / 24)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Lifetime Average DWPD", + "type": "stat" } ], "preload": false, "refresh": "1m", - "schemaVersion": 40, + "schemaVersion": 41, "tags": [], "templating": { "list": [ @@ -1187,14 +1315,14 @@ "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(nvme_data_units_read_total,serial_number)", + "definition": "label_values(smartmon_data_units_read,serial_number)", "includeAll": false, "label": "Serial Number", "name": "serial_number", "options": [], "query": { "qryType": 1, - "query": "label_values(nvme_data_units_read_total,serial_number)", + "query": "label_values(smartmon_data_units_read,serial_number)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1211,7 +1339,6 @@ "timezone": "", "title": "NVMe Monitoring", "uid": "uesjf83hh", - "version": 1, - "weekStart": "" + "version": 2 } {% endraw %}