Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions etc/kayobe/ansible/deployment/get-nvme-drives.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,31 @@
hosts: overcloud
gather_facts: no
tasks:
- name: Retrieve NVMe device information
ansible.builtin.command: "nvme list -o json"
register: nvme_list
# Run "smartctl --scan -j" with privilege escalation and keep its output in
# smartctl_scan for the following tasks to parse.
- name: Scan for NVMe devices with smartctl
  become: true
  ansible.builtin.command: "smartctl --scan -j"
  register: smartctl_scan
  # The task is always reported as unchanged.
  changed_when: false

# Build the list of NVMe device paths from the registered smartctl scan output.
- name: Extract NVMe device paths
  vars:
    # JMESPath raw-string literal ('nvme'); the backtick form is intended for
    # JSON literals, and a bare word inside backticks is not valid JSON.
    nvme_device_query: "devices[?type=='nvme'].info_name"
  ansible.builtin.set_fact:
    # default([], true) also replaces a null/empty result. json_query yields
    # null when the scan JSON has no "devices" key, and a plain default([])
    # only covers undefined values, which would break the later loop over
    # nvme_devices and the "nvme_devices | length" check.
    nvme_devices: "{{ smartctl_scan.stdout | from_json | json_query(nvme_device_query) | default([], true) }}"
  changed_when: false

# Run "smartctl -i -j" once per entry in nvme_devices; the per-device results
# are registered as smartctl_info.
- name: Retrieve NVMe device information via smartctl
  when: nvme_devices | length > 0
  become: true
  ansible.builtin.command: "smartctl -i -j {{ item }}"
  loop: "{{ nvme_devices }}"
  loop_control:
    # Show only the device path in the per-item task output.
    label: "{{ item }}"
  register: smartctl_info
  # The task is always reported as unchanged.
  changed_when: false

- name: Parse NVMe device model names
ansible.builtin.set_fact:
nvme_models: "{{ nvme_models | default([]) + [item.ModelNumber] }}"
loop: "{{ nvme_list.stdout | from_json | json_query('Devices[].{ModelNumber: ModelNumber}') }}"
nvme_models: "{{ nvme_models | default([]) + [item.model_name] }}"
loop: "{{ smartctl_info.results | default([]) | map(attribute='stdout') | map('from_json') | selectattr('model_name', 'defined') | list }}"
changed_when: false

- name: Set unique NVMe models as host facts
Expand Down
25 changes: 8 additions & 17 deletions etc/kayobe/ansible/deployment/smartmon-tools.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@
- name: Install and set up SMART monitoring tools
hosts: overcloud
tasks:
- name: Ensure smartmontools, jq, nvme-cli and cron/cronie are installed
- name: Ensure smartmontools, jq, and cron/cronie are installed
ansible.builtin.package:
name:
- smartmontools
- nvme-cli
- jq
- "{{ 'cron' if ansible_facts['distribution'] == 'Ubuntu' else 'cronie' }}"
state: present
Expand Down Expand Up @@ -54,7 +53,7 @@
enabled: true
become: true

- name: Copy smartmon.py and nvmemon.sh from scripts folder
- name: Copy smartmon.py from scripts folder
ansible.builtin.copy:
src: "{{ lookup('env', 'KAYOBE_CONFIG_PATH') }}/ansible/scripts/{{ item }}"
dest: /usr/local/bin/{{ item }}
Expand All @@ -63,7 +62,6 @@
mode: "0700"
loop:
- smartmon.py
- nvmemon.sh
become: true

- name: Set PATH Variable for cron
Expand All @@ -84,17 +82,6 @@
mv -f /var/lib/docker/volumes/textfile/_data/smartmon.prom.temp /var/lib/docker/volumes/textfile/_data/smartmon.prom
become: true

- name: Schedule cronjob to run nvmemon.sh every 5 minutes and save output to file
ansible.builtin.cron:
name: SMART metrics for drive monitoring using nvmemon.sh
user: root
minute: "*/5"
job: >-
umask 0022 && /usr/local/bin/nvmemon.sh >
/var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp &&
mv -f /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp /var/lib/docker/volumes/textfile/_data/nvmemon.prom
become: true

- name: Remove old cronjobs if present
ansible.builtin.cron:
name: SMART metrics for drive monitoring using {{ item }}
Expand All @@ -104,11 +91,15 @@
loop:
- smartmon
- nvmemon
- nvmemon.sh

- name: Remove old smartmon.sh if present
- name: Remove old monitoring scripts if present
ansible.builtin.file:
path: /usr/local/bin/smartmon.sh
path: /usr/local/bin/{{ item }}
state: absent
loop:
- smartmon.sh
- nvmemon.sh
become: true

- name: Gather NVMe drives and generate dwpd ratings
Expand Down
52 changes: 52 additions & 0 deletions etc/kayobe/ansible/scripts/generate_fixtures.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import json
import re
import subprocess
from pySMART import DeviceList

SMARTMON_ATTRS = {
Expand Down Expand Up @@ -63,6 +64,8 @@
"critical_comp_time",
}

SMARTCTL_PATH = "/usr/sbin/smartctl"

DISK_INFO = {
"name",
"interface",
Expand All @@ -84,6 +87,17 @@ def camel_to_snake(name):
"""
return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()

def canonical_device_path(name):
    """
    Ensure device name is returned as absolute /dev path for smartctl.

    pySMART sometimes reports bare device names (e.g. 'nvme0'); smartctl on the
    CLI expects the canonical /dev path, so normalise here to avoid surprises.

    Falsy input (None or an empty string) is returned unchanged, and a name
    that already starts with "/dev/" is passed through as-is.
    """
    if not name:
        # Nothing to normalise; hand the falsy value back untouched.
        return name
    if name.startswith("/dev/"):
        return name
    return f"/dev/{name}"

def attrs_to_dict(obj, allowed_keys):
"""
Build {attr: value} for every public, non-callable attribute whose
Expand All @@ -105,14 +119,52 @@ def attrs_to_dict(obj, allowed_keys):
attributes[name] = value
return attributes

def smartctl_json(device_name, device_type):
    """
    Execute smartctl -x -j for the given device and return the parsed JSON payload.

    The goal is to mirror the exact data smartmon.py consumes at runtime so our
    fixtures stay faithful to real hardware output.

    Args:
        device_name: Device name or path; bare names are converted to /dev
            paths via canonical_device_path().
        device_type: Interface/type string. Any non-empty value other than
            "nvme" (case-insensitive) is passed to smartctl with ``-d``.

    Returns:
        The decoded JSON document, or an empty dict when the device name is
        empty, smartctl cannot be started, it prints nothing on stdout, or its
        output is not valid JSON.
    """
    if not device_name:
        return {}

    # Assemble the command left to right: options first, device path last.
    cmd = [SMARTCTL_PATH, "-x", "-j"]
    if device_type and device_type.lower() != "nvme":
        cmd += ["-d", device_type]
    cmd.append(canonical_device_path(device_name))

    try:
        # check=False: the exit status is deliberately ignored; only stdout
        # is inspected below.
        result = subprocess.run(
            cmd,
            capture_output=True,
            check=False,
            text=True,
        )
    except OSError:
        # Failure to launch smartctl is treated as "no data", not an error.
        return {}

    if not result.stdout:
        return {}

    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError:
        return {}

# Print one JSON fixture document for every disk reported by pySMART's
# DeviceList.
for disk in DeviceList().devices:
    # Keys are evaluated in the listed order: device info, interface
    # attributes, then the smartctl JSON payload for the same disk.
    fixtures = {
        "device_info": attrs_to_dict(disk, DISK_INFO),
        "if_attributes": attrs_to_dict(disk.if_attributes, SMARTMON_ATTRS),
        "smartctl": smartctl_json(disk.name, disk.interface),
    }

    print(f'Disk: {disk.name}: \n')
    # default=str stringifies any value json cannot serialise natively.
    print(json.dumps(fixtures, indent=2, default=str))
150 changes: 0 additions & 150 deletions etc/kayobe/ansible/scripts/nvmemon.sh

This file was deleted.

Loading
Loading