diff --git a/CHANGELOG.md b/CHANGELOG.md index b05308ab..0a3da6ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Change Log +## Unreleased + +- Simvue runs now contain built-in system health alerts which allow runs to terminate with uploads before available memory or disk space is exhausted. + ## [v2.3.0](https://github.com/simvue-io/client/releases/tag/v2.3.0) - 2025-12-11 - Refactored sender functionality introducing new `Sender` class. diff --git a/simvue/metrics.py b/simvue/metrics.py index 2914b351..105f58bb 100644 --- a/simvue/metrics.py +++ b/simvue/metrics.py @@ -9,7 +9,8 @@ import contextlib import logging import psutil - +import os +import typing from .pynvml import ( nvmlDeviceGetComputeRunningProcesses, @@ -158,6 +159,8 @@ def to_dict(self) -> dict[str, float]: _metrics: dict[str, float] = { f"{RESOURCES_METRIC_PREFIX}/cpu.usage.percentage": self.cpu_percent, f"{RESOURCES_METRIC_PREFIX}/cpu.usage.memory": self.cpu_memory, + f"{RESOURCES_METRIC_PREFIX}/memory.virtual.available.percentage": self.memory_available_percent, + f"{RESOURCES_METRIC_PREFIX}/disk.available.percentage": self.disk_available_percent, } for i, gpu in enumerate(self.gpus or []): @@ -177,3 +180,11 @@ def gpu_percent(self) -> float: @property def gpu_memory(self) -> float: return sum(m[1] for m in self.gpus or []) / (len(self.gpus or []) or 1) + + @property + def memory_available_percent(self) -> float: + return 100 - typing.cast("float", psutil.virtual_memory().percent) + + @property + def disk_available_percent(self) -> float: + return 100 - psutil.disk_usage(os.getcwd()).percent diff --git a/simvue/run.py b/simvue/run.py index 7e3e96fb..b9564b51 100644 --- a/simvue/run.py +++ b/simvue/run.py @@ -499,6 +499,30 @@ def _dispatch_callback( return _dispatch_callback + def _define_system_health_alerts( + self, terminate_on_alert: bool, email_notify: bool + ) -> None: + """Define system health resource metric alerts.""" + _ = self.create_metric_threshold_alert( 
name="low_available_virtual_memory", + metric=f"{RESOURCES_METRIC_PREFIX}/memory.virtual.available.percentage", + threshold=5, + aggregation="at least one", + window=2, + rule="is below", + notification="email" if email_notify else "none", + trigger_abort=terminate_on_alert, + ) + _ = self.create_metric_threshold_alert( + name="low_disk_space", + metric=f"{RESOURCES_METRIC_PREFIX}/disk.available.percentage", + threshold=5, + aggregation="at least one", + window=2, + rule="is below", + trigger_abort=terminate_on_alert, + ) + def _start(self) -> bool: """Start a run @@ -616,17 +640,18 @@ def init( self, name: typing.Annotated[str | None, pydantic.Field(pattern=NAME_REGEX)] = None, *, - metadata: dict[str, typing.Any] = None, + metadata: dict[str, typing.Any] | None = None, tags: list[str] | None = None, description: str | None = None, - folder: typing.Annotated[ - str, pydantic.Field(None, pattern=FOLDER_REGEX) - ] = None, + folder: typing.Annotated[str, pydantic.Field(None, pattern=FOLDER_REGEX)] + | None = None, notification: typing.Literal["none", "all", "error", "lost"] = "none", running: bool = True, retention_period: str | None = None, timeout: int | None = 180, visibility: typing.Literal["public", "tenant"] | list[str] | None = None, + terminate_on_low_system_health: bool = False, + email_on_low_system_health: bool = False, no_color: bool = False, record_shell_vars: set[str] | None = None, ) -> bool: @@ -664,6 +689,13 @@ def init( * public - run viewable to all. * tenant - run viewable to all within the current tenant. * A list of usernames with which to share this run + terminate_on_low_system_health : bool, optional + whether to terminate this run if the resource metrics are + registering unhealthy values, e.g. very low available memory + default is False + email_on_low_system_health : bool, optional + notify by email if system health enters fail status, e.g. + low memory, default is False no_color : bool, optional disable terminal colors. Default False. 
record_shell_vars : list[str] | None, @@ -774,6 +806,10 @@ def init( if self._status == "running": self._start() + self._define_system_health_alerts( + terminate_on_low_system_health, email_on_low_system_health + ) + if self._user_config.run.mode == "online": click.secho( f"[simvue] Run {self.name} created", @@ -1057,6 +1093,7 @@ def config( system_metrics_interval: pydantic.PositiveInt | None = None, enable_emission_metrics: bool | None = None, disable_resources_metrics: bool | None = None, + healthcheck_alert_email_notify: bool | None = None, storage_id: str | None = None, abort_on_alert: typing.Literal["run", "terminate", "ignore"] | None = None, ) -> bool: @@ -1075,6 +1112,9 @@ def config( enable monitoring of emission metrics disable_resources_metrics : bool, optional disable monitoring of resource metrics + healthcheck_alert_email_notify : bool, optional + whether to enable notification by email of + critical memory usage or disk availability storage_id : str, optional identifier of storage to use, by default None abort_on_alert : Literal['ignore', 'terminate', 'ignore'], optional @@ -1105,6 +1145,11 @@ def config( if system_metrics_interval: self._system_metrics_interval = system_metrics_interval + if healthcheck_alert_email_notify is not None: + self._healthcheck_alert_email_notification = ( + healthcheck_alert_email_notify + ) + if disable_resources_metrics: if self._emissions_monitor: self._error( diff --git a/simvue/system.py b/simvue/system.py index 84ce016b..5a3dded4 100644 --- a/simvue/system.py +++ b/simvue/system.py @@ -1,3 +1,10 @@ +""" +System Information +================== + +Retrieve and assemble information on the current system. 
+""" + import os import platform import socket @@ -5,6 +12,7 @@ import shutil import sys import contextlib +import psutil import typing @@ -60,6 +68,14 @@ def get_gpu_info(): return _gpu_info +def get_memory_info() -> dict[str, int]: + """Get total available memory in GB.""" + return { + "virtual": typing.cast("int", psutil.virtual_memory().total) // 1024**3, + "swap": psutil.swap_memory().total // 1024**3, + } + + def get_system() -> dict[str, typing.Any]: """ Get system details @@ -76,6 +92,7 @@ def get_system() -> dict[str, typing.Any]: system["platform"]["system"] = platform.system() system["platform"]["release"] = platform.release() system["platform"]["version"] = platform.version() + system["memory"] = {k: f"{v}GB" for k, v in get_memory_info().items()} system["cpu"] = {} system["cpu"]["arch"] = cpu[1] system["cpu"]["processor"] = cpu[0] diff --git a/tests/functional/test_run_class.py b/tests/functional/test_run_class.py index f1ba2d81..f542d7c3 100644 --- a/tests/functional/test_run_class.py +++ b/tests/functional/test_run_class.py @@ -19,13 +19,13 @@ import random import datetime import simvue + from simvue.api.objects import Alert, Metrics from simvue.api.objects.grids import GridMetrics from simvue.exception import ObjectNotFoundError, SimvueRunError from simvue.sender import Sender import simvue.run as sv_run import simvue.client as sv_cl -import simvue.config.user as sv_cfg from simvue.api.objects import Run as RunObject @@ -1391,7 +1391,7 @@ def test_abort_on_alert_python( attempts: int = 0 - while run._status == "terminated" and attemps < 5: + while run._status == "terminated" and attempts < 5: time.sleep(1) attempts += 1 @@ -1561,6 +1561,7 @@ def test_reconnect_with_process() -> None: @pytest.mark.parametrize( "environment", ("python_conda", "python_poetry", "python_uv", "julia", "rust", "nodejs") ) +@pytest.mark.run @pytest.mark.online def test_run_environment_metadata(environment: str, mocker: pytest_mock.MockerFixture) -> None: """Tests that the 
environment information is compatible with the server.""" @@ -1583,3 +1584,4 @@ def test_run_environment_metadata(environment: str, mocker: pytest_mock.MockerFi ) run.update_metadata(env_func(_target_dir)) +