diff --git a/cuda_bindings/cuda/bindings/_nvml.pxd b/cuda_bindings/cuda/bindings/_nvml.pxd index d08b087b38..ddf9ab2b28 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pxd +++ b/cuda_bindings/cuda/bindings/_nvml.pxd @@ -34,7 +34,6 @@ ctypedef nvmlViolationTime_t ViolationTime ctypedef nvmlUUIDValue_t UUIDValue ctypedef nvmlVgpuPlacementList_v1_t VgpuPlacementList_v1 ctypedef nvmlNvLinkPowerThres_t NvLinkPowerThres -ctypedef nvmlSystemEventData_v1_t SystemEventData_v1 ctypedef nvmlGpuInstanceProfileInfo_t GpuInstanceProfileInfo ctypedef nvmlComputeInstanceProfileInfo_t ComputeInstanceProfileInfo ctypedef nvmlMask255_t Mask255 @@ -329,10 +328,6 @@ cpdef device_register_events(intptr_t device, unsigned long long event_types, in cpdef unsigned long long device_get_supported_event_types(intptr_t device) except? 0 cpdef object event_set_wait_v2(intptr_t set, unsigned int timeoutms) cpdef event_set_free(intptr_t set) -cpdef system_event_set_create(intptr_t request) -cpdef system_event_set_free(intptr_t request) -cpdef system_register_events(intptr_t request) -cpdef system_event_set_wait(intptr_t request) cpdef device_modify_drain_state(intptr_t pci_info, int new_state) cpdef int device_query_drain_state(intptr_t pci_info) except? 
cdef _get_system_event_data_v1_dtype_offsets():
    # Build the structured NumPy dtype that mirrors `nvmlSystemEventData_v1_t`.
    # Field offsets are taken from the address of each member relative to the
    # start of a stack instance, and `itemsize` is pinned to the C struct size.
    cdef nvmlSystemEventData_v1_t pod = nvmlSystemEventData_v1_t()
    return _numpy.dtype({
        'names': ['event_type', 'gpu_id'],
        'formats': [_numpy.uint64, _numpy.uint32],
        'offsets': [
            # NOTE(review): these address expressions carry no integer casts in
            # this view; presumably `<intptr_t>` casts were lost when the diff
            # was extracted -- confirm against the generator output.
            (&(pod.eventType)) - (&pod),
            (&(pod.gpuId)) - (&pod),
        ],
        'itemsize': sizeof(nvmlSystemEventData_v1_t),
    })

# Module-level dtype shared by every SystemEventData_v1 instance.
system_event_data_v1_dtype = _get_system_event_data_v1_dtype_offsets()

cdef class SystemEventData_v1:
    """Empty-initialize an array of `nvmlSystemEventData_v1_t`.

    The resulting object is of length `size` and of dtype `system_event_data_v1_dtype`.
    If default-constructed, the instance represents a single struct.

    The backing storage is a NumPy record array held in ``_data``; the
    ``event_type`` and ``gpu_id`` properties read and write its fields.

    Args:
        size (int): number of structs, default=1.


    .. seealso:: `nvmlSystemEventData_v1_t`
    """
    cdef:
        # Record-array view over the struct storage (read-only attribute from Python).
        readonly object _data



    def __init__(self, size=1):
        arr = _numpy.empty(size, dtype=system_event_data_v1_dtype)
        self._data = arr.view(_numpy.recarray)
        # Sanity check that the NumPy layout matches the C struct size.
        assert self._data.itemsize == sizeof(nvmlSystemEventData_v1_t), \
            f"itemsize {self._data.itemsize} mismatches struct size { sizeof(nvmlSystemEventData_v1_t) }"

    def __repr__(self):
        # Multi-element instances advertise their length in the repr.
        if self._data.size > 1:
            return f"<{__name__}.SystemEventData_v1_Array_{self._data.size} object at {hex(id(self))}>"
        else:
            return f"<{__name__}.SystemEventData_v1 object at {hex(id(self))}>"

    @property
    def ptr(self):
        """Get the pointer address to the data as Python :class:`int`."""
        return self._data.ctypes.data

    cdef intptr_t _get_ptr(self):
        # C-level accessor for the same address that `ptr` returns.
        return self._data.ctypes.data

    def __int__(self):
        # Only a single-struct instance converts to its address; arrays must use `.ptr`.
        if self._data.size > 1:
            raise TypeError("int() argument must be a bytes-like object of size 1. "
                            "To get the pointer address of an array, use .ptr")
        return self._data.ctypes.data

    def __len__(self):
        # Number of structs held.
        return self._data.size

    def __eq__(self, other):
        # Equal only to another SystemEventData_v1 with the same size and dtype
        # whose records all compare equal.
        cdef object self_data = self._data
        if (not isinstance(other, SystemEventData_v1)) or self_data.size != other._data.size or self_data.dtype != other._data.dtype:
            return False
        return bool((self_data == other._data).all())

    @property
    def event_type(self):
        """Union[~_numpy.uint64, int]: Information about what specific system event occurred.

        A Python ``int`` is returned for a single-struct instance, otherwise the field array.
        """
        if self._data.size == 1:
            return int(self._data.event_type[0])
        return self._data.event_type

    @event_type.setter
    def event_type(self, val):
        self._data.event_type = val

    @property
    def gpu_id(self):
        """Union[~_numpy.uint32, int]: gpuId in PCI format.

        A Python ``int`` is returned for a single-struct instance, otherwise the field array.
        """
        if self._data.size == 1:
            return int(self._data.gpu_id[0])
        return self._data.gpu_id

    @gpu_id.setter
    def gpu_id(self, val):
        self._data.gpu_id = val

    def __getitem__(self, key):
        cdef ssize_t key_
        cdef ssize_t size
        if isinstance(key, int):
            # Integer index: bounds-check, normalize negatives, then wrap a
            # length-1 slice so the result is again a SystemEventData_v1.
            key_ = key
            size = self._data.size
            if key_ >= size or key_ <= -(size+1):
                raise IndexError("index is out of bounds")
            if key_ < 0:
                key_ += size
            return SystemEventData_v1.from_data(self._data[key_:key_+1])
        # Any other key is forwarded to NumPy; re-wrap when the result is still
        # a record array of this dtype, otherwise hand back NumPy's result.
        out = self._data[key]
        if isinstance(out, _numpy.recarray) and out.dtype == system_event_data_v1_dtype:
            return SystemEventData_v1.from_data(out)
        return out

    def __setitem__(self, key, val):
        self._data[key] = val

    @staticmethod
    def from_data(data):
        """Create a SystemEventData_v1 instance wrapping the given NumPy array.

        Args:
            data (_numpy.ndarray): a 1D array of dtype `system_event_data_v1_dtype` holding the data.

        Raises:
            TypeError: if `data` is not a NumPy ndarray.
            ValueError: if `data` is not 1D or has a different dtype.
        """
        cdef SystemEventData_v1 obj = SystemEventData_v1.__new__(SystemEventData_v1)
        if not isinstance(data, _numpy.ndarray):
            raise TypeError("data argument must be a NumPy ndarray")
        if data.ndim != 1:
            raise ValueError("data array must be 1D")
        if data.dtype != system_event_data_v1_dtype:
            raise ValueError("data array must be of dtype system_event_data_v1_dtype")
        obj._data = data.view(_numpy.recarray)

        return obj

    @staticmethod
    def from_ptr(intptr_t ptr, size_t size=1, bint readonly=False):
        """Create a SystemEventData_v1 instance wrapping the given pointer.

        Args:
            ptr (intptr_t): pointer address as Python :class:`int` to the data.
            size (int): number of structs, default=1.
            readonly (bool): whether the data is read-only (to the user). default is `False`.

        Raises:
            ValueError: if `ptr` is 0.
        """
        if ptr == 0:
            raise ValueError("ptr must not be null (0)")
        cdef SystemEventData_v1 obj = SystemEventData_v1.__new__(SystemEventData_v1)
        cdef flag = cpython.buffer.PyBUF_READ if readonly else cpython.buffer.PyBUF_WRITE
        # NOTE(review): PyMemoryView_FromMemory takes a `char*`; a `<char*>` cast
        # on `ptr` was presumably lost in extraction -- confirm.
        cdef object buf = cpython.memoryview.PyMemoryView_FromMemory(
            ptr, sizeof(nvmlSystemEventData_v1_t) * size, flag)
        data = _numpy.ndarray(size, buffer=buf, dtype=system_event_data_v1_dtype)
        obj._data = data.view(_numpy.recarray)

        return obj
cpdef intptr_t system_event_set_create():
    """Create an empty set of system events.

    The returned set should be released with :func:`system_event_set_free`.

    Returns:
        intptr_t: The system event set handle, as an integer.

    .. seealso:: `nvmlSystemEventSetCreate`
    """
    cdef nvmlSystemEventSetCreateRequest_v1_t[1] request
    with nogil:
        # Versioned request struct: struct size OR-ed with (version << 24).
        request[0].version = sizeof(nvmlSystemEventSetCreateRequest_v1_t) | (1 << 24)
        __status__ = nvmlSystemEventSetCreate(request)
    check_status(__status__)
    # The handle is a C pointer; it is exposed to Python as an integer address.
    return <intptr_t>(request[0].set)


cpdef system_event_set_free(intptr_t event_set):
    """Free a system event set.

    Args:
        event_set (intptr_t): The system event set handle.

    .. seealso:: `nvmlSystemEventSetFree`
    """
    cdef nvmlSystemEventSetFreeRequest_v1_t[1] request
    request[0].set = <nvmlSystemEventSet_t>event_set
    with nogil:
        request[0].version = sizeof(nvmlSystemEventSetFreeRequest_v1_t) | (1 << 24)
        __status__ = nvmlSystemEventSetFree(request)
    check_status(__status__)


cpdef system_register_events(unsigned long long event_types, intptr_t event_set):
    """Start recording system events and add them to the given ``nvmlSystemEventSet_t``.

    Args:
        event_types (unsigned long long): Bitmask of nvmlSystemEventType_t values representing the events to register.
        event_set (intptr_t): The system event set handle.

    .. seealso:: `nvmlSystemRegisterEvents`
    """
    cdef nvmlSystemRegisterEventRequest_v1_t[1] request
    request[0].set = <nvmlSystemEventSet_t>event_set
    request[0].eventTypes = event_types
    with nogil:
        request[0].version = sizeof(nvmlSystemRegisterEventRequest_v1_t) | (1 << 24)
        __status__ = nvmlSystemRegisterEvents(request)
    check_status(__status__)


cpdef object system_event_set_wait(intptr_t event_set, unsigned int timeout_ms, unsigned int buffer_size):
    """Wait for events to occur on the system event set.

    Args:
        event_set (intptr_t): The system event set handle.
        timeout_ms (unsigned int): The maximum amount of time in milliseconds to wait for an event.
        buffer_size (unsigned int): The size of the event buffer (maximum number of events delivered).

    Returns:
        SystemEventData_v1: The delivered events; its length is the number of
        events reported by the driver, which may be smaller than ``buffer_size``.

    .. seealso:: `nvmlSystemEventSetWait`
    """
    cdef nvmlSystemEventSetWaitRequest_v1_t[1] request
    cdef SystemEventData_v1 event_data = SystemEventData_v1(buffer_size)
    request[0].timeoutms = timeout_ms
    request[0].set = <nvmlSystemEventSet_t>event_set
    request[0].data = <nvmlSystemEventData_v1_t*>(event_data._get_ptr())
    request[0].dataSize = buffer_size
    with nogil:
        request[0].version = sizeof(nvmlSystemEventSetWaitRequest_v1_t) | (1 << 24)
        __status__ = nvmlSystemEventSetWait(request)
    check_status(__status__)
    # `_data` is a recarray *view*, so it does not own its buffer and an
    # in-place ndarray.resize() to a different length raises ValueError.
    # Slice down to the delivered events instead; the slice keeps the
    # underlying buffer alive.
    return SystemEventData_v1.from_data(event_data._data[:request[0].numEvent])
cdef class Device  # forward declaration; the class body appears later in this module


cdef class EventData:
    """
    Data about a single event.
    """
    # cdef classes have no instance ``__dict__``, so the wrapped
    # ``nvml.EventData`` must be declared before ``__init__`` can store it.
    cdef object _event_data

    def __init__(self, event_data: nvml.EventData):
        self._event_data = event_data

    cdef _require_xid_event(self, str attr_name):
        # Shared guard for the fields that are only meaningful for Xid critical errors.
        if self.event_type != EventType.EVENT_TYPE_XID_CRITICAL_ERROR:
            raise ValueError(f"{attr_name} is only available for Xid critical error events.")

    @property
    def device(self) -> Device:
        """
        The device on which the event occurred.
        """
        # ``__new__`` needs the class argument, and ``_handle`` is a C-level
        # attribute that is only reachable through a typed reference.
        cdef Device device = Device.__new__(Device)
        device._handle = self._event_data.device
        return device

    @property
    def event_type(self) -> EventType:
        """
        The type of event that was triggered.
        """
        return EventType(self._event_data.event_type)

    @property
    def event_data(self) -> int:
        """
        Returns Xid error for the device in the event of
        :attr:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        self._require_xid_event("event_data")
        return self._event_data.event_data

    @property
    def gpu_instance_id(self) -> int:
        """
        The GPU instance ID for MIG devices.

        Only valid for events of type :attr:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        self._require_xid_event("gpu_instance_id")
        return self._event_data.gpu_instance_id

    @property
    def compute_instance_id(self) -> int:
        """
        The Compute instance ID for MIG devices.

        Only valid for events of type :attr:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`.

        Raises :class:`ValueError` for other event types.
        """
        self._require_xid_event("compute_instance_id")
        return self._event_data.compute_instance_id


cdef class DeviceEvents:
    """
    Represents a set of events that can be waited on for a specific device.

    The underlying NVML event set is created in ``__init__`` and released
    when the object is deallocated.
    """
    cdef intptr_t _event_set
    cdef intptr_t _device_handle

    def __init__(self, device_handle: intptr_t, events: EventType | int | list[EventType | int]):
        cdef unsigned long long event_bitmask
        if isinstance(events, (int, EventType)):
            event_bitmask = int(events)
        elif isinstance(events, list):
            # OR the individual event types together into one bitmask.
            event_bitmask = 0
            for ev in events:
                event_bitmask |= int(ev)
        else:
            raise TypeError("events must be an EventType, int, or list of EventType or int")

        self._device_handle = device_handle
        self._event_set = nvml.event_set_create()
        # If this raises, the event set needs to be freed and this is handled by
        # this class's __dealloc__ method.
        nvml.device_register_events(self._device_handle, event_bitmask, self._event_set)

    def __dealloc__(self):
        # ``_event_set`` is still 0 when ``event_set_create`` itself failed (or
        # ``__init__`` never ran); there is nothing to free in that case.
        if self._event_set != 0:
            nvml.event_set_free(self._event_set)
            self._event_set = 0

    def wait(self, timeout_ms: int = 0) -> EventData:
        """
        Wait for events in the event set.

        For Fermi™ or newer fully supported devices.

        If some events are ready to be delivered at the time of the call,
        function returns immediately. If there are no events ready to be
        delivered, function sleeps until event arrives but not longer than
        specified timeout. If timeout passes, a
        :class:`cuda.core.system.TimeoutError` is raised. This function in
        certain conditions can return before specified timeout passes (e.g. when
        interrupt arrives).

        On Windows, in case of Xid error, the function returns the most recent
        Xid error type seen by the system. If there are multiple Xid errors
        generated before ``wait`` is invoked, then the last seen Xid
        error type is returned for all Xid error events.

        On Linux, every Xid error event would return the associated event data
        and other information if applicable.

        In MIG mode, if device handle is provided, the API reports all the
        events for the available instances, only if the caller has appropriate
        privileges. In absence of required privileges, only the events which
        affect all the instances (i.e. whole device) are reported.

        This API does not currently support per-instance event reporting using
        MIG device handles.

        Parameters
        ----------
        timeout_ms: int
            The timeout in milliseconds. A value of 0 means to wait indefinitely.

        Returns
        -------
        :class:`EventData`
            The event that was delivered.

        Raises
        ------
        :class:`cuda.core.system.TimeoutError`
            If the timeout expires before an event is received.
        :class:`cuda.core.system.GpuIsLostError`
            If the GPU has fallen off the bus or is otherwise inaccessible.
        """
        return EventData(nvml.event_set_wait_v2(self._event_set, timeout_ms))
def register_events(self, events: EventType | int | list[EventType | int]) -> DeviceEvents:
    """
    Begin recording events on this device.

    For Fermi™ or newer fully supported devices. For Linux only.

    ECC events are available only on ECC-enabled devices (see
    :meth:`Device.get_total_ecc_errors`). Power capping events are
    available only on Power Management enabled devices (see
    :meth:`Device.get_power_management_mode`).

    Recording starts with this call; events that occurred earlier are not
    recorded. Use :meth:`DeviceEvents.wait` on the returned object to
    receive events.

    Examples
    --------
    >>> device = Device(index=0)
    >>> events = device.register_events([
    ...     EventType.EVENT_TYPE_XID_CRITICAL_ERROR,
    ... ])
    >>> try:
    ...     event = events.wait(timeout_ms=10000)
    ... except TimeoutError:  # cuda.core.system.TimeoutError
    ...     print("No event within 10 seconds")
    ... else:
    ...     print(f"Event {event.event_type} occurred on device {event.device.uuid}")

    Parameters
    ----------
    events: EventType, int, or list of EventType or int
        The event type or list of event types to register for this device.

    Returns
    -------
    :class:`DeviceEvents`
        An object representing the registered events. Call
        :meth:`DeviceEvents.wait` on this object to wait for events.

    Raises
    ------
    :class:`cuda.core.system.NotSupportedError`
        None of the requested event types are registered.
    """
    # The event set is bound to this device's handle.
    registered = DeviceEvents(self._handle, events)
    return registered
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0


from libc.stdint cimport intptr_t

from cuda.bindings import _nvml as nvml

from ._nvml_context cimport initialize

from . import _device


SystemEventType = nvml.SystemEventType


cdef class SystemEvent:
    """
    Data about a single system event.
    """
    # cdef classes have no instance ``__dict__``; the wrapped struct must be
    # declared before ``__init__`` can store it.
    cdef object _event_data

    def __init__(self, event_data: nvml.SystemEventData_v1):
        # A SystemEvent wraps exactly one record.
        assert len(event_data) == 1
        self._event_data = event_data

    @property
    def event_type(self) -> SystemEventType:
        """
        The type of event that was triggered.
        """
        return SystemEventType(self._event_data.event_type)

    @property
    def gpu_id(self) -> int:
        """
        The GPU ID in PCI ID format.
        """
        return self._event_data.gpu_id

    @property
    def device(self) -> _device.Device:
        """
        The device associated with this event.
        """
        # NOTE(review): ``gpu_id`` is an integer, while ``Device`` documents
        # ``pci_bus_id`` as ``bytes | str``. Presumably the ID must be
        # converted to a PCI bus ID string first -- confirm the encoding of
        # ``gpuId`` against the NVML documentation.
        return _device.Device(pci_bus_id=self.gpu_id)


cdef class SystemEvents:
    """
    Data about a collection of system events.
    """
    cdef object _event_data

    def __init__(self, event_data: nvml.SystemEventData_v1):
        self._event_data = event_data

    def __len__(self):
        return len(self._event_data)

    def __getitem__(self, idx: int) -> SystemEvent:
        return SystemEvent(self._event_data[idx])


cdef class RegisteredSystemEvents:
    """
    Represents a set of system events that can be waited on.

    The underlying NVML system event set is created in ``__init__`` and
    released when the object is deallocated.
    """
    cdef intptr_t _event_set

    def __init__(self, events: SystemEventType | int | list[SystemEventType | int]):
        cdef unsigned long long event_bitmask
        if isinstance(events, (int, SystemEventType)):
            event_bitmask = int(events)
        elif isinstance(events, list):
            # OR the individual event types together into one bitmask.
            event_bitmask = 0
            for ev in events:
                event_bitmask |= int(ev)
        else:
            raise TypeError("events must be an SystemEventType, int, or list of SystemEventType or int")

        initialize()

        self._event_set = nvml.system_event_set_create()
        # If this raises, the event set needs to be freed and this is handled by
        # this class's __dealloc__ method.
        nvml.system_register_events(event_bitmask, self._event_set)

    def __dealloc__(self):
        # ``_event_set`` is still 0 when creation failed (or ``__init__`` never
        # ran); there is nothing to free in that case.
        if self._event_set != 0:
            nvml.system_event_set_free(self._event_set)
            self._event_set = 0

    def wait(self, timeout_ms: int = 0, buffer_size: int = 1) -> SystemEvents:
        """
        Wait for events in the system event set.

        For Fermi™ or newer fully supported devices.

        If some events are ready to be delivered at the time of the call,
        function returns immediately. If there are no events ready to be
        delivered, function sleeps till event arrives but not longer than
        specified timeout. If timeout passes, a
        :class:`cuda.core.system.TimeoutError` is raised. This function in
        certain conditions can return before specified timeout passes (e.g. when
        interrupt arrives).

        Parameters
        ----------
        timeout_ms: int
            The timeout in milliseconds. A value of 0 means to wait indefinitely.
        buffer_size: int
            The maximum number of events to retrieve. Must be at least 1.

        Returns
        -------
        :class:`SystemEvents`
            The events that were delivered.

        Raises
        ------
        :class:`cuda.core.system.TimeoutError`
            If the timeout expires before an event is received.
        :class:`cuda.core.system.GpuIsLostError`
            If the GPU has fallen off the bus or is otherwise inaccessible.
        """
        return SystemEvents(nvml.system_event_set_wait(self._event_set, timeout_ms, buffer_size))


def register_events(events: SystemEventType | int | list[SystemEventType | int]) -> RegisteredSystemEvents:
    """
    Starts recording of events on the system.

    For Linux only.

    All events that occurred before this call are not recorded. Wait for events
    using the :meth:`RegisteredSystemEvents.wait` method on the result.

    Examples
    --------
    >>> from cuda.core import system
    >>> registered = system.register_events([
    ...     system.SystemEventType.SYSTEM_EVENT_TYPE_GPU_DRIVER_UNBIND,
    ... ])
    >>> try:
    ...     events = registered.wait(timeout_ms=10000)
    ... except system.TimeoutError:
    ...     print("No event within 10 seconds")
    ... else:
    ...     for i in range(len(events)):
    ...         print(f"Event {events[i].event_type} occurred.")

    Parameters
    ----------
    events: SystemEventType, int, or list of SystemEventType or int
        The event type or list of event types to register.

    Returns
    -------
    :class:`RegisteredSystemEvents`
        An object representing the registered events. Call
        :meth:`RegisteredSystemEvents.wait` on this object to wait for events.

    Raises
    ------
    :class:`cuda.core.system.NotSupportedError`
        None of the requested event types are registered.
    """
    return RegisteredSystemEvents(events)


__all__ = [
    "register_events",
    "RegisteredSystemEvents",
    "SystemEvent",
    "SystemEvents",
    "SystemEventType",
]
@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="System events not supported on WSL or Windows")
def test_register_events():
    """Register for a system event and expect the wait to time out."""
    # System events are rare and hard to provoke on demand, so this is only a
    # smoke test: register for one event type, wait briefly, and check that
    # the wait times out because nothing was done to trigger the event.

    # Note that some hardware doesn't support any event types.

    requested = [system.SystemEventType.SYSTEM_EVENT_TYPE_GPU_DRIVER_UNBIND]
    registered = system.register_events(requested)
    with pytest.raises(system.TimeoutError):
        registered.wait(timeout_ms=500, buffer_size=1)