From 5a0bcae2e048ee8b0868151c7ad6e69201725c90 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 9 Jan 2026 09:32:22 -0500 Subject: [PATCH 1/9] cuda.core.system: Add APIs related to events --- cuda_bindings/cuda/bindings/_nvml.pxd | 5 - cuda_bindings/cuda/bindings/_nvml.pyx | 338 ++++++++++++++----- cuda_core/cuda/core/system/_device.pyx | 213 +++++++++++- cuda_core/cuda/core/system/_device_utils.pxi | 16 - cuda_core/docs/source/api.rst | 3 + cuda_core/tests/system/test_system_device.py | 24 ++ 6 files changed, 483 insertions(+), 116 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_nvml.pxd b/cuda_bindings/cuda/bindings/_nvml.pxd index d08b087b38..ddf9ab2b28 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pxd +++ b/cuda_bindings/cuda/bindings/_nvml.pxd @@ -34,7 +34,6 @@ ctypedef nvmlViolationTime_t ViolationTime ctypedef nvmlUUIDValue_t UUIDValue ctypedef nvmlVgpuPlacementList_v1_t VgpuPlacementList_v1 ctypedef nvmlNvLinkPowerThres_t NvLinkPowerThres -ctypedef nvmlSystemEventData_v1_t SystemEventData_v1 ctypedef nvmlGpuInstanceProfileInfo_t GpuInstanceProfileInfo ctypedef nvmlComputeInstanceProfileInfo_t ComputeInstanceProfileInfo ctypedef nvmlMask255_t Mask255 @@ -329,10 +328,6 @@ cpdef device_register_events(intptr_t device, unsigned long long event_types, in cpdef unsigned long long device_get_supported_event_types(intptr_t device) except? 0 cpdef object event_set_wait_v2(intptr_t set, unsigned int timeoutms) cpdef event_set_free(intptr_t set) -cpdef system_event_set_create(intptr_t request) -cpdef system_event_set_free(intptr_t request) -cpdef system_register_events(intptr_t request) -cpdef system_event_set_wait(intptr_t request) cpdef device_modify_drain_state(intptr_t pci_info, int new_state) cpdef int device_query_drain_state(intptr_t pci_info) except? 
-1 cpdef device_remove_gpu_v2(intptr_t pci_info, int gpu_state, int link_state) diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index d9bddcc4bc..e32e822109 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -787,7 +787,7 @@ class AffinityScope(_IntEnum): SOCKET = 1 # Scope of processor socket for affinity queries -class FI(_IntEnum): +class FieldId(_IntEnum): DEV_ECC_CURRENT = 1 # Current ECC mode. 1=Active. 0=Inactive DEV_ECC_PENDING = 2 # Pending ECC mode. 1=Active. 0=Inactive # ECC Count Totals @@ -1778,7 +1778,7 @@ cdef _get_pci_info_ext_v1_dtype_offsets(): cdef nvmlPciInfoExt_v1_t pod = nvmlPciInfoExt_v1_t() return _numpy.dtype({ 'names': ['version', 'domain', 'bus', 'device_', 'pci_device_id', 'pci_sub_system_id', 'base_class', 'sub_class', 'bus_id'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, (_numpy.int8, 32)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.domain)) - (&pod), @@ -1998,7 +1998,7 @@ cdef _get_pci_info_dtype_offsets(): cdef nvmlPciInfo_t pod = nvmlPciInfo_t() return _numpy.dtype({ 'names': ['bus_id_legacy', 'domain', 'bus', 'device_', 'pci_device_id', 'pci_sub_system_id', 'bus_id'], - 'formats': [_numpy.int8, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.int8], + 'formats': [(_numpy.int8, 16), _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, (_numpy.int8, 32)], 'offsets': [ (&(pod.busIdLegacy)) - (&pod), (&(pod.domain)) - (&pod), @@ -4901,7 +4901,7 @@ cdef _get_device_perf_modes_v1_dtype_offsets(): cdef nvmlDevicePerfModes_v1_t pod = nvmlDevicePerfModes_v1_t() return _numpy.dtype({ 'names': ['version', 'str'], - 'formats': [_numpy.uint32, _numpy.int8], + 
'formats': [_numpy.uint32, (_numpy.int8, 2048)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.str)) - (&pod), @@ -5037,7 +5037,7 @@ cdef _get_device_current_clock_freqs_v1_dtype_offsets(): cdef nvmlDeviceCurrentClockFreqs_v1_t pod = nvmlDeviceCurrentClockFreqs_v1_t() return _numpy.dtype({ 'names': ['version', 'str'], - 'formats': [_numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, (_numpy.int8, 2048)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.str)) - (&pod), @@ -5849,7 +5849,7 @@ cdef _get_platform_info_v1_dtype_offsets(): cdef nvmlPlatformInfo_v1_t pod = nvmlPlatformInfo_v1_t() return _numpy.dtype({ 'names': ['version', 'ib_guid', 'rack_guid', 'chassis_physical_slot_number', 'compute_slot_ind_ex', 'node_ind_ex', 'peer_type', 'module_id'], - 'formats': [_numpy.uint32, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8], + 'formats': [_numpy.uint32, (_numpy.uint8, 16), (_numpy.uint8, 16), _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.ibGuid)) - (&pod), @@ -6061,7 +6061,7 @@ cdef _get_platform_info_v2_dtype_offsets(): cdef nvmlPlatformInfo_v2_t pod = nvmlPlatformInfo_v2_t() return _numpy.dtype({ 'names': ['version', 'ib_guid', 'chassis_serial_number', 'slot_number', 'tray_ind_ex', 'host_id', 'peer_type', 'module_id'], - 'formats': [_numpy.uint32, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8], + 'formats': [_numpy.uint32, (_numpy.uint8, 16), (_numpy.uint8, 16), _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.ibGuid)) - (&pod), @@ -6993,7 +6993,7 @@ cdef _get_vgpu_process_utilization_info_v1_dtype_offsets(): cdef nvmlVgpuProcessUtilizationInfo_v1_t pod = nvmlVgpuProcessUtilizationInfo_v1_t() return _numpy.dtype({ 'names': ['process_name', 'time_stamp', 'vgpu_instance', 'pid', 'sm_util', 'mem_util', 'enc_util', 
'dec_util', 'jpg_util', 'ofa_util'], - 'formats': [_numpy.int8, _numpy.uint64, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32], + 'formats': [(_numpy.int8, 64), _numpy.uint64, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32], 'offsets': [ (&(pod.processName)) - (&pod), (&(pod.timeStamp)) - (&pod), @@ -8063,7 +8063,7 @@ cdef _get_vgpu_scheduler_capabilities_dtype_offsets(): cdef nvmlVgpuSchedulerCapabilities_t pod = nvmlVgpuSchedulerCapabilities_t() return _numpy.dtype({ 'names': ['supported_schedulers', 'max_timeslice', 'min_timeslice', 'is_arr_mode_supported', 'max_frequency_for_arr', 'min_frequency_for_arr', 'max_avg_factor_for_arr', 'min_avg_factor_for_arr'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32], + 'formats': [(_numpy.uint32, 3), _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32], 'offsets': [ (&(pod.supportedSchedulers)) - (&pod), (&(pod.maxTimeslice)) - (&pod), @@ -9233,7 +9233,7 @@ cdef _get_hwbc_entry_dtype_offsets(): cdef nvmlHwbcEntry_t pod = nvmlHwbcEntry_t() return _numpy.dtype({ 'names': ['hwbc_id', 'firmware_version'], - 'formats': [_numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, (_numpy.int8, 32)], 'offsets': [ (&(pod.hwbcId)) - (&pod), (&(pod.firmwareVersion)) - (&pod), @@ -9377,7 +9377,7 @@ cdef _get_led_state_dtype_offsets(): cdef nvmlLedState_t pod = nvmlLedState_t() return _numpy.dtype({ 'names': ['cause', 'color'], - 'formats': [_numpy.int8, _numpy.int32], + 'formats': [(_numpy.int8, 256), _numpy.int32], 'offsets': [ (&(pod.cause)) - (&pod), (&(pod.color)) - (&pod), @@ -9513,7 +9513,7 @@ cdef _get_unit_info_dtype_offsets(): cdef nvmlUnitInfo_t pod = nvmlUnitInfo_t() return _numpy.dtype({ 'names': ['name', 'id', 'serial', 'firmware_version'], - 
'formats': [_numpy.int8, _numpy.int8, _numpy.int8, _numpy.int8], + 'formats': [(_numpy.int8, 96), (_numpy.int8, 96), (_numpy.int8, 96), (_numpy.int8, 96)], 'offsets': [ (&(pod.name)) - (&pod), (&(pod.id)) - (&pod), @@ -9685,7 +9685,7 @@ cdef _get_psu_info_dtype_offsets(): cdef nvmlPSUInfo_t pod = nvmlPSUInfo_t() return _numpy.dtype({ 'names': ['state', 'current', 'voltage', 'power'], - 'formats': [_numpy.int8, _numpy.uint32, _numpy.uint32, _numpy.uint32], + 'formats': [(_numpy.int8, 256), _numpy.uint32, _numpy.uint32, _numpy.uint32], 'offsets': [ (&(pod.state)) - (&pod), (&(pod.current)) - (&pod), @@ -10155,11 +10155,157 @@ cdef class EventData: return obj +cdef _get_system_event_data_v1_dtype_offsets(): + cdef nvmlSystemEventData_v1_t pod = nvmlSystemEventData_v1_t() + return _numpy.dtype({ + 'names': ['event_type', 'gpu_id'], + 'formats': [_numpy.uint64, _numpy.uint32], + 'offsets': [ + (&(pod.eventType)) - (&pod), + (&(pod.gpuId)) - (&pod), + ], + 'itemsize': sizeof(nvmlSystemEventData_v1_t), + }) + +system_event_data_v1_dtype = _get_system_event_data_v1_dtype_offsets() + +cdef class SystemEventData_v1: + """Empty-initialize an array of `nvmlSystemEventData_v1_t`. + + The resulting object is of length `size` and of dtype `system_event_data_v1_dtype`. + If default-constructed, the instance represents a single struct. + + Args: + size (int): number of structs, default=1. + + + .. 
seealso:: `nvmlSystemEventData_v1_t` + """ + cdef: + readonly object _data + + + + def __init__(self, size=1): + arr = _numpy.empty(size, dtype=system_event_data_v1_dtype) + self._data = arr.view(_numpy.recarray) + assert self._data.itemsize == sizeof(nvmlSystemEventData_v1_t), \ + f"itemsize {self._data.itemsize} mismatches struct size { sizeof(nvmlSystemEventData_v1_t) }" + + def __repr__(self): + if self._data.size > 1: + return f"<{__name__}.SystemEventData_v1_Array_{self._data.size} object at {hex(id(self))}>" + else: + return f"<{__name__}.SystemEventData_v1 object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return self._data.ctypes.data + + cdef intptr_t _get_ptr(self): + return self._data.ctypes.data + + def __int__(self): + if self._data.size > 1: + raise TypeError("int() argument must be a bytes-like object of size 1. " + "To get the pointer address of an array, use .ptr") + return self._data.ctypes.data + + def __len__(self): + return self._data.size + + def __eq__(self, other): + cdef object self_data = self._data + if (not isinstance(other, SystemEventData_v1)) or self_data.size != other._data.size or self_data.dtype != other._data.dtype: + return False + return bool((self_data == other._data).all()) + + @property + def event_type(self): + """Union[~_numpy.uint64, int]: Information about what specific system event occurred.""" + if self._data.size == 1: + return int(self._data.event_type[0]) + return self._data.event_type + + @event_type.setter + def event_type(self, val): + self._data.event_type = val + + @property + def gpu_id(self): + """Union[~_numpy.uint32, int]: gpuId in PCI format""" + if self._data.size == 1: + return int(self._data.gpu_id[0]) + return self._data.gpu_id + + @gpu_id.setter + def gpu_id(self, val): + self._data.gpu_id = val + + def __getitem__(self, key): + cdef ssize_t key_ + cdef ssize_t size + if isinstance(key, int): + key_ = key + size = 
self._data.size + if key_ >= size or key_ <= -(size+1): + raise IndexError("index is out of bounds") + if key_ < 0: + key_ += size + return SystemEventData_v1.from_data(self._data[key_:key_+1]) + out = self._data[key] + if isinstance(out, _numpy.recarray) and out.dtype == system_event_data_v1_dtype: + return SystemEventData_v1.from_data(out) + return out + + def __setitem__(self, key, val): + self._data[key] = val + + @staticmethod + def from_data(data): + """Create an SystemEventData_v1 instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a 1D array of dtype `system_event_data_v1_dtype` holding the data. + """ + cdef SystemEventData_v1 obj = SystemEventData_v1.__new__(SystemEventData_v1) + if not isinstance(data, _numpy.ndarray): + raise TypeError("data argument must be a NumPy ndarray") + if data.ndim != 1: + raise ValueError("data array must be 1D") + if data.dtype != system_event_data_v1_dtype: + raise ValueError("data array must be of dtype system_event_data_v1_dtype") + obj._data = data.view(_numpy.recarray) + + return obj + + @staticmethod + def from_ptr(intptr_t ptr, size_t size=1, bint readonly=False): + """Create an SystemEventData_v1 instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + size (int): number of structs, default=1. + readonly (bool): whether the data is read-only (to the user). default is `False`. 
+ """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef SystemEventData_v1 obj = SystemEventData_v1.__new__(SystemEventData_v1) + cdef flag = cpython.buffer.PyBUF_READ if readonly else cpython.buffer.PyBUF_WRITE + cdef object buf = cpython.memoryview.PyMemoryView_FromMemory( + ptr, sizeof(nvmlSystemEventData_v1_t) * size, flag) + data = _numpy.ndarray(size, buffer=buf, dtype=system_event_data_v1_dtype) + obj._data = data.view(_numpy.recarray) + + return obj + + cdef _get_accounting_stats_dtype_offsets(): cdef nvmlAccountingStats_t pod = nvmlAccountingStats_t() return _numpy.dtype({ 'names': ['gpu_utilization', 'memory_utilization', 'max_memory_usage', 'time', 'start_time', 'is_running', 'reserved'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint64, _numpy.uint64, _numpy.uint64, _numpy.uint32, _numpy.uint32], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint64, _numpy.uint64, _numpy.uint64, _numpy.uint32, (_numpy.uint32, 5)], 'offsets': [ (&(pod.gpuUtilization)) - (&pod), (&(pod.memoryUtilization)) - (&pod), @@ -11544,7 +11690,7 @@ cdef _get_conf_compute_gpu_certificate_dtype_offsets(): cdef nvmlConfComputeGpuCertificate_t pod = nvmlConfComputeGpuCertificate_t() return _numpy.dtype({ 'names': ['cert_chain_size', 'attestation_cert_chain_size', 'cert_chain', 'attestation_cert_chain'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint8, _numpy.uint8], + 'formats': [_numpy.uint32, _numpy.uint32, (_numpy.uint8, 4096), (_numpy.uint8, 5120)], 'offsets': [ (&(pod.certChainSize)) - (&pod), (&(pod.attestationCertChainSize)) - (&pod), @@ -11708,7 +11854,7 @@ cdef _get_conf_compute_gpu_attestation_report_dtype_offsets(): cdef nvmlConfComputeGpuAttestationReport_t pod = nvmlConfComputeGpuAttestationReport_t() return _numpy.dtype({ 'names': ['is_cec_attestation_report_present', 'attestation_report_size', 'cec_attestation_report_size', 'nonce', 'attestation_report', 'cec_attestation_report'], - 'formats': [_numpy.uint32, _numpy.uint32, 
_numpy.uint32, _numpy.uint8, _numpy.uint8, _numpy.uint8], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, (_numpy.uint8, 32), (_numpy.uint8, 8192), (_numpy.uint8, 4096)], 'offsets': [ (&(pod.isCecAttestationReportPresent)) - (&pod), (&(pod.attestationReportSize)) - (&pod), @@ -12032,7 +12178,7 @@ cdef _get_gpu_fabric_info_v2_dtype_offsets(): cdef nvmlGpuFabricInfo_v2_t pod = nvmlGpuFabricInfo_v2_t() return _numpy.dtype({ 'names': ['version', 'cluster_uuid', 'status', 'clique_id', 'state', 'health_mask'], - 'formats': [_numpy.uint32, _numpy.uint8, _numpy.int32, _numpy.uint32, _numpy.uint8, _numpy.uint32], + 'formats': [_numpy.uint32, (_numpy.uint8, 16), _numpy.int32, _numpy.uint32, _numpy.uint8, _numpy.uint32], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.clusterUuid)) - (&pod), @@ -12216,7 +12362,7 @@ cdef _get_nvlink_supported_bw_modes_v1_dtype_offsets(): cdef nvmlNvlinkSupportedBwModes_v1_t pod = nvmlNvlinkSupportedBwModes_v1_t() return _numpy.dtype({ 'names': ['version', 'bw_modes', 'total_bw_modes'], - 'formats': [_numpy.uint32, _numpy.uint8, _numpy.uint8], + 'formats': [_numpy.uint32, (_numpy.uint8, 23), _numpy.uint8], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.bwModes)) - (&pod), @@ -12784,7 +12930,7 @@ cdef _get_vgpu_metadata_dtype_offsets(): cdef nvmlVgpuMetadata_t pod = nvmlVgpuMetadata_t() return _numpy.dtype({ 'names': ['version', 'revision', 'guest_info_state', 'guest_driver_version', 'host_driver_version', 'reserved', 'vgpu_virtualization_caps', 'guest_vgpu_version', 'opaque_data_size', 'opaque_data'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.int32, _numpy.int8, _numpy.int8, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.int32, (_numpy.int8, 80), (_numpy.int8, 80), (_numpy.uint32, 6), _numpy.uint32, _numpy.uint32, _numpy.uint32, (_numpy.int8, 4)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.revision)) - (&pod), @@ -13291,7 +13437,7 @@ cdef 
_get_gpu_instance_profile_info_v2_dtype_offsets(): cdef nvmlGpuInstanceProfileInfo_v2_t pod = nvmlGpuInstanceProfileInfo_v2_t() return _numpy.dtype({ 'names': ['version', 'id', 'is_p2p_supported', 'slice_count', 'instance_count', 'multiprocessor_count', 'copy_engine_count', 'decoder_count', 'encoder_count', 'jpeg_count', 'ofa_count', 'memory_size_mb', 'name'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint64, _numpy.int8], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint64, (_numpy.int8, 96)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.id)) - (&pod), @@ -13559,7 +13705,7 @@ cdef _get_gpu_instance_profile_info_v3_dtype_offsets(): cdef nvmlGpuInstanceProfileInfo_v3_t pod = nvmlGpuInstanceProfileInfo_v3_t() return _numpy.dtype({ 'names': ['version', 'id', 'slice_count', 'instance_count', 'multiprocessor_count', 'copy_engine_count', 'decoder_count', 'encoder_count', 'jpeg_count', 'ofa_count', 'memory_size_mb', 'name', 'capabilities'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint64, _numpy.int8, _numpy.uint32], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint64, (_numpy.int8, 96), _numpy.uint32], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.id)) - (&pod), @@ -13973,7 +14119,7 @@ cdef _get_compute_instance_profile_info_v2_dtype_offsets(): cdef nvmlComputeInstanceProfileInfo_v2_t pod = nvmlComputeInstanceProfileInfo_v2_t() return _numpy.dtype({ 'names': ['version', 'id', 'slice_count', 'instance_count', 'multiprocessor_count', 
'shared_copy_engine_count', 'shared_decoder_count', 'shared_encoder_count', 'shared_jpeg_count', 'shared_ofa_count', 'name'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, (_numpy.int8, 96)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.id)) - (&pod), @@ -14217,7 +14363,7 @@ cdef _get_compute_instance_profile_info_v3_dtype_offsets(): cdef nvmlComputeInstanceProfileInfo_v3_t pod = nvmlComputeInstanceProfileInfo_v3_t() return _numpy.dtype({ 'names': ['version', 'id', 'slice_count', 'instance_count', 'multiprocessor_count', 'shared_copy_engine_count', 'shared_decoder_count', 'shared_encoder_count', 'shared_jpeg_count', 'shared_ofa_count', 'name', 'capabilities'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.int8, _numpy.uint32], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, (_numpy.int8, 96), _numpy.uint32], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.id)) - (&pod), @@ -15495,7 +15641,7 @@ cdef _get_gpu_fabric_info_v3_dtype_offsets(): cdef nvmlGpuFabricInfo_v3_t pod = nvmlGpuFabricInfo_v3_t() return _numpy.dtype({ 'names': ['version', 'cluster_uuid', 'status', 'clique_id', 'state', 'health_mask', 'health_summary'], - 'formats': [_numpy.uint32, _numpy.uint8, _numpy.int32, _numpy.uint32, _numpy.uint8, _numpy.uint32, _numpy.uint8], + 'formats': [_numpy.uint32, (_numpy.uint8, 16), _numpy.int32, _numpy.uint32, _numpy.uint8, _numpy.uint32, _numpy.uint8], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.clusterUuid)) - (&pod), @@ -16351,7 
+16497,7 @@ cdef _get_excluded_device_info_dtype_offsets(): cdef nvmlExcludedDeviceInfo_t pod = nvmlExcludedDeviceInfo_t() return _numpy.dtype({ 'names': ['pci_info', 'uuid'], - 'formats': [pci_info_dtype, _numpy.int8], + 'formats': [pci_info_dtype, (_numpy.int8, 80)], 'offsets': [ (&(pod.pciInfo)) - (&pod), (&(pod.uuid)) - (&pod), @@ -16641,7 +16787,7 @@ cdef _get_bridge_chip_hierarchy_dtype_offsets(): cdef nvmlBridgeChipHierarchy_t pod = nvmlBridgeChipHierarchy_t() return _numpy.dtype({ 'names': ['bridge_count', 'bridge_chip_info'], - 'formats': [_numpy.uint8, bridge_chip_info_dtype], + 'formats': [_numpy.uint8, (bridge_chip_info_dtype, 128)], 'offsets': [ (&(pod.bridgeCount)) - (&pod), (&(pod.bridgeChipInfo)) - (&pod), @@ -17475,7 +17621,7 @@ cdef _get_gpu_thermal_settings_dtype_offsets(): cdef nvmlGpuThermalSettings_t pod = nvmlGpuThermalSettings_t() return _numpy.dtype({ 'names': ['count', 'sensor'], - 'formats': [_numpy.uint32, _py_anon_pod0_dtype], + 'formats': [_numpy.uint32, (_py_anon_pod0_dtype, 3)], 'offsets': [ (&(pod.count)) - (&pod), (&(pod.sensor)) - (&pod), @@ -17610,7 +17756,7 @@ cdef _get_clk_mon_status_dtype_offsets(): cdef nvmlClkMonStatus_t pod = nvmlClkMonStatus_t() return _numpy.dtype({ 'names': ['b_global_status', 'clk_mon_list_size', 'clk_mon_list'], - 'formats': [_numpy.uint32, _numpy.uint32, clk_mon_fault_info_dtype], + 'formats': [_numpy.uint32, _numpy.uint32, (clk_mon_fault_info_dtype, 32)], 'offsets': [ (&(pod.bGlobalStatus)) - (&pod), (&(pod.clkMonListSize)) - (&pod), @@ -17910,7 +18056,7 @@ cdef _get_gpu_dynamic_pstates_info_dtype_offsets(): cdef nvmlGpuDynamicPstatesInfo_t pod = nvmlGpuDynamicPstatesInfo_t() return _numpy.dtype({ 'names': ['flags_', 'utilization'], - 'formats': [_numpy.uint32, _py_anon_pod1_dtype], + 'formats': [_numpy.uint32, (_py_anon_pod1_dtype, 8)], 'offsets': [ (&(pod.flags)) - (&pod), (&(pod.utilization)) - (&pod), @@ -18601,7 +18747,7 @@ cdef _get_grid_licensable_feature_dtype_offsets(): cdef 
nvmlGridLicensableFeature_t pod = nvmlGridLicensableFeature_t() return _numpy.dtype({ 'names': ['feature_code', 'feature_state', 'license_info', 'product_name', 'feature_enabled', 'license_expiry'], - 'formats': [_numpy.int32, _numpy.uint32, _numpy.int8, _numpy.int8, _numpy.uint32, grid_license_expiry_dtype], + 'formats': [_numpy.int32, _numpy.uint32, (_numpy.int8, 128), (_numpy.int8, 128), _numpy.uint32, grid_license_expiry_dtype], 'offsets': [ (&(pod.featureCode)) - (&pod), (&(pod.featureState)) - (&pod), @@ -18789,7 +18935,7 @@ cdef _get_unit_fan_speeds_dtype_offsets(): cdef nvmlUnitFanSpeeds_t pod = nvmlUnitFanSpeeds_t() return _numpy.dtype({ 'names': ['fans', 'count'], - 'formats': [unit_fan_info_dtype, _numpy.uint32], + 'formats': [(unit_fan_info_dtype, 24), _numpy.uint32], 'offsets': [ (&(pod.fans)) - (&pod), (&(pod.count)) - (&pod), @@ -18924,7 +19070,7 @@ cdef _get_vgpu_pgpu_metadata_dtype_offsets(): cdef nvmlVgpuPgpuMetadata_t pod = nvmlVgpuPgpuMetadata_t() return _numpy.dtype({ 'names': ['version', 'revision', 'host_driver_version', 'pgpu_virtualization_caps', 'reserved', 'host_supported_vgpu_range', 'opaque_data_size', 'opaque_data'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.int8, _numpy.uint32, _numpy.uint32, vgpu_version_dtype, _numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, _numpy.uint32, (_numpy.int8, 80), _numpy.uint32, (_numpy.uint32, 5), vgpu_version_dtype, _numpy.uint32, (_numpy.int8, 4)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.revision)) - (&pod), @@ -19593,7 +19739,7 @@ cdef _get_nvlink_firmware_info_dtype_offsets(): cdef nvmlNvlinkFirmwareInfo_t pod = nvmlNvlinkFirmwareInfo_t() return _numpy.dtype({ 'names': ['firmware_version', 'num_valid_entries'], - 'formats': [nvlink_firmware_version_dtype, _numpy.uint32], + 'formats': [(nvlink_firmware_version_dtype, 100), _numpy.uint32], 'offsets': [ (&(pod.firmwareVersion)) - (&pod), (&(pod.numValidEntries)) - (&pod), @@ -20039,7 +20185,7 @@ cdef 
_get_vgpu_scheduler_log_dtype_offsets(): cdef nvmlVgpuSchedulerLog_t pod = nvmlVgpuSchedulerLog_t() return _numpy.dtype({ 'names': ['engine_id', 'scheduler_policy', 'arr_mode', 'scheduler_params', 'entries_count', 'log_entries'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, vgpu_scheduler_params_dtype, _numpy.uint32, vgpu_scheduler_log_entry_dtype], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, vgpu_scheduler_params_dtype, _numpy.uint32, (vgpu_scheduler_log_entry_dtype, 200)], 'offsets': [ (&(pod.engineId)) - (&pod), (&(pod.schedulerPolicy)) - (&pod), @@ -20537,7 +20683,7 @@ cdef _get_vgpu_scheduler_log_info_v1_dtype_offsets(): cdef nvmlVgpuSchedulerLogInfo_v1_t pod = nvmlVgpuSchedulerLogInfo_v1_t() return _numpy.dtype({ 'names': ['version', 'engine_id', 'scheduler_policy', 'arr_mode', 'scheduler_params', 'entries_count', 'log_entries'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, vgpu_scheduler_params_dtype, _numpy.uint32, vgpu_scheduler_log_entry_dtype], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, vgpu_scheduler_params_dtype, _numpy.uint32, (vgpu_scheduler_log_entry_dtype, 200)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.engineId)) - (&pod), @@ -20902,7 +21048,7 @@ cdef _get_grid_licensable_features_dtype_offsets(): cdef nvmlGridLicensableFeatures_t pod = nvmlGridLicensableFeatures_t() return _numpy.dtype({ 'names': ['is_grid_license_supported', 'licensable_features_count', 'grid_licensable_features'], - 'formats': [_numpy.int32, _numpy.uint32, grid_licensable_feature_dtype], + 'formats': [_numpy.int32, _numpy.uint32, (grid_licensable_feature_dtype, 3)], 'offsets': [ (&(pod.isGridLicenseSupported)) - (&pod), (&(pod.licensableFeaturesCount)) - (&pod), @@ -24913,58 +25059,6 @@ cpdef event_set_free(intptr_t set): check_status(__status__) -cpdef system_event_set_create(intptr_t request): - """Create an empty set of system events. 
Event set should be freed by ``nvmlSystemEventSetFree``. - - Args: - request (intptr_t): Reference to nvmlSystemEventSetCreateRequest_t. - - .. seealso:: `nvmlSystemEventSetCreate` - """ - with nogil: - __status__ = nvmlSystemEventSetCreate(request) - check_status(__status__) - - -cpdef system_event_set_free(intptr_t request): - """Releases system event set. - - Args: - request (intptr_t): Reference to nvmlSystemEventSetFreeRequest_t. - - .. seealso:: `nvmlSystemEventSetFree` - """ - with nogil: - __status__ = nvmlSystemEventSetFree(request) - check_status(__status__) - - -cpdef system_register_events(intptr_t request): - """Starts recording of events on system and add the events to specified ``nvmlSystemEventSet_t``. - - Args: - request (intptr_t): Reference to the struct nvmlSystemRegisterEventRequest_t. - - .. seealso:: `nvmlSystemRegisterEvents` - """ - with nogil: - __status__ = nvmlSystemRegisterEvents(request) - check_status(__status__) - - -cpdef system_event_set_wait(intptr_t request): - """Waits on system events and delivers events. - - Args: - request (intptr_t): Reference in which to nvmlSystemEventSetWaitRequest_t. - - .. seealso:: `nvmlSystemEventSetWait` - """ - with nogil: - __status__ = nvmlSystemEventSetWait(request) - check_status(__status__) - - cpdef device_modify_drain_state(intptr_t pci_info, int new_state): """Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests. Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before this call is made. Must be called as administrator. For Linux only. 
@@ -27299,10 +27393,16 @@ cdef FieldValue _cast_field_values(values): values_ = FieldValue(valuesCount) for i, v in enumerate(values): if isinstance(v, tuple): + if len(v) != 2: + raise ValueError("FieldValue tuple must be of length 2") + if not isinstance(v[0], int) or not isinstance(v[1], int): + raise ValueError("FieldValue tuple elements must be integers") values_[i].field_id = v[0] values_[i].scope_id = v[1] - else: + elif isinstance(v, int): values_[i].field_id = v + else: + raise ValueError("Each entry must be an integer field ID, or a tuple of (field ID, scope ID)") return values_ @@ -27901,3 +28001,63 @@ cpdef object device_get_nvlink_info(intptr_t device): __status__ = nvmlDeviceGetNvLinkInfo(device, info) check_status(__status__) return info_v1_py + + +cpdef intptr_t system_event_set_create(): + """Create an empty set of system events. Event set should be freed by ``nvmlSystemEventSetFree``.""" + cdef nvmlSystemEventSetCreateRequest_v1_t[1] request + with nogil: + request[0].version = sizeof(nvmlSystemEventSetCreateRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemEventSetCreate(request) + check_status(__status__) + return (request.set) + + +cpdef system_event_set_free(intptr_t event_set): + """Releases system event set.""" + cdef nvmlSystemEventSetFreeRequest_v1_t[1] request + request[0].set = event_set + with nogil: + request[0].version = sizeof(nvmlSystemEventSetFreeRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemEventSetFree(request) + check_status(__status__) + + +cpdef system_register_events(intptr_t event_set, unsigned long long event_types): + """Starts recording of events on system and add the events to specified ``nvmlSystemEventSet_t``. + + Args: + event_set (intptr_t): The system event set handle. + event_types (unsigned long long): Bitmask of nvmlSystemEventType_t values representing the events to register.
+ """ + cdef nvmlSystemRegisterEventRequest_v1_t[1] request + request[0].set = event_set + request[0].eventTypes = event_types + with nogil: + request[0].version = sizeof(nvmlSystemRegisterEventRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemRegisterEvents(request) + check_status(__status__) + + +cpdef object system_event_set_wait(intptr_t event_set, unsigned int timeout_ms, unsigned int buffer_size): + """Waits for events to occur on the system event set. + + Args: + event_set (intptr_t): The system event set handle. + timeout_ms (unsigned int): The maximum amount of time in milliseconds to wait for an event. + buffer_size (unsigned int): The size of the event buffer. + + Returns: + SystemEvent: The system event that occurred. + """ + cdef nvmlSystemEventSetWaitRequest_v1_t[1] request + cdef SystemEventData_v1 event_data = SystemEventData_v1(buffer_size) + request[0].timeoutms = timeout_ms + request[0].set = event_set + request[0].data = (event_data._get_ptr()) + request[0].dataSize = buffer_size + with nogil: + request[0].version = sizeof(nvmlSystemEventSetWaitRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemEventSetWait(request) + check_status(__status__) + return SystemEventData_v1.from_ptr(event_data._get_ptr(), size=request.numEvent) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 2b5ec242e8..efad2f8f7b 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport intptr_t +from libc.stdint cimport intptr_t, uint64_t from libc.math cimport ceil from multiprocessing import cpu_count @@ -15,6 +15,9 @@ from ._nvml_context cimport initialize include "_device_utils.pxi" +EventType = nvml.EventType + + class DeviceArchitecture: """ Device architecture enumeration. 
@@ -171,6 +174,138 @@ cdef class PciInfo: return self._pci_info.pci_device_id >> 16 +cdef class EventData: + """ + Data about a single event. + """ + def __init__(self, event_data: nvml.EventData): + self._event_data = event_data + + @property + def device(self) -> Device: + """ + The device on which the event occurred. + """ + return Device(handle=self._event_data.device) + + @property + def event_type(self) -> EventType: + """ + The type of event that was triggered. + """ + return EventType(self._event_data.event_type) + + @property + def event_data(self) -> int: + """ + Returns Xid error for the device in the event of + :member:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`. + + Raises :class:`ValueError` for other event types. + """ + if self.event_type != EventType.EVENT_TYPE_XID_CRITICAL_ERROR: + raise ValueError("event_data is only available for Xid critical error events.") + return self._event_data.event_data + + @property + def gpu_instance_id(self) -> int: + """ + The GPU instance ID for MIG devices. + + Only valid for events of type :member:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`. + + Raises :class:`ValueError` for other event types. + """ + if self.event_type != EventType.EVENT_TYPE_XID_CRITICAL_ERROR: + raise ValueError("gpu_instance_id is only available for Xid critical error events.") + return self._event_data.gpu_instance_id + + @property + def compute_instance_id(self) -> int: + """ + The Compute instance ID for MIG devices. + + Only valid for events of type :member:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`. + + Raises :class:`ValueError` for other event types. + """ + if self.event_type != EventType.EVENT_TYPE_XID_CRITICAL_ERROR: + raise ValueError("compute_instance_id is only available for Xid critical error events.") + return self._event_data.compute_instance_id + + +cdef class DeviceEvents: + """ + Represents a set of events that can be waited on for a specific device. 
+ """ + cdef intptr_t _event_set + cdef intptr_t _device_handle + + def __init__(self, device_handle: intptr_t, events: EventType | int | list[EventType | int]): + cdef unsigned long long event_bitmask + if isinstance(events, (int, EventType)): + event_bitmask = int(events) + elif isinstance(events, list): + event_bitmask = 0 + for ev in events: + event_bitmask |= int(ev) + else: + raise TypeError("events must be an EventType, int, or list of EventType or int") + + self._device_handle = device_handle + self._event_set = nvml.event_set_create() + # If this raises, the event needs to be freed and this is handled by + # this class's __dealloc__ method. + nvml.device_register_events(self._device_handle, event_bitmask, self._event_set) + + def __dealloc__(self): + nvml.event_set_free(self._event_set) + + def wait(self, timeout_ms: int = 0) -> None: + """ + Wait for events in the event set. + + For Fermi™ or newer fully supported devices. + + If some events are ready to be delivered at the time of the call, + function returns immediately. If there are no events ready to be + delivered, function sleeps until event arrives but not longer than + specified timeout. If timeout passes, a + :class:`cuda.core.system.TimeoutError` is raised. This function in + certain conditions can return before specified timeout passes (e.g. when + interrupt arrives). + + On Windows, in case of Xid error, the function returns the most recent + Xid error type seen by the system. If there are multiple Xid errors + generated before ``wait`` is invoked, then the last seen Xid + error type is returned for all Xid error events. + + On Linux, every Xid error event would return the associated event data + and other information if applicable. + + In MIG mode, if device handle is provided, the API reports all the + events for the available instances, only if the caller has appropriate + privileges. In absence of required privileges, only the events which + affect all the instances (i.e. 
whole device) are reported. + + This API does not currently support per-instance event reporting using + MIG device handles. + + Parameters + ---------- + timeout_ms: int + The timeout in milliseconds. A value of 0 means to wait indefinitely. + + Raises + ------ + cuda.core.system.TimeoutError + If the timeout expires before an event is received. + cuda.core.system.GpuIsLostError + If the GPU has fallen off the bus or is otherwise inaccessible. + """ + return EventData(nvml.event_set_wait_v2(self._event_set, timeout_ms)) + + cdef class Device: """ Representation of a device. @@ -194,20 +329,25 @@ cdef class Device: cdef intptr_t _handle - def __init__(self, index: int | None = None, uuid: bytes | str | None = None): + def __init__(self, index: int | None = None, uuid: bytes | str | None = None, handle: int | None = None): initialize() - if index is not None and uuid is not None: - raise ValueError("Handle requires only one of either device `index` or `uuid`.") - if index is None and uuid is None: + args = [index, uuid, handle] + cdef int arg_count = sum(arg is not None for arg in args) + + if arg_count > 1: + raise ValueError("Handle requires only one of `index`, `uuid` or `handle`.") + if arg_count == 0: raise ValueError("Handle requires either a device `index` or `uuid`.") if index is not None: self._handle = nvml.device_get_handle_by_index_v2(index) - else: + elif uuid is not None: if isinstance(uuid, bytes): uuid = uuid.decode("ascii") self._handle = nvml.device_get_handle_by_uuid(uuid) + elif handle is not None: + self._handle = handle @property def handle(self) -> int: @@ -313,11 +453,72 @@ cdef class Device: """ return nvml.device_get_uuid(self._handle) + def register_events(self, events: EventType | int | list[EventType | int]) -> DeviceEvents: + """ + Starts recording events on this device. + + For Fermi™ or newer fully supported devices. For Linux only. + + ECC events are available only on ECC-enabled devices (see + :meth:`Device.get_total_ecc_errors`). 
Power capping events are + available only on Power Management enabled devices (see + :meth:`Device.get_power_management_mode`). + + This call starts recording of events on specific device. All events + that occurred before this call are not recorded. Wait for events using + the :meth:`DeviceEvents.wait` method on the result. + + Examples + -------- + >>> device = Device(index=0) + >>> events = device.register_events([ + ... EventType.EVENT_TYPE_XID_CRITICAL_ERROR, + ... ]) + >>> while event := events.wait(timeout_ms=10000): + ... print(f"Event {event.event_type} occurred on device {event.device.uuid}") + + Parameters + ---------- + events: EventType, int, or list of EventType or int + The event type or list of event types to register for this device. + + Returns + ------- + :class:`DeviceEvents` + An object representing the registered events. Call + :meth:`DeviceEvents.wait` on this object to wait for events. + + Raises + ------ + :class:`cuda.core.system.NotSupportedError` + None of the requested event types are registered. + """ + return DeviceEvents(self._handle, events) + + def get_supported_event_types(self) -> list[EventType]: + """ + Get the list of event types supported by this device. + + For Fermi™ or newer fully supported devices. For Linux only (returns an + empty list on Windows). + + Returns + ------- + list[EventType] + The list of supported event types. 
+ """ + cdef uint64_t[1] bitmask + bitmask[0] = nvml.device_get_supported_event_types(self._handle) + + return [EventType(ev) for ev in _unpack_bitmask(bitmask)] __all__ = [ "BAR1MemoryInfo", "Device", "DeviceArchitecture", + "DeviceEvents", + "EventData", + "EventType", "MemoryInfo", "PciInfo", ] diff --git a/cuda_core/cuda/core/system/_device_utils.pxi b/cuda_core/cuda/core/system/_device_utils.pxi index 6d7a150e8f..a9eccc73cc 100644 --- a/cuda_core/cuda/core/system/_device_utils.pxi +++ b/cuda_core/cuda/core/system/_device_utils.pxi @@ -2,28 +2,12 @@ # # SPDX-License-Identifier: Apache-2.0 -from cpython cimport array from libc.stdint cimport uint64_t cpdef inline list[int] _unpack_bitmask(uint64_t[:] arr): """ Unpack a list of integers containing bitmasks. - - Parameters - ---------- - x: list of int - A list of integers - - Examples - -------- - >>> from cuda.core.system.utils import unpack_bitmask - >>> unpack_bitmask([1 + 2 + 8]) - [0, 1, 3] - >>> unpack_bitmask([1 + 2 + 16]) - [0, 1, 4] - >>> unpack_bitmask([1 + 2 + 16, 2 + 4]) - [0, 1, 4, 65, 66] """ cdef uint64_t i, j, idx cdef int mask_bits = 64 diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 13e1f43a2f..bbddd6098a 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -80,6 +80,9 @@ CUDA system information and NVIDIA Management Library (NVML) system.Device system.DeviceArchitecture + system.DeviceEvents + system.EventData + system.EventType system.MemoryInfo system.BAR1MemoryInfo system.PciInfo diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 134ea7cbbe..4ad12eceb4 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -189,3 +189,27 @@ def test_unpack_bitmask(params): def test_unpack_bitmask_single_value(): with pytest.raises(TypeError): system_device._unpack_bitmask(1) + + +def test_register_events(): + # This is not the 
world's greatest test. All of the events are pretty + # infrequent and hard to simulate. So all we do here is register an event, + # wait with a timeout, and ensure that we get no event (since we didn't do + # anything to trigger one). + + # Also, some hardware doesn't support any event types. + + for device in system.Device.get_all_devices(): + supported_events = device.get_supported_event_types() + assert isinstance(supported_events, list) + assert all(isinstance(ev, system_device.EventType) for ev in supported_events) + + for device in system.Device.get_all_devices(): + events = device.register_events([]) + with pytest.raises(system.TimeoutError): + events.wait(timeout_ms=500) + + for device in system.Device.get_all_devices(): + events = device.register_events(0) + with pytest.raises(system.TimeoutError): + events.wait(timeout_ms=500) From 4ca5f058ac11e29b18006b230c004e3854770b9c Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 9 Jan 2026 10:14:22 -0500 Subject: [PATCH 2/9] Address copilot's comments in the PR --- cuda_bindings/cuda/bindings/_nvml.pyx | 8 ++++---- cuda_core/cuda/core/system/_device.pyx | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index e32e822109..524a7e3d0f 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -28010,11 +28010,11 @@ cpdef intptr_t system_event_set_create(): request[0].version = sizeof(nvmlSystemEventSetCreateRequest_v1_t) | (1 << 24) __status__ = nvmlSystemEventSetCreate(request) check_status(__status__) - return (request.set) + return (request[0].set) cpdef system_event_set_free(intptr_t event_set): - """Create an empty set of system events. 
Event set should be freed by ``nvmlSystemEventSetFree``.""" + """Frees an event set.""" cdef nvmlSystemEventSetFreeRequest_v1_t[1] request request[0].set = event_set with nogil: @@ -28028,7 +28028,7 @@ cpdef system_register_events(intptr_t event_set, unsigned long long event_types) Args: event_set (intptr_t): The system event set handle. - event_types (unsigned int): Bitmask of nvmlSystemEventType_t values representing the events to register. + event_types (unsigned long long): Bitmask of nvmlSystemEventType_t values representing the events to register. """ cdef nvmlSystemRegisterEventRequest_v1_t[1] request request[0].set = event_set @@ -28060,4 +28060,4 @@ cpdef object system_event_set_wait(intptr_t event_set, unsigned int timeout_ms, request[0].version = sizeof(nvmlSystemEventSetWaitRequest_v1_t) | (1 << 24) __status__ = nvmlSystemEventSetWait(request) check_status(__status__) - return SystemEventData_v1.from_ptr(event_data._get_ptr(), size=request.numEvent) + return SystemEventData_v1.from_ptr(event_data._get_ptr(), size=request[0].numEvent) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index efad2f8f7b..7eb4d807ca 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -212,7 +212,7 @@ cdef class EventData: """ The GPU instance ID for MIG devices. - Only valid for events of type :member:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`. + Only valid for events of type :attr:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`. Raises :class:`ValueError` for other event types. """ @@ -225,7 +225,7 @@ cdef class EventData: """ The Compute instance ID for MIG devices. - Only valid for events of type :member:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`. + Only valid for events of type :attr:`EventType.EVENT_TYPE_XID_CRITICAL_ERROR`. Raises :class:`ValueError` for other event types. 
""" @@ -261,7 +261,7 @@ cdef class DeviceEvents: def __dealloc__(self): nvml.event_set_free(self._event_set) - def wait(self, timeout_ms: int = 0) -> None: + def wait(self, timeout_ms: int = 0) -> EventData: """ Wait for events in the event set. @@ -298,9 +298,9 @@ cdef class DeviceEvents: Raises ------ - cuda.core.system.TimeoutError + :class:`cuda.core.system.TimeoutError` If the timeout expires before an event is received. - cuda.core.system.GpuIsLostError + :class:`cuda.core.system.GpuIsLostError` If the GPU has fallen off the bus or is otherwise inaccessible. """ return EventData(nvml.event_set_wait_v2(self._event_set, timeout_ms)) From 49820783d8943ea56c2c2a27ec41b9073e80b245 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 9 Jan 2026 11:49:14 -0500 Subject: [PATCH 3/9] Add support for system events --- cuda_bindings/cuda/bindings/_nvml.pyx | 10 +- cuda_core/cuda/core/system/__init__.py | 3 + cuda_core/cuda/core/system/_system_events.pyx | 164 ++++++++++++++++++ cuda_core/tests/system/test_system_events.py | 27 +++ 4 files changed, 199 insertions(+), 5 deletions(-) create mode 100644 cuda_core/cuda/core/system/_system_events.pyx create mode 100644 cuda_core/tests/system/test_system_events.py diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index 524a7e3d0f..3b7ce39309 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -28016,22 +28016,22 @@ cpdef intptr_t system_event_set_create(): cpdef system_event_set_free(intptr_t event_set): """Frees an event set.""" cdef nvmlSystemEventSetFreeRequest_v1_t[1] request - request[0].set = event_set + request[0].set = event_set with nogil: request[0].version = sizeof(nvmlSystemEventSetFreeRequest_v1_t) | (1 << 24) __status__ = nvmlSystemEventSetFree(request) check_status(__status__) -cpdef system_register_events(intptr_t event_set, unsigned long long event_types): +cpdef system_register_events(unsigned long long event_types, 
intptr_t event_set): """Starts recording of events on system and add the events to specified ``nvmlSystemEventSet_t``. Args: - event_set (intptr_t): The system event set handle. event_types (unsigned long long): Bitmask of nvmlSystemEventType_t values representing the events to register. + event_set (intptr_t): The system event set handle. """ cdef nvmlSystemRegisterEventRequest_v1_t[1] request - request[0].set = event_set + request[0].set = event_set request[0].eventTypes = event_types with nogil: request[0].version = sizeof(nvmlSystemRegisterEventRequest_v1_t) | (1 << 24) @@ -28053,7 +28053,7 @@ cpdef object system_event_set_wait(intptr_t event_set, unsigned int timeout_ms, cdef nvmlSystemEventSetWaitRequest_v1_t[1] request cdef SystemEventData_v1 event_data = SystemEventData_v1(buffer_size) request[0].timeoutms = timeout_ms - request[0].set = event_set + request[0].set = event_set request[0].data = (event_data._get_ptr()) request[0].dataSize = buffer_size with nogil: diff --git a/cuda_core/cuda/core/system/__init__.py b/cuda_core/cuda/core/system/__init__.py index 29dc1538da..8c1953c5d7 100644 --- a/cuda_core/cuda/core/system/__init__.py +++ b/cuda_core/cuda/core/system/__init__.py @@ -25,9 +25,12 @@ if CUDA_BINDINGS_NVML_IS_COMPATIBLE: from ._device import * from ._device import __all__ as _device_all + from ._system_events import * + from ._system_events import __all__ as _system_events_all from .exceptions import * from .exceptions import __all__ as _exceptions_all __all__.append("get_nvml_version") __all__.extend(_device_all) + __all__.extend(_system_events_all) __all__.extend(_exceptions_all) diff --git a/cuda_core/cuda/core/system/_system_events.pyx b/cuda_core/cuda/core/system/_system_events.pyx new file mode 100644 index 0000000000..4f6c646a7e --- /dev/null +++ b/cuda_core/cuda/core/system/_system_events.pyx @@ -0,0 +1,164 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + + +from libc.stdint cimport intptr_t + +from cuda.bindings import _nvml as nvml + +from ._nvml_context cimport initialize + +from . import _device + + +SystemEventType = nvml.SystemEventType + + +cdef class SystemEvent: + """ + Data about a collection of system events. + """ + def __init__(self, event_data: nvml.SystemEventData_v1): + assert len(event_data) == 1 + self._event_data = event_data + + @property + def event_type(self) -> SystemEventType: + """ + The type of event that was triggered. + """ + return SystemEventType(self._event_data.event_type) + + @property + def gpu_id(self) -> int: + """ + The GPU ID in PCI ID format. + """ + return self._event_data.gpu_id + + @property + def device(self) -> _device.Device: + """ + The device associated with this event. + """ + return _device.Device(pci_bus_id=self.gpu_id) + + +cdef class SystemEvents: + """ + Data about a collection of system events. + """ + def __init__(self, event_data: nvml.SystemEventData_v1): + self._event_data = event_data + + def __len__(self): + return len(self._event_data) + + def __getitem__(self, idx: int) -> SystemEvent: + return SystemEvent(self._event_data[idx]) + + +cdef class RegisteredSystemEvents: + """ + Represents a set of events that can be waited on for a specific device. + """ + cdef intptr_t _event_set + + def __init__(self, events: SystemEventType | int | list[SystemEventType | int]): + cdef unsigned long long event_bitmask + if isinstance(events, (int, SystemEventType)): + event_bitmask = int(events) + elif isinstance(events, list): + event_bitmask = 0 + for ev in events: + event_bitmask |= int(ev) + else: + raise TypeError("events must be an SystemEventType, int, or list of SystemEventType or int") + + initialize() + + self._event_set = nvml.system_event_set_create() + print("event set:", self._event_set) + # If this raises, the event needs to be freed and this is handled by + # this class's __dealloc__ method. 
+ nvml.system_register_events(event_bitmask, self._event_set) + + def __dealloc__(self): + nvml.system_event_set_free(self._event_set) + + def wait(self, timeout_ms: int = 0, buffer_size: int = 1) -> SystemEvents: + """ + Wait for events in the system event set. + + For Fermi™ or newer fully supported devices. + + If some events are ready to be delivered at the time of the call, + function returns immediately. If there are no events ready to be + delivered, function sleeps till event arrives but not longer than + specified timeout. If timeout passes, a + :class:`cuda.core.system.TimeoutError` is raised. This function in + certain conditions can return before specified timeout passes (e.g. when + interrupt arrives) + + Parameters + ---------- + timeout_ms: int + The timeout in milliseconds. A value of 0 means to wait indefinitely. + buffer_size: int + The maximum number of events to retrieve. Must be at least 1. + + Raises + ------ + :class:`cuda.core.system.TimeoutError` + If the timeout expires before an event is received. + :class:`cuda.core.system.GpuIsLostError` + If the GPU has fallen off the bus or is otherwise inaccessible. + """ + return SystemEvents(nvml.system_event_set_wait(self._event_set, timeout_ms, buffer_size)) + + +def register_events(events: SystemEventType | int | list[SystemEventType | int]) -> RegisteredSystemEvents: + """ + Starts recording of events on test system. + + For Linux only. + + All events that occurred before this call are not recorded. Wait for events + using the :meth:`SystemEvents.wait` method on the result. + + Examples + -------- + >>> from cuda.core import system + >>> events = system.register_events([ + ... SystemEventType.SYSTEM_EVENT_TYPE_GPU_DRIVER_UNBIND, + ... ]) + >>> while event := events.wait(timeout_ms=10000): + ... 
print(f"Event {event.event_type} occurred.") + + Parameters + ---------- + events: SystemEventType, int, or list of SsystemEventType or int + The event type or list of event types to register for this device. + + Returns + ------- + :class:`SystemEvents` + An object representing the registered events. Call + :meth:`SystemEvents.wait` on this object to wait for events. + + Raises + ------ + :class:`cuda.core.system.NotSupportedError` + None of the requested event types are registered. + """ + return RegisteredSystemEvents(events) + + +__all__ = [ + "register_events", + "RegisteredSystemEvents", + "SystemEvent", + "SystemEvents", + "SystemEventType", +] diff --git a/cuda_core/tests/system/test_system_events.py b/cuda_core/tests/system/test_system_events.py new file mode 100644 index 0000000000..4db87ce90e --- /dev/null +++ b/cuda_core/tests/system/test_system_events.py @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ruff: noqa: E402 + +from .conftest import skip_if_nvml_unsupported + +pytestmark = skip_if_nvml_unsupported + +import helpers +import pytest +from cuda.core import system + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="System events not supported on WSL or Windows") +def test_register_events(): + # This is not the world's greatest test. All of the events are pretty + # infrequent and hard to simulate. So all we do here is register an event, + # wait with a timeout, and ensure that we get no event (since we didn't do + # anything to trigger one). + + # Also, some hardware doesn't support any event types. 
+ + events = system.register_events([system.SystemEventType.SYSTEM_EVENT_TYPE_GPU_DRIVER_UNBIND]) + with pytest.raises(system.TimeoutError): + events.wait(timeout_ms=500, buffer_size=1) From 6399c6d3f37c3974d675619de7e02eb6c1701a8e Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 9 Jan 2026 11:59:35 -0500 Subject: [PATCH 4/9] Add new classes to API docs --- cuda_core/docs/source/api.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index bbddd6098a..bacddfdff3 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -72,10 +72,17 @@ CUDA system information and NVIDIA Management Library (NVML) system.get_driver_version system.get_driver_version_full + system.get_driver_branch system.get_num_devices system.get_nvml_version system.get_process_name + system.register_events + system.RegisteredSystemEvents + system.SystemEvent + system.SystemEvents + system.SystemEventType + :template: autosummary/cyclass.rst system.Device From fe4aae46ed1b5c7016263184b8bf484ec81d8994 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 9 Jan 2026 12:02:44 -0500 Subject: [PATCH 5/9] Mark test as not working on Windows --- cuda_core/tests/system/test_system_device.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 4ad12eceb4..689dff2553 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -13,6 +13,7 @@ import re import sys +import helpers import pytest from cuda.core import system from cuda.core.system import _device as system_device @@ -191,6 +192,7 @@ def test_unpack_bitmask_single_value(): system_device._unpack_bitmask(1) +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Events not supported on WSL or Windows") def test_register_events(): # This is not the world's greatest test. 
All of the events are pretty # infrequent and hard to simulate. So all we do here is register an event, From b00918407556b576e908694bd4bce94f4b94534d Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 9 Jan 2026 12:56:53 -0500 Subject: [PATCH 6/9] Fix memory management bug --- cuda_bindings/cuda/bindings/_nvml.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index 3b7ce39309..b9c59d6637 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -27422,7 +27422,8 @@ cpdef object device_get_field_values(intptr_t device, values): __status__ = nvmlDeviceGetFieldValues(device, valuesCount, ptr) check_status(__status__) - return FieldValue.from_ptr(ptr, valuesCount) + values_._data.resize((valuesCount,)) + return values_ cpdef object device_clear_field_values(intptr_t device, values): @@ -28060,4 +28061,5 @@ cpdef object system_event_set_wait(intptr_t event_set, unsigned int timeout_ms, request[0].version = sizeof(nvmlSystemEventSetWaitRequest_v1_t) | (1 << 24) __status__ = nvmlSystemEventSetWait(request) check_status(__status__) - return SystemEventData_v1.from_ptr(event_data._get_ptr(), size=request[0].numEvent) + event_data._data.resize((request[0].numEvent,)) + return event_data From 6bbbee18588cd72a1a30bf15fb65099914ad1c87 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 11:26:18 -0500 Subject: [PATCH 7/9] Try to get more info from CI --- cuda_core/cuda/core/system/_device.pyx | 2 ++ cuda_core/tests/system/test_system_device.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index a9441df00d..615c8b055e 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -645,6 +645,8 @@ cdef class Device: """ cdef uint64_t[1] bitmask bitmask[0] = 
nvml.device_get_supported_event_types(self._handle) + print("BITMASK", bitmask[0]) + assert False return [EventType(ev) for ev in _unpack_bitmask(bitmask)] diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index a46232f286..0824575f56 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -217,6 +217,10 @@ def test_register_events(): events.wait(timeout_ms=500) +def test_event_type_parsing(): + [system.EventType(ev) for ev in _device._unpack_bitmask(array.array("Q", [3]))] + + def test_field_values(): for device in system.Device.get_all_devices(): # TODO: Are there any fields that return double's? It would be good to From 187274aac0e7804ece5c5d70cd81763ceafcc133 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 11:37:42 -0500 Subject: [PATCH 8/9] Fix tests --- cuda_core/cuda/core/system/_device.pyx | 5 +---- cuda_core/tests/system/test_system_device.py | 6 +++++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 615c8b055e..dbe16a0b66 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -645,10 +645,7 @@ cdef class Device: """ cdef uint64_t[1] bitmask bitmask[0] = nvml.device_get_supported_event_types(self._handle) - print("BITMASK", bitmask[0]) - assert False - - return [EventType(ev) for ev in _unpack_bitmask(bitmask)] + return [EventType(1 << ev) for ev in _unpack_bitmask(bitmask)] def get_field_values(self, field_ids: list[int | tuple[int, int]]) -> FieldValues: """ diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 0824575f56..26c4b263f6 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -218,7 +218,11 @@ def test_register_events(): def test_event_type_parsing(): - 
[system.EventType(ev) for ev in _device._unpack_bitmask(array.array("Q", [3]))] + events = [system.EventType(1 << ev) for ev in _device._unpack_bitmask(array.array("Q", [3]))] + assert events == [ + system.EventType.EVENT_TYPE_SINGLE_BIT_ECC_ERROR, + system.EventType.EVENT_TYPE_DOUBLE_BIT_ECC_ERROR, + ] def test_field_values(): From 03124f45c223c00be23ba82d9d4cd0eb46fc3589 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 16:18:58 -0500 Subject: [PATCH 9/9] Hide handle as an implementation detail --- cuda_core/cuda/core/system/_device.pyx | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 72c64528ea..c9a2e8f369 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -188,7 +188,9 @@ cdef class EventData: """ The device on which the event occurred. """ - return Device(handle=self._event_data.device) + device = Device.__new__() + device._handle = self._event_data.device + return device @property def event_type(self) -> EventType: @@ -555,17 +557,16 @@ cdef class Device: index: int | None = None, uuid: bytes | str | None = None, pci_bus_id: bytes | str | None = None, - handle: int | None = None ): - initialize() - - args = [index, uuid, pci_bus_id, handle] + args = [index, uuid, pci_bus_id] cdef int arg_count = sum(arg is not None for arg in args) if arg_count > 1: - raise ValueError("Handle requires only one of `index`, `uuid`, `pci_bus_id` or `handle`.") + raise ValueError("Handle requires only one of `index`, `uuid`, or `pci_bus_id`.") if arg_count == 0: - raise ValueError("Handle requires either a device `index`, `pci_bus_id`, or `uuid`.") + raise ValueError("Handle requires either a device `index`, `uuid`, or `pci_bus_id`.") + + initialize() if index is not None: self._handle = nvml.device_get_handle_by_index_v2(index) @@ -577,8 +578,6 @@ cdef class Device: if 
isinstance(pci_bus_id, bytes): pci_bus_id = pci_bus_id.decode("ascii") self._handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id) - elif handle is not None: - self._handle = handle @classmethod def get_all_devices(cls) -> Iterable[Device]: