Skip to content
Merged
81 changes: 44 additions & 37 deletions cuda_bindings/cuda/bindings/_nvml.pyx

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions cuda_bindings/tests/nvml/test_nvlink.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ def test_nvlink_get_link_count(all_devices):
"""
for device in all_devices:
fields = nvml.FieldValue(1)
fields[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT
fields[0].field_id = nvml.FieldId.DEV_NVLINK_LINK_COUNT
value = nvml.device_get_field_values(device, fields)[0]
assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
f"Unexpected return {value.nvml_return} for link count field query"
)

# Use the alternative argument to device_get_field_values
value = nvml.device_get_field_values(device, [nvml.FI.DEV_NVLINK_LINK_COUNT])[0]
value = nvml.device_get_field_values(device, [nvml.FieldId.DEV_NVLINK_LINK_COUNT])[0]
assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
f"Unexpected return {value.nvml_return} for link count field query"
)
Expand Down
181 changes: 181 additions & 0 deletions cuda_core/cuda/core/system/_device.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ from ._nvml_context cimport initialize
include "_device_utils.pxi"


# Module-level alias of the NVML field-ID enum; it is also listed in
# ``__all__`` so callers can refer to it from this module.
FieldId = nvml.FieldId


class DeviceArchitecture:
"""
Device architecture enumeration.
Expand Down Expand Up @@ -171,6 +174,141 @@ cdef class PciInfo:
return self._pci_info.pci_device_id >> 16


cdef class FieldValue:
    """
    A single field value record.

    Wraps a one-element :class:`nvml.FieldValue` and exposes its members as
    plain Python values.

    Use :meth:`Device.get_field_values` to get multiple field values at once.
    """
    cdef object _field_value

    def __init__(self, field_value: nvml.FieldValue):
        # This wrapper describes exactly one record, never a batch.
        assert len(field_value) == 1
        self._field_value = field_value

    @property
    def field_id(self) -> FieldId:
        """
        The field ID, as a :class:`FieldId` member.
        """
        raw_id = self._field_value.field_id
        return FieldId(raw_id)

    @property
    def scope_id(self) -> int:
        """
        The scope ID.
        """
        raw_scope = self._field_value.scope_id
        # The underlying member is a Numpy type, so convert it explicitly.
        return int(raw_scope)

    @property
    def timestamp(self) -> int:
        """
        The CPU timestamp (in microseconds since 1970) at which the value was
        sampled.
        """
        raw_timestamp = self._field_value.timestamp
        # The underlying member is a Numpy type, so convert it explicitly.
        return int(raw_timestamp)

    @property
    def latency_usec(self) -> int:
        """
        How long this field value took to update (in usec) within NVML. This may
        be averaged across several fields that are serviced by the same driver
        call.
        """
        raw_latency = self._field_value.latency_usec
        # The underlying member is a Numpy type, so convert it explicitly.
        return int(raw_latency)

    @property
    def value(self) -> int | float:
        """
        The field value.

        A ``float`` is returned for ``DOUBLE`` fields; every other recognised
        value type is returned as an ``int``.

        Raises
        ------
        :class:`cuda.core.system.NvmlError`
            If there was an error retrieving the field value.
        """
        cdef int kind

        # Report the per-field status first, before looking at the payload.
        nvml.check_status(self._field_value.nvml_return)

        kind = self._field_value.value_type
        payload = self._field_value.value

        kinds = nvml.ValueType

        # Floating point is the only non-integer representation.
        if kind == kinds.DOUBLE:
            return float(payload.d_val[0])

        # All remaining types are integers; pick the matching payload member.
        if kind == kinds.UNSIGNED_INT:
            member = payload.ui_val
        elif kind == kinds.UNSIGNED_LONG:
            member = payload.ul_val
        elif kind == kinds.UNSIGNED_LONG_LONG:
            member = payload.ull_val
        elif kind == kinds.SIGNED_LONG_LONG:
            member = payload.ll_val
        elif kind == kinds.SIGNED_INT:
            member = payload.si_val
        elif kind == kinds.UNSIGNED_SHORT:
            member = payload.us_val
        else:
            raise AssertionError("Unexpected value type")

        return int(member[0])


cdef class FieldValues:
    """
    Container of multiple field values.

    Supports ``len()`` and indexing; each item is returned wrapped in a
    :class:`FieldValue`.
    """
    cdef object _field_values

    def __init__(self, field_values: nvml.FieldValue):
        self._field_values = field_values

    def __getitem__(self, idx: int) -> FieldValue:
        entry = self._field_values[idx]
        return FieldValue(entry)

    def __len__(self) -> int:
        return len(self._field_values)

    def validate(self) -> None:
        """
        Check every contained field value for an error status.

        Only the first issue found, if any, is raised.

        Raises
        ------
        :class:`cuda.core.system.NvmlError`
            If any of the contained field values has an associated exception.
        """
        # TODO: This is a classic use case for an `ExceptionGroup`, but those
        # are only available in Python 3.11+.
        statuses = self._field_values.nvml_return

        # A single-element container yields one status rather than a
        # sequence of them, so it is checked directly.
        if len(self._field_values) == 1:
            nvml.check_status(statuses)
            return

        for status in statuses:
            nvml.check_status(status)

    def get_all_values(self) -> list[int | float]:
        """
        Collect the core value of every contained field.

        Each value is validated as it is read, so the first field with an
        error status raises.

        Returns
        -------
        list[int | float]
            List of all field values.

        Raises
        ------
        :class:`cuda.core.system.NvmlError`
            If any of the contained field values has an associated exception.
        """
        collected = []
        for field_value in self:
            collected.append(field_value.value)
        return collected


cdef class Device:
"""
Representation of a device.
Expand Down Expand Up @@ -313,11 +451,54 @@ cdef class Device:
"""
return nvml.device_get_uuid(self._handle)

def get_field_values(self, field_ids: list[int | tuple[int, int]]) -> FieldValues:
    """
    Query several field values from the device in one call.

    Errors are reported per field: the exception for a given field is
    raised when its ``value`` is read from the returned
    :class:`FieldValues` container, not when this method is called.

    Call :meth:`FieldValues.validate` to confirm that no field in the
    container has an error.

    Parameters
    ----------
    field_ids : list of int or tuple of (int, int)
        List of field IDs to query.

        Each item may be either a single value from the :class:`FieldId`
        enum, or a pair of (:class:`FieldId`, scope ID).

    Returns
    -------
    :class:`FieldValues`
        Container of field values corresponding to the requested field IDs.
    """
    raw_values = nvml.device_get_field_values(self._handle, field_ids)
    return FieldValues(raw_values)

def clear_field_values(self, field_ids: list[int | tuple[int, int]]) -> None:
    """
    Clear several field values on the device in one call.

    Parameters
    ----------
    field_ids : list of int or tuple of (int, int)
        List of field IDs to clear.

        Each item may be either a single value from the :class:`FieldId`
        enum, or a pair of (:class:`FieldId`, scope ID).
    """
    nvml.device_clear_field_values(self._handle, field_ids)


# Names exported as this module's public API.
__all__ = [
"BAR1MemoryInfo",
"Device",
"DeviceArchitecture",
"FieldId",
"FieldValue",
"FieldValues",
"MemoryInfo",
"PciInfo",
]
8 changes: 8 additions & 0 deletions cuda_core/cuda/core/system/_system.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ else:

if CUDA_BINDINGS_NVML_IS_COMPATIBLE:
from cuda.bindings import _nvml as nvml
# TODO: We need to be even more specific than version numbers for development.
# This can be removed once we have a release including everything we need.
for member in ["FieldId"]:
if not hasattr(nvml, member):
CUDA_BINDINGS_NVML_IS_COMPATIBLE = False
break

if CUDA_BINDINGS_NVML_IS_COMPATIBLE:
from ._nvml_context import initialize
else:
from cuda.core._utils.cuda_utils import driver, handle_return, runtime
Expand Down
3 changes: 3 additions & 0 deletions cuda_core/docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ CUDA system information and NVIDIA Management Library (NVML)

system.Device
system.DeviceArchitecture
system.FieldId
system.FieldValue
system.FieldValues
system.MemoryInfo
system.BAR1MemoryInfo
system.PciInfo
Expand Down
62 changes: 55 additions & 7 deletions cuda_core/tests/system/test_system_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@

import pytest
from cuda.core import system
from cuda.core.system import _device as system_device

if system.CUDA_BINDINGS_NVML_IS_COMPATIBLE:
from cuda.bindings import _nvml as nvml
from cuda.core.system import _device


@pytest.fixture(autouse=True, scope="module")
Expand All @@ -36,7 +36,7 @@ def test_device_architecture():
for device in system.Device.get_all_devices():
device_arch = device.architecture

assert isinstance(device_arch, system_device.DeviceArchitecture)
assert isinstance(device_arch, system.DeviceArchitecture)
if sys.version_info < (3, 12):
assert device_arch.id in nvml.DeviceArch.__members__.values()
else:
Expand All @@ -52,7 +52,7 @@ def test_device_bar1_memory():
bar1_memory_info.used,
)

assert isinstance(bar1_memory_info, system_device.BAR1MemoryInfo)
assert isinstance(bar1_memory_info, system.BAR1MemoryInfo)
assert isinstance(free, int)
assert isinstance(total, int)
assert isinstance(used, int)
Expand Down Expand Up @@ -93,7 +93,7 @@ def test_device_memory():
memory_info = device.memory_info
free, total, used, reserved = memory_info.free, memory_info.total, memory_info.used, memory_info.reserved

assert isinstance(memory_info, system_device.MemoryInfo)
assert isinstance(memory_info, system.MemoryInfo)
assert isinstance(free, int)
assert isinstance(total, int)
assert isinstance(used, int)
Expand All @@ -116,7 +116,7 @@ def test_device_name():
def test_device_pci_info():
for device in system.Device.get_all_devices():
pci_info = device.pci_info
assert isinstance(pci_info, system_device.PciInfo)
assert isinstance(pci_info, system.PciInfo)

assert isinstance(pci_info.bus_id, str)
assert re.match("[a-f0-9]{8}:[a-f0-9]{2}:[a-f0-9]{2}.[a-f0-9]", pci_info.bus_id.lower())
Expand Down Expand Up @@ -183,9 +183,57 @@ def test_device_uuid():
],
)
def test_unpack_bitmask(params):
assert system_device._unpack_bitmask(array.array("Q", params["input"])) == params["output"]
assert _device._unpack_bitmask(array.array("Q", params["input"])) == params["output"]


def test_unpack_bitmask_single_value():
with pytest.raises(TypeError):
system_device._unpack_bitmask(1)
_device._unpack_bitmask(1)


def test_field_values():
    """Exercise ``get_field_values`` and ``clear_field_values`` on every device."""
    for device in system.Device.get_all_devices():
        # TODO: Are there any fields that return doubles? It would be good to
        # test those.

        requested = [
            system.FieldId.DEV_TOTAL_ENERGY_CONSUMPTION,
            system.FieldId.DEV_PCIE_COUNT_TX_BYTES,
        ]
        results = device.get_field_values(requested)
        results.validate()

        # Only integer indexing is supported.
        with pytest.raises(TypeError):
            results["invalid_index"]

        assert isinstance(results, system.FieldValues)
        assert len(results) == len(requested)

        # get_all_values() must agree with reading each item's value in turn.
        raw_values = results.get_all_values()
        for raw, wrapped in zip(raw_values, results):
            assert raw == wrapped.value

        for expected_id, wrapped in zip(requested, results):
            assert wrapped.field_id == expected_id
            assert type(wrapped.value) is int
            assert wrapped.latency_usec >= 0
            assert wrapped.timestamp >= 0

        # A second query must not report an earlier sample time.
        first_timestamp = results[0].timestamp
        results = device.get_field_values(requested)
        assert results[0].timestamp >= first_timestamp

        # Test only one element, because that's weirdly a special case
        requested = [system.FieldId.DEV_PCIE_REPLAY_COUNTER]
        results = device.get_field_values(requested)
        assert len(results) == 1
        results.validate()
        value_before_clear = results[0].value

        # Test clear_field_values
        device.clear_field_values(requested)
        results = device.get_field_values(requested)
        results.validate()
        assert len(results) == 1
        assert results[0].value <= value_before_clear
Loading