Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cuda_core/cuda/core/_memory/_device_memory_resource.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import multiprocessing
import platform # no-cython-lint
import uuid

from ._peer_access_utils import plan_peer_access_update
from cuda.core._memory._peer_access_utils import plan_peer_access_update
from cuda.core._utils.cuda_utils import check_multiprocessing_start_method

__all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions']
Expand Down
4 changes: 3 additions & 1 deletion cuda_core/cuda/core/system/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

Expand All @@ -18,6 +18,8 @@
]


from cuda.core.system import typing

from ._system import *

if CUDA_BINDINGS_NVML_IS_COMPATIBLE:
Expand Down
39 changes: 0 additions & 39 deletions cuda_core/cuda/core/system/_clock.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -3,41 +3,12 @@
# SPDX-License-Identifier: Apache-2.0


class ClockId(StrEnum):
"""
Clock Ids. These are used in combination with :class:`ClockType` to specify a single clock value.
"""
CURRENT = "current"
CUSTOMER_BOOST_MAX = "customer_boost_max"
# APP_CLOCK_TARGET and APP_CLOCK_DEFAULT are deprecated so not included here


# Per-member docstrings, attached once the ClockId class has been created.
ClockId.CURRENT.__doc__ = "Current actual clock value."
ClockId.CUSTOMER_BOOST_MAX.__doc__ = "OEM-defined maximum clock rate"


# Translates each public ClockId member to the corresponding nvml.ClockId value.
_CLOCK_ID_MAPPING = {
    ClockId.CURRENT: nvml.ClockId.CURRENT,
    ClockId.CUSTOMER_BOOST_MAX: nvml.ClockId.CUSTOMER_BOOST_MAX,
}


class ClocksEventReasons(StrEnum):
"""
Reasons for a clocks event. These are used in combination with :class:`ClockType` to specify the reason for a clocks event.
"""
NONE = "none"
GPU_IDLE = "gpu_idle"
APPLICATIONS_CLOCKS_SETTING = "applications_clocks_setting"
SW_POWER_CAP = "sw_power_cap"
HW_SLOWDOWN = "hw_slowdown"
SYNC_BOOST = "sync_boost"
SW_THERMAL_SLOWDOWN = "sw_thermal_slowdown"
HW_THERMAL_SLOWDOWN = "hw_thermal_slowdown"
HW_POWER_BRAKE_SLOWDOWN = "hw_power_brake_slowdown"
DISPLAY_CLOCK_SETTING = "display_clock_setting"


_CLOCKS_EVENT_REASONS_MAPPING = {
nvml.ClocksEventReasons.EVENT_REASON_NONE: ClocksEventReasons.NONE,
nvml.ClocksEventReasons.EVENT_REASON_GPU_IDLE: ClocksEventReasons.GPU_IDLE,
Expand All @@ -52,16 +23,6 @@ _CLOCKS_EVENT_REASONS_MAPPING = {
}


class ClockType(StrEnum):
"""
Clock types. All speeds are in Mhz.
"""
GRAPHICS = "graphics"
SM = "sm"
MEMORY = "memory"
VIDEO = "video"


_CLOCK_TYPE_MAPPING = {
ClockType.GRAPHICS: nvml.ClockType.CLOCK_GRAPHICS,
ClockType.SM: nvml.ClockType.CLOCK_SM,
Expand Down
34 changes: 0 additions & 34 deletions cuda_core/cuda/core/system/_cooler.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -3,46 +3,12 @@
# SPDX-License-Identifier: Apache-2.0


class CoolerControl(StrEnum):
"""
Cooler control type.
"""
TOGGLE = "toggle"
VARIABLE = "variable"


# Per-member docstrings, attached once the CoolerControl class has been created.
CoolerControl.TOGGLE.__doc__ = """
This cooler can only be toggled either ON or OFF (e.g. a switch).
"""
CoolerControl.VARIABLE.__doc__ = """
This cooler's level can be adjusted from some minimum to some maximum (e.g. a knob).
"""


# Translates nvml.CoolerControl signal values to the public CoolerControl enum.
_COOLER_CONTROL_MAPPING = {
    nvml.CoolerControl.THERMAL_COOLER_SIGNAL_TOGGLE: CoolerControl.TOGGLE,
    nvml.CoolerControl.THERMAL_COOLER_SIGNAL_VARIABLE: CoolerControl.VARIABLE,
}


class CoolerTarget(StrEnum):
"""
Cooler target.
"""
NONE = "none"
GPU = "gpu"
MEMORY = "memory"
POWER_SUPPLY = "power_supply"
# THERMAL_GPU_RELATED is a composite target, so it is omitted here and will
# get returned as 3 separate targets: GPU, MEMORY, and POWER_SUPPLY.


# Per-member docstrings, attached once the CoolerTarget class has been created.
CoolerTarget.NONE.__doc__ = "This cooler controls nothing."
CoolerTarget.GPU.__doc__ = "This cooler can cool the GPU."
CoolerTarget.MEMORY.__doc__ = "This cooler can cool the memory."
CoolerTarget.POWER_SUPPLY.__doc__ = "This cooler can cool the power supply."


_COOLER_TARGET_MAPPING = {
nvml.CoolerTarget.THERMAL_NONE: CoolerTarget.NONE,
nvml.CoolerTarget.THERMAL_GPU: CoolerTarget.GPU,
Expand Down
146 changes: 21 additions & 125 deletions cuda_core/cuda/core/system/_device.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,34 @@
from libc.stdint cimport intptr_t, uint64_t
from libc.math cimport ceil

import sys
if sys.version_info >= (3, 11):
from enum import StrEnum
else:
from backports.strenum import StrEnum
from multiprocessing import cpu_count
from typing import Iterable
import warnings

from cuda.bindings import nvml
try:
from cuda.bindings._internal._fast_enum import FastEnum
except ImportError:
from enum import IntEnum as FastEnum

from ._nvml_context cimport initialize
# Enum types referenced by this module are defined in cuda.core.system.typing.
# Each name is listed exactly once, in alphabetical order.
from cuda.core.system.typing import (
    AddressingMode,
    AffinityScope,
    ClockId,
    ClocksEventReasons,
    ClockType,
    CoolerControl,
    CoolerTarget,
    DeviceArch,
    EventType,
    FanControlPolicy,
    FieldId,
    GpuP2PCapsIndex,
    GpuP2PStatus,
    GpuTopologyLevel,
    InforomObject,
    TemperatureThresholds,
    ThermalController,
    ThermalTarget,
)


cdef object _pstate_to_int(object pstate):
Expand Down Expand Up @@ -57,53 +69,12 @@ include "_temperature.pxi"
include "_utilization.pxi"


class AddressingMode(StrEnum):
"""
Addressing mode of a device.

For Kepler™ or newer fully supported devices.
"""
HMM = "hmm"
ATS = "ats"


# Per-member docstrings, attached once the AddressingMode class has been created.
AddressingMode.HMM.__doc__ = """
System allocated memory (``malloc``, ``mmap``) is addressable from the device
(GPU), via software-based mirroring of the CPU's page tables, on the GPU.
"""


AddressingMode.ATS.__doc__ = """
System allocated memory (``malloc``, ``mmap``) is addressable from the device
(GPU), via Address Translation Services. This means that there is (effectively)
a single set of page tables, and the CPU and GPU both use them.
"""


# Translates nvml.DeviceAddressingModeType values to the public AddressingMode enum.
_ADDRESSING_MODE_MAPPING = {
    nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_HMM: AddressingMode.HMM,
    nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_ATS: AddressingMode.ATS,
}


class AffinityScope(StrEnum):
"""
Scope for affinity queries.
"""
NODE = "node"
SOCKET = "socket"


# Per-member docstrings, attached once the AffinityScope class has been created.
AffinityScope.NODE.__doc__ = """
The NUMA node is the scope of the affinity query. This is the default scope.
"""


AffinityScope.SOCKET.__doc__ = """
The CPU socket is the scope of the affinity query.
"""


_AFFINITY_SCOPE_MAPPING = {
AffinityScope.NODE: nvml.AffinityScope.NODE,
AffinityScope.SOCKET: nvml.AffinityScope.SOCKET,
Expand Down Expand Up @@ -132,37 +103,6 @@ _BRAND_TYPE_MAPPING = {
}


# This uses FastEnum instead of StrEnum because the ordering of the values is
# meaningful, e.g. Kepler "or later"
class DeviceArch(FastEnum):
    """
    Device architecture.

    Every member takes the integer value of the same-named
    ``nvml.DeviceArch`` member.
    """
    KEPLER = int(nvml.DeviceArch.KEPLER)
    MAXWELL = int(nvml.DeviceArch.MAXWELL)
    PASCAL = int(nvml.DeviceArch.PASCAL)
    VOLTA = int(nvml.DeviceArch.VOLTA)
    TURING = int(nvml.DeviceArch.TURING)
    AMPERE = int(nvml.DeviceArch.AMPERE)
    ADA = int(nvml.DeviceArch.ADA)
    HOPPER = int(nvml.DeviceArch.HOPPER)
    BLACKWELL = int(nvml.DeviceArch.BLACKWELL)
    UNKNOWN = int(nvml.DeviceArch.UNKNOWN)


class GpuP2PCapsIndex(StrEnum):
"""
GPU peer-to-peer capabilities index.
"""
READ = "read"
WRITE = "write"
NVLINK = "nvlink"
ATOMICS = "atomics"
PCI = "pci"
PROP = "prop"
UNKNOWN = "unknown"


_GPU_P2P_CAPS_INDEX_MAPPING = {
GpuP2PCapsIndex.READ: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_READ,
GpuP2PCapsIndex.WRITE: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_WRITE,
Expand All @@ -174,19 +114,6 @@ _GPU_P2P_CAPS_INDEX_MAPPING = {
}


class GpuP2PStatus(StrEnum):
"""
GPU peer-to-peer status.
"""
OK = "ok"
CHIPSET_NOT_SUPPORTED = "chipset not supported"
GPU_NOT_SUPPORTED = "GPU not supported"
IOH_TOPOLOGY_NOT_SUPPORTED = "IOH topology not supported"
DISABLED_BY_REGKEY = "disabled by regkey"
NOT_SUPPORTED = "not supported"
UNKNOWN = "unknown"


_GPU_P2P_STATUS_MAPPING = {
nvml.GpuP2PStatus.P2P_STATUS_OK: GpuP2PStatus.OK,
nvml.GpuP2PStatus.P2P_STATUS_CHIPSET_NOT_SUPPORTED: GpuP2PStatus.CHIPSET_NOT_SUPPORTED,
Expand All @@ -198,18 +125,6 @@ _GPU_P2P_STATUS_MAPPING = {
}


class GpuTopologyLevel(StrEnum):
"""
Represents level relationships within a system between two GPUs.
"""
INTERNAL = "internal"
SINGLE = "single"
MULTIPLE = "multiple"
HOSTBRIDGE = "hostbridge"
NODE = "node"
SYSTEM = "system"


_GPU_TOPOLOGY_LEVEL_MAPPING = {
GpuTopologyLevel.INTERNAL: nvml.GpuTopologyLevel.TOPOLOGY_INTERNAL,
GpuTopologyLevel.SINGLE: nvml.GpuTopologyLevel.TOPOLOGY_SINGLE,
Expand Down Expand Up @@ -1204,27 +1119,8 @@ def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex | st


# Names exported by ``from <this module> import *``.
__all__ = [
    "AddressingMode",
    "AffinityScope",
    "ClockId",
    "ClocksEventReasons",
    "ClockType",
    "CoolerControl",
    "CoolerTarget",
    "Device",
    "DeviceArch",
    "EventType",
    "FanControlPolicy",
    "FieldId",
    "get_p2p_status",
    "get_topology_common_ancestor",
    "GpuP2PCapsIndex",
    "GpuP2PStatus",
    "GpuTopologyLevel",
    "InforomObject",
    "NvlinkInfo",
    "TemperatureThresholds",
    "ThermalController",
    "ThermalTarget",
    "Utilization",
]
30 changes: 0 additions & 30 deletions cuda_core/cuda/core/system/_event.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,6 @@
# SPDX-License-Identifier: Apache-2.0


class EventType(StrEnum):
"""
Event types that can be waited on with :class:`DeviceEvents`.
"""
NONE = "none"
SINGLE_BIT_ECC_ERROR = "single_bit_ecc_error"
DOUBLE_BIT_ECC_ERROR = "double_bit_ecc_error"
PSTATE = "pstate"
XID_CRITICAL_ERROR = "xid_critical_error"
CLOCK = "clock"
POWER_SOURCE_CHANGE = "power_source_change"
MIG_CONFIG_CHANGE = "mig_config_change"
SINGLE_BIT_ECC_ERROR_STORM = "single_bit_ecc_error_storm"
DRAM_RETIREMENT_EVENT = "dram_retirement_event"
DRAM_RETIREMENT_FAILURE = "dram_retirement_failure"
NON_FATAL_POISON_ERROR = "non_fatal_poison_error"
FATAL_POISON_ERROR = "fatal_poison_error"
GPU_UNAVAILABLE_ERROR = "gpu_unavailable_error"
GPU_RECOVERY_ACTION = "gpu_recovery_action"


# Per-member docstring, attached once the EventType class has been created.
EventType.PSTATE.__doc__ = """
Event about PState changes

On Fermi™ architecture, PState changes are also an indicator that GPU is throttling down due to
no work being executed on the GPU, power capping or thermal capping. In a typical situation,
Fermi-based GPU should stay in P0 for the duration of the execution of the compute process.
"""


_EVENT_TYPE_MAPPING = {
nvml.EventType.NONE: EventType.NONE,
nvml.EventType.SINGLE_BIT_ECC_ERROR: EventType.SINGLE_BIT_ECC_ERROR,
Expand Down
8 changes: 0 additions & 8 deletions cuda_core/cuda/core/system/_fan.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,6 @@
# SPDX-License-Identifier: Apache-2.0


class FanControlPolicy(StrEnum):
"""
Fan control policies.
"""
TEMPERATURE_CONTROLLED = "temperature_controlled"
MANUAL = "manual"


_FAN_CONTROL_POLICY_MAPPING = {
nvml.FanControlPolicy.TEMPERATURE_CONTINUOUS_SW: FanControlPolicy.TEMPERATURE_CONTROLLED,
nvml.FanControlPolicy.MANUAL: FanControlPolicy.MANUAL,
Expand Down
3 changes: 0 additions & 3 deletions cuda_core/cuda/core/system/_field_values.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
# SPDX-License-Identifier: Apache-2.0


# Alias of ``nvml.FieldId`` under this module's namespace
# (presumably so users need not import the bindings directly -- confirm).
FieldId = nvml.FieldId


cdef class FieldValue:
"""
Represents the data from a single field value.
Expand Down
Loading