Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cuda_core/cuda/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def _import_versioned_module():
del _import_versioned_module


from cuda.core import system, utils
from cuda.core import checkpoint, system, utils
from cuda.core._device import Device
from cuda.core._event import Event, EventOptions
from cuda.core._graphics import GraphicsResource
Expand Down
248 changes: 248 additions & 0 deletions cuda_core/cuda/core/checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

Comment thread
leofang marked this conversation as resolved.
import ctypes as _ctypes
from collections.abc import Mapping as _Mapping
from typing import Any as _Any

from cuda.core._utils.cuda_utils import handle_return as _handle_cuda_return
from cuda.core._utils.version import binding_version as _binding_version
from cuda.core._utils.version import driver_version as _driver_version
from cuda.core.typing import ProcessStateT as _ProcessStateT

try:
from cuda.bindings import driver as _driver
except ImportError:
from cuda import cuda as _driver
Comment thread
leofang marked this conversation as resolved.


# Driver ``CUprocessState`` attribute names paired with the public string
# state names returned by ``Process.state``.
_PROCESS_STATE_NAME_ATTRS: tuple[tuple[str, _ProcessStateT], ...] = (
    ("CU_PROCESS_STATE_RUNNING", "running"),
    ("CU_PROCESS_STATE_LOCKED", "locked"),
    ("CU_PROCESS_STATE_CHECKPOINTED", "checkpointed"),
    ("CU_PROCESS_STATE_FAILED", "failed"),
)

# Names that must exist on the driver binding module for checkpointing to
# work; verified once by ``_get_driver()``.
_REQUIRED_BINDING_ATTRS = (
    "cuCheckpointProcessCheckpoint",
    "cuCheckpointProcessGetRestoreThreadId",
    "cuCheckpointProcessGetState",
    "cuCheckpointProcessLock",
    "cuCheckpointProcessRestore",
    "cuCheckpointProcessUnlock",
    "CUcheckpointGpuPair",
    "CUcheckpointLockArgs",
    "CUprocessState",
    "CUcheckpointRestoreArgs",
)
# Minimum driver version accepted by ``_get_driver()`` before checkpoint
# calls are attempted.
_REQUIRED_DRIVER_VERSION = (12, 8, 0)
# One-shot flag: set to True after the binding/driver capability checks pass,
# so subsequent ``_get_driver()`` calls skip re-validation.
_driver_capability_checked = False


class Process:
    """
    CUDA process that can be locked, checkpointed, restored, and unlocked.

    Parameters
    ----------
    pid : int
        Process ID of the CUDA process.

    Raises
    ------
    TypeError
        If ``pid`` is not an int (bools are rejected).
    ValueError
        If ``pid`` is not positive.
    """

    # Store the PID privately; ``pid`` is exposed read-only below so callers
    # cannot silently retarget an existing Process instance.
    __slots__ = ("_pid",)

    def __init__(self, pid: int):
        # Validate eagerly so a bad PID fails at construction, not at the
        # first driver call.
        self._pid = _check_pid(pid)

    @property
    def pid(self) -> int:
        """Process ID of the CUDA process (read-only)."""
        return self._pid

    @property
    def state(self) -> _ProcessStateT:
        """
        CUDA checkpoint state for this process.

        Returns one of ``"running"``, ``"locked"``, ``"checkpointed"``, or
        ``"failed"``.

        Raises
        ------
        RuntimeError
            If the driver reports a state value unknown to this module.
        """
        driver = _get_driver()
        state = _call_driver(driver, driver.cuCheckpointProcessGetState, self._pid)
        state_names = _get_process_state_names(driver)
        try:
            return state_names[state]
        except KeyError as e:
            state_value = int(state)
            raise RuntimeError(f"Unknown CUDA checkpoint process state: {state_value}") from e

    @property
    def restore_thread_id(self) -> int:
        """
        CUDA restore thread ID for this process.
        """
        driver = _get_driver()
        return _call_driver(driver, driver.cuCheckpointProcessGetRestoreThreadId, self._pid)

    def lock(self, timeout_ms: int = 0) -> None:
        """
        Lock this process, blocking further CUDA API calls.

        Parameters
        ----------
        timeout_ms : int, optional
            Timeout in milliseconds. A value of 0 indicates no timeout.
        """
        driver = _get_driver()
        args = driver.CUcheckpointLockArgs()
        args.timeoutMs = _check_timeout_ms(timeout_ms)
        _call_driver(driver, driver.cuCheckpointProcessLock, self._pid, args)

    def checkpoint(self) -> None:
        """
        Checkpoint the GPU memory contents of this locked process.
        """
        driver = _get_driver()
        _call_driver(driver, driver.cuCheckpointProcessCheckpoint, self._pid, None)

    def restore(self, gpu_mapping: _Mapping[_Any, _Any] | None = None) -> None:
        """
        Restore this checkpointed process.

        Parameters
        ----------
        gpu_mapping : mapping, optional
            GPU UUID remapping from each checkpointed GPU UUID to the GPU UUID
            to restore onto. For migration workflows, provide mappings for
            every CUDA-visible GPU.
        """
        driver = _get_driver()
        args = _make_restore_args(driver, gpu_mapping)
        _call_driver(driver, driver.cuCheckpointProcessRestore, self._pid, args)

    def unlock(self) -> None:
        """
        Unlock this locked process so it can resume CUDA API calls.
        """
        driver = _get_driver()
        _call_driver(driver, driver.cuCheckpointProcessUnlock, self._pid, None)


def _get_driver():
    """
    Return the CUDA driver binding module, raising if checkpointing is
    unavailable.

    The binding- and driver-capability checks run once per process; their
    verdict is cached in ``_driver_capability_checked`` so later calls are
    cheap.
    """
    global _driver_capability_checked
    if not _driver_capability_checked:
        binding_ver = _binding_version()
        if not _binding_version_supports_checkpoint(binding_ver):
            found = ".".join(str(part) for part in binding_ver[:3])
            raise RuntimeError(
                "CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. "
                f"Found cuda.bindings {found}."
            )

        missing = [name for name in _REQUIRED_BINDING_ATTRS if not hasattr(_driver, name)]
        if missing:
            raise RuntimeError(
                f"CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. Missing: {', '.join(missing)}"
            )

        if _driver_version() < _REQUIRED_DRIVER_VERSION:
            raise RuntimeError(
                "CUDA checkpointing is not supported by the installed NVIDIA driver. "
                "Upgrade to a driver version with CUDA checkpoint API support."
            )

        _driver_capability_checked = True
    return _driver


def _binding_version_supports_checkpoint(version) -> bool:
major, minor, patch = version[:3]
return (major == 12 and (minor, patch) >= (8, 0)) or (major == 13 and (minor, patch) >= (0, 2)) or major > 13


def _get_process_state_names(driver) -> dict[_Any, _ProcessStateT]:
    """Map the driver's ``CUprocessState`` enum values to public state names."""
    names: dict[_Any, _ProcessStateT] = {}
    for attr, state_name in _PROCESS_STATE_NAME_ATTRS:
        names[getattr(driver.CUprocessState, attr)] = state_name
    return names


def _call_driver(driver, func, *args):
try:
result = func(*args)
except RuntimeError as e:
if "cuCheckpointProcess" in str(e) and "not found" in str(e):
raise RuntimeError(
"CUDA checkpointing is not supported by the installed NVIDIA driver. "
"Upgrade to a driver version with CUDA checkpoint API support."
) from e
raise
return _handle_return(driver, result)


def _handle_return(driver, result):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: Can this check be consolidated with the above one (_call_driver)?

err = result[0]
not_supported_errors = (
getattr(driver.CUresult, "CUDA_ERROR_NOT_FOUND", None),
getattr(driver.CUresult, "CUDA_ERROR_NOT_SUPPORTED", None),
)
if err in not_supported_errors:
raise RuntimeError(
"CUDA checkpointing is not supported by the installed NVIDIA driver. "
"Upgrade to a driver version with CUDA checkpoint API support."
)

return _handle_cuda_return(result)


def _check_pid(pid: int) -> int:
if isinstance(pid, bool) or not isinstance(pid, int):
raise TypeError("pid must be an int")
if pid <= 0:
raise ValueError("pid must be a positive int")
return pid


def _check_timeout_ms(timeout_ms: int) -> int:
if isinstance(timeout_ms, bool) or not isinstance(timeout_ms, int):
raise TypeError("timeout_ms must be an int")
if timeout_ms < 0:
raise ValueError("timeout_ms must be >= 0")
return timeout_ms


def _make_restore_args(driver, gpu_mapping: _Mapping[_Any, _Any] | None):
if gpu_mapping is None:
return None
if not isinstance(gpu_mapping, _Mapping):
raise TypeError("gpu_mapping must be a mapping from checkpointed GPU UUID to restore GPU UUID")

pairs = []
for old_uuid, new_uuid in gpu_mapping.items():
pair = driver.CUcheckpointGpuPair()
buffers = []
pair.oldUuid = _as_cuuuid(driver, old_uuid, buffers)
pair.newUuid = _as_cuuuid(driver, new_uuid, buffers)
pairs.append(pair)

if not pairs:
return None

args = driver.CUcheckpointRestoreArgs()
args.gpuPairs = pairs
args.gpuPairsCount = len(pairs)
Comment thread
leofang marked this conversation as resolved.
return args


def _as_cuuuid(driver, value, buffers):
"""Convert *value* to a ``CUuuid``.

Accepts a ``CUuuid`` instance (returned as-is) or a UUID string in
the ``"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"`` format returned by
:attr:`Device.uuid`.
"""
if isinstance(value, str):
raw = bytes.fromhex(value.replace("-", ""))
if len(raw) != 16:
raise ValueError(f"GPU UUID string must be 32 hex characters (with optional hyphens), got {value!r}")
buf = _ctypes.create_string_buffer(raw, 16)
buffers.append(buf)
return driver.CUuuid(_ctypes.addressof(buf))
return value


# Public API: only the Process coordinator class is exported; everything else
# in this module is an internal helper.
__all__ = [
    "Process",
]
5 changes: 5 additions & 0 deletions cuda_core/cuda/core/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,15 @@

"""Public type aliases and protocols used in cuda.core API signatures."""

from typing import Literal as _Literal

from cuda.core._memory._buffer import DevicePointerT
from cuda.core._stream import IsStreamT

ProcessStateT = _Literal["running", "locked", "checkpointed", "failed"]

__all__ = [
"DevicePointerT",
"IsStreamT",
"ProcessStateT",
]
59 changes: 59 additions & 0 deletions cuda_core/docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,65 @@ CUDA compilation toolchain
LinkerOptions


CUDA process checkpointing
--------------------------

The :mod:`cuda.core.checkpoint` module wraps the CUDA driver process
checkpoint APIs. These APIs are intended for Linux process checkpoint and
restore workflows, and require a CUDA driver with checkpoint API support and
a ``cuda-bindings`` version that exposes those driver entry points.

Checkpointing is typically driven by a coordinator process acting on a target
CUDA process, similar to attaching a debugger or sending a signal. The target
process is identified by process ID. Linux and the CUDA driver enforce process
permissions; checkpointing another user's process may require elevated
permissions such as ``CAP_SYS_PTRACE`` or administrator privileges.

The CUDA checkpoint APIs prepare CUDA-managed GPU state for process-level
checkpoint and restore. They do not capture the CPU process image by
themselves; full process checkpoint workflows still need a CPU-side process
checkpointing tool such as CRIU. A minimal coordinator-side sequence looks like
this:

.. code-block:: python

import os

from cuda.core import checkpoint

target_pid = os.getpid() # or the PID of another CUDA process
process = checkpoint.Process(target_pid)
process.lock(timeout_ms=5000)
process.checkpoint()

# Capture or restore the CPU process image outside cuda.core.

process.restore()
process.unlock()

``Process.state`` returns one of ``"running"``, ``"locked"``,
``"checkpointed"``, or ``"failed"``. Restore may optionally remap GPUs by
passing ``gpu_mapping`` from each checkpointed GPU UUID to the GPU UUID that
should be used during restore. For migration workflows, provide mappings for
every CUDA-visible GPU. The mapping may use ``CUuuid`` objects or the UUID
strings returned by :attr:`Device.uuid`. A successful restore returns the
process to the locked state; call ``Process.unlock`` after restore to allow
CUDA API calls to resume.

The CUDA driver requires restore to run from the process restore thread.
Use ``Process.restore_thread_id`` to discover that thread before calling
``Process.restore`` from a checkpoint coordinator. Restore also requires
persistence mode to be enabled or ``cuInit`` to have been called before
execution.

.. autosummary::
:toctree: generated/

:template: class.rst

checkpoint.Process


CUDA system information and NVIDIA Management Library (NVML)
------------------------------------------------------------

Expand Down
1 change: 1 addition & 0 deletions cuda_core/docs/source/api_private.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ CUDA runtime
:toctree: generated/

typing.DevicePointerT
typing.ProcessStateT
_memory._virtual_memory_resource.VirtualMemoryAllocationTypeT
_memory._virtual_memory_resource.VirtualMemoryLocationTypeT
_memory._virtual_memory_resource.VirtualMemoryGranularityT
Expand Down
5 changes: 4 additions & 1 deletion cuda_core/docs/source/release/1.0.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ Highlights
New features
------------

- TBD
- Added the :mod:`cuda.core.checkpoint` module for CUDA process checkpointing,
including string process state queries, lock/checkpoint/restore/unlock
operations, and GPU UUID remapping support for restore.
(`#1343 <https://github.com/NVIDIA/cuda-python/issues/1343>`__)


Fixes and enhancements
Expand Down
Loading
Loading