NVIDIA · kkraus14 · Apr 28, 2026 · Apr 29, 2026 · May 2, 2026 · May 2, 2026
diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
@@ -28,7 +28,7 @@ def _import_versioned_module():
 del _import_versioned_module
 
 
-from cuda.core import system, utils
+from cuda.core import checkpoint, system, utils
 from cuda.core._device import Device
 from cuda.core._event import Event, EventOptions
 from cuda.core._graphics import GraphicsResource

diff --git a/cuda_core/cuda/core/checkpoint.py b/cuda_core/cuda/core/checkpoint.py
@@ -0,0 +1,248 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import ctypes as _ctypes
+from collections.abc import Mapping as _Mapping
+from typing import Any as _Any
+
+from cuda.core._utils.cuda_utils import handle_return as _handle_cuda_return
+from cuda.core._utils.version import binding_version as _binding_version
+from cuda.core._utils.version import driver_version as _driver_version
+from cuda.core.typing import ProcessStateT as _ProcessStateT
+
+try:
+    from cuda.bindings import driver as _driver
+except ImportError:
+    from cuda import cuda as _driver
+
+
+# Pairs of (CUprocessState member name, public state string). Consumed by
+# _get_process_state_names() to translate driver enum values into the strings
+# allowed by ProcessStateT.
+_PROCESS_STATE_NAME_ATTRS: tuple[tuple[str, _ProcessStateT], ...] = (
+    ("CU_PROCESS_STATE_RUNNING", "running"),
+    ("CU_PROCESS_STATE_LOCKED", "locked"),
+    ("CU_PROCESS_STATE_CHECKPOINTED", "checkpointed"),
+    ("CU_PROCESS_STATE_FAILED", "failed"),
+)
+
+# Names that must exist on the driver bindings module. _get_driver() raises a
+# RuntimeError listing any that are missing.
+_REQUIRED_BINDING_ATTRS = (
+    "cuCheckpointProcessCheckpoint",
+    "cuCheckpointProcessGetRestoreThreadId",
+    "cuCheckpointProcessGetState",
+    "cuCheckpointProcessLock",
+    "cuCheckpointProcessRestore",
+    "cuCheckpointProcessUnlock",
+    "CUcheckpointGpuPair",
+    "CUcheckpointLockArgs",
+    "CUprocessState",
+    "CUcheckpointRestoreArgs",
+)
+# Lower bound compared against _driver_version() in _get_driver().
+_REQUIRED_DRIVER_VERSION = (12, 8, 0)
+# Set to True by _get_driver() once every capability check has passed, so that
+# later calls return without repeating the checks.
+_driver_capability_checked = False
+
+
+class Process:
+    """
+    Handle for a CUDA process targeted by the driver checkpoint APIs.
+
+    An instance stores only the process ID; each operation forwards that ID
+    to the matching ``cuCheckpointProcess*`` driver call.
+
+    Parameters
+    ----------
+    pid : int
+        Process ID of the CUDA process. Must be a positive ``int``.
+    """
+
+    __slots__ = ("pid",)
+
+    def __init__(self, pid: int):
+        self.pid = _check_pid(pid)
+
+    @property
+    def state(self) -> _ProcessStateT:
+        """
+        Current CUDA checkpoint state of this process, as a string.
+
+        Raises
+        ------
+        RuntimeError
+            If the driver reports a state that has no known string name.
+        """
+        drv = _get_driver()
+        raw_state = _call_driver(drv, drv.cuCheckpointProcessGetState, self.pid)
+        names_by_state = _get_process_state_names(drv)
+        try:
+            name = names_by_state[raw_state]
+        except KeyError as exc:
+            raise RuntimeError(f"Unknown CUDA checkpoint process state: {int(raw_state)}") from exc
+        return name
+
+    @property
+    def restore_thread_id(self) -> int:
+        """
+        ID of the CUDA restore thread of this process.
+        """
+        drv = _get_driver()
+        thread_id = _call_driver(drv, drv.cuCheckpointProcessGetRestoreThreadId, self.pid)
+        return thread_id
+
+    def lock(self, timeout_ms: int = 0) -> None:
+        """
+        Lock this process so that further CUDA API calls are blocked.
+
+        Parameters
+        ----------
+        timeout_ms : int, optional
+            Timeout in milliseconds; must be a non-negative ``int``. A value
+            of 0 indicates no timeout.
+        """
+        drv = _get_driver()
+        lock_args = drv.CUcheckpointLockArgs()
+        lock_args.timeoutMs = _check_timeout_ms(timeout_ms)
+        _call_driver(drv, drv.cuCheckpointProcessLock, self.pid, lock_args)
+
+    def checkpoint(self) -> None:
+        """
+        Checkpoint the GPU memory contents of this (locked) process.
+        """
+        drv = _get_driver()
+        _call_driver(drv, drv.cuCheckpointProcessCheckpoint, self.pid, None)
+
+    def restore(self, gpu_mapping: _Mapping[_Any, _Any] | None = None) -> None:
+        """
+        Restore this process from its checkpointed state.
+
+        Parameters
+        ----------
+        gpu_mapping : mapping, optional
+            Maps each checkpointed GPU UUID to the GPU UUID to restore onto.
+            For migration workflows, provide mappings for every CUDA-visible
+            GPU. ``None`` or an empty mapping requests no remapping.
+        """
+        drv = _get_driver()
+        restore_args = _make_restore_args(drv, gpu_mapping)
+        _call_driver(drv, drv.cuCheckpointProcessRestore, self.pid, restore_args)
+
+    def unlock(self) -> None:
+        """
+        Unlock this locked process, allowing its CUDA API calls to resume.
+        """
+        drv = _get_driver()
+        _call_driver(drv, drv.cuCheckpointProcessUnlock, self.pid, None)
+
+
+def _get_driver():
+    """
+    Return the driver bindings module after verifying checkpoint support.
+
+    Three checks run in order: the ``cuda.bindings`` version, the presence of
+    every name in ``_REQUIRED_BINDING_ATTRS``, and the installed driver
+    version. After they have all passed once, a module-level flag makes
+    subsequent calls skip them.
+
+    Raises
+    ------
+    RuntimeError
+        If any of the checks fails.
+    """
+    global _driver_capability_checked
+    if not _driver_capability_checked:
+        bindings_version = _binding_version()
+        if not _binding_version_supports_checkpoint(bindings_version):
+            found = ".".join(map(str, bindings_version[:3]))
+            raise RuntimeError(
+                "CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. "
+                f"Found cuda.bindings {found}."
+            )
+
+        absent = [attr for attr in _REQUIRED_BINDING_ATTRS if not hasattr(_driver, attr)]
+        if absent:
+            raise RuntimeError(
+                f"CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. Missing: {', '.join(absent)}"
+            )
+
+        if _driver_version() < _REQUIRED_DRIVER_VERSION:
+            raise RuntimeError(
+                "CUDA checkpointing is not supported by the installed NVIDIA driver. "
+                "Upgrade to a driver version with CUDA checkpoint API support."
+            )
+
+        # Only reached when nothing above raised.
+        _driver_capability_checked = True
+    return _driver
+
+
+def _binding_version_supports_checkpoint(version) -> bool:
+    """
+    Return whether a ``cuda.bindings`` version is accepted for checkpointing.
+
+    Only the first three components of *version* are examined. Major 12
+    requires at least 12.8.0, major 13 requires at least 13.0.2, and any
+    major above 13 is accepted.
+    """
+    major, minor, patch = version[:3]
+    release = (minor, patch)
+    if major == 12:
+        return release >= (8, 0)
+    if major == 13:
+        return release >= (0, 2)
+    return major > 13
+
+
+def _get_process_state_names(driver) -> dict[_Any, _ProcessStateT]:
+    """
+    Build the lookup from ``driver.CUprocessState`` members to state strings.
+
+    The member names and their strings come from ``_PROCESS_STATE_NAME_ATTRS``.
+    """
+    state_enum = driver.CUprocessState
+    names_by_state = {}
+    for member_name, label in _PROCESS_STATE_NAME_ATTRS:
+        names_by_state[getattr(state_enum, member_name)] = label
+    return names_by_state
+
+
+def _call_driver(driver, func, *args):
+    """
+    Invoke a driver entry point and process its result.
+
+    A ``RuntimeError`` from *func* whose text contains both
+    ``"cuCheckpointProcess"`` and ``"not found"`` is re-raised as an
+    unsupported-driver ``RuntimeError`` chained to the original. Any other
+    ``RuntimeError`` propagates unchanged. On success the result tuple is
+    passed to ``_handle_return``.
+    """
+    try:
+        outcome = func(*args)
+    except RuntimeError as exc:
+        text = str(exc)
+        if not ("cuCheckpointProcess" in text and "not found" in text):
+            raise
+        raise RuntimeError(
+            "CUDA checkpointing is not supported by the installed NVIDIA driver. "
+            "Upgrade to a driver version with CUDA checkpoint API support."
+        ) from exc
+    else:
+        return _handle_return(driver, outcome)
+
+
+def _handle_return(driver, result):
+    """
+    Process a driver result tuple whose first element is the status code.
+
+    ``CUDA_ERROR_NOT_FOUND`` and ``CUDA_ERROR_NOT_SUPPORTED`` (when the
+    bindings define them) raise an unsupported-driver ``RuntimeError``. Every
+    other result is delegated to the shared ``handle_return`` helper from
+    ``cuda.core._utils.cuda_utils``.
+    """
+    status = result[0]
+    result_codes = driver.CUresult
+    unsupported = tuple(
+        getattr(result_codes, code_name, None)
+        for code_name in ("CUDA_ERROR_NOT_FOUND", "CUDA_ERROR_NOT_SUPPORTED")
+    )
+    if status not in unsupported:
+        return _handle_cuda_return(result)
+
+    raise RuntimeError(
+        "CUDA checkpointing is not supported by the installed NVIDIA driver. "
+        "Upgrade to a driver version with CUDA checkpoint API support."
+    )
+
+
+def _check_pid(pid: int) -> int:
+    """
+    Validate a process ID and return it unchanged.
+
+    Raises
+    ------
+    TypeError
+        If *pid* is a ``bool`` or is not an ``int``.
+    ValueError
+        If *pid* is zero or negative.
+    """
+    # bool is a subclass of int, so it has to be excluded explicitly.
+    is_plain_int = isinstance(pid, int) and not isinstance(pid, bool)
+    if not is_plain_int:
+        raise TypeError("pid must be an int")
+    if pid <= 0:
+        raise ValueError("pid must be a positive int")
+    return pid
+
+
+def _check_timeout_ms(timeout_ms: int) -> int:
+    """
+    Validate a millisecond timeout and return it unchanged.
+
+    Raises
+    ------
+    TypeError
+        If *timeout_ms* is a ``bool`` or is not an ``int``.
+    ValueError
+        If *timeout_ms* is negative.
+    """
+    # bool is a subclass of int, so it has to be excluded explicitly.
+    if isinstance(timeout_ms, int) and not isinstance(timeout_ms, bool):
+        if timeout_ms < 0:
+            raise ValueError("timeout_ms must be >= 0")
+        return timeout_ms
+    raise TypeError("timeout_ms must be an int")
+
+
+def _make_restore_args(driver, gpu_mapping: _Mapping[_Any, _Any] | None):
+    """
+    Build the ``CUcheckpointRestoreArgs`` for a GPU UUID remapping.
+
+    Returns ``None`` when *gpu_mapping* is ``None`` or empty. Otherwise one
+    ``CUcheckpointGpuPair`` is created per mapping entry, with the key as the
+    old UUID and the value as the new UUID.
+
+    Raises
+    ------
+    TypeError
+        If *gpu_mapping* is neither ``None`` nor a mapping.
+    """
+    if gpu_mapping is None:
+        return None
+    if not isinstance(gpu_mapping, _Mapping):
+        raise TypeError("gpu_mapping must be a mapping from checkpointed GPU UUID to restore GPU UUID")
+
+    gpu_pairs = []
+    for checkpointed_uuid, restore_uuid in gpu_mapping.items():
+        # Holds the ctypes buffers created by _as_cuuuid while this pair's
+        # UUID fields are being assigned.
+        keepalive = []
+        gpu_pair = driver.CUcheckpointGpuPair()
+        gpu_pair.oldUuid = _as_cuuuid(driver, checkpointed_uuid, keepalive)
+        gpu_pair.newUuid = _as_cuuuid(driver, restore_uuid, keepalive)
+        gpu_pairs.append(gpu_pair)
+
+    if len(gpu_pairs) == 0:
+        return None
+
+    restore_args = driver.CUcheckpointRestoreArgs()
+    restore_args.gpuPairs = gpu_pairs
+    restore_args.gpuPairsCount = len(gpu_pairs)
+    return restore_args
+
+
+def _as_cuuuid(driver, value, buffers):
+    """Convert *value* to a ``CUuuid``.
+
+    Accepts a ``CUuuid`` instance (returned as-is) or a UUID string in
+    the ``"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"`` format returned by
+    :attr:`Device.uuid`.
+
+    Parameters
+    ----------
+    driver : module
+        Driver bindings module providing ``CUuuid``.
+    value : CUuuid or str
+        UUID to convert. Any non-string value is returned unchanged.
+    buffers : list
+        List to which the ctypes buffer backing a string-derived ``CUuuid``
+        is appended, so the caller holds a reference to it.
+
+    Raises
+    ------
+    ValueError
+        If a string value contains non-hex characters or does not decode to
+        exactly 16 bytes.
+    """
+    if not isinstance(value, str):
+        return value
+
+    message = f"GPU UUID string must be 32 hex characters (with optional hyphens), got {value!r}"
+    try:
+        raw = bytes.fromhex(value.replace("-", ""))
+    except ValueError as e:
+        # bytes.fromhex only reports a character position; raise the same
+        # message as for wrong-length input so the offending UUID is named.
+        raise ValueError(message) from e
+    if len(raw) != 16:
+        raise ValueError(message)
+
+    # CUuuid is constructed from the buffer's address, so the buffer is handed
+    # back through *buffers* (presumably to keep that memory valid while the
+    # CUuuid is in use).
+    buf = _ctypes.create_string_buffer(raw, 16)
+    buffers.append(buf)
+    return driver.CUuuid(_ctypes.addressof(buf))
+
+
+# Public names of this module; everything else here is underscore-prefixed.
+__all__ = [
+    "Process",
+]
diff --git a/cuda_core/cuda/core/typing.py b/cuda_core/cuda/core/typing.py
@@ -4,10 +4,15 @@
 
 """Public type aliases and protocols used in cuda.core API signatures."""
 
+from typing import Literal as _Literal
+
 from cuda.core._memory._buffer import DevicePointerT
 from cuda.core._stream import IsStreamT
 
+# Closed set of strings used to name a CUDA checkpoint process state.
+ProcessStateT = _Literal["running", "locked", "checkpointed", "failed"]
+
 __all__ = [
     "DevicePointerT",
     "IsStreamT",
+    "ProcessStateT",
 ]
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
@@ -174,6 +174,65 @@ CUDA compilation toolchain
    LinkerOptions
 
 
+CUDA process checkpointing
+--------------------------
+
+The :mod:`cuda.core.checkpoint` module wraps the CUDA driver process
+checkpoint APIs. These APIs are intended for Linux process checkpoint and
+restore workflows, and require a CUDA driver with checkpoint API support and
+a ``cuda-bindings`` version that exposes those driver entry points.
+
+Checkpointing is typically driven by a coordinator process acting on a target
+CUDA process, similar to attaching a debugger or sending a signal. The target
+process is identified by process ID. Linux and the CUDA driver enforce process
+permissions; checkpointing another user's process may require elevated
+permissions such as ``CAP_SYS_PTRACE`` or administrator privileges.
+
+The CUDA checkpoint APIs prepare CUDA-managed GPU state for process-level
+checkpoint and restore. They do not capture the CPU process image by
+themselves; full process checkpoint workflows still need a CPU-side process
+checkpointing tool such as CRIU. A minimal coordinator-side sequence looks like
+this:
+
+.. code-block:: python
+
+   import os
+
+   from cuda.core import checkpoint
+
+   target_pid = os.getpid()  # or the PID of another CUDA process
+   process = checkpoint.Process(target_pid)
+   process.lock(timeout_ms=5000)
+   process.checkpoint()
+
+   # Capture or restore the CPU process image outside cuda.core.
+
+   process.restore()
+   process.unlock()
+
+``Process.state`` returns one of ``"running"``, ``"locked"``,
+``"checkpointed"``, or ``"failed"``. Restore may optionally remap GPUs by
+passing ``gpu_mapping`` from each checkpointed GPU UUID to the GPU UUID that
+should be used during restore. For migration workflows, provide mappings for
+every CUDA-visible GPU. The mapping may use ``CUuuid`` objects or the UUID
+strings returned by :attr:`Device.uuid`. A successful restore returns the
+process to the locked state; call ``Process.unlock`` after restore to allow
+CUDA API calls to resume.
+
+The CUDA driver requires restore to run from the process restore thread.
+Use ``Process.restore_thread_id`` to discover that thread before calling
+``Process.restore`` from a checkpoint coordinator. Restore also requires
+persistence mode to be enabled or ``cuInit`` to have been called before
+execution.
+
+.. autosummary::
+   :toctree: generated/
+
+   :template: class.rst
+
+   checkpoint.Process
+
+
 CUDA system information and NVIDIA Management Library (NVML)
 ------------------------------------------------------------
 

diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst
@@ -17,6 +17,7 @@ CUDA runtime
    :toctree: generated/
 
    typing.DevicePointerT
+   typing.ProcessStateT
    _memory._virtual_memory_resource.VirtualMemoryAllocationTypeT
    _memory._virtual_memory_resource.VirtualMemoryLocationTypeT
    _memory._virtual_memory_resource.VirtualMemoryGranularityT

diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst
@@ -16,7 +16,10 @@ Highlights
 New features
 ------------
 
-- TBD
+- Added the :mod:`cuda.core.checkpoint` module for CUDA process checkpointing,
+  including process state queries that return string names,
+  lock/checkpoint/restore/unlock operations, and GPU UUID remapping support
+  for restore.
+  (`#1343 <https://github.com/NVIDIA/cuda-python/issues/1343>`__)
 
 
 Fixes and enhancements