diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index dfd52accea3..3152c9ceacf 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -28,7 +28,7 @@ def _import_versioned_module():
 del _import_versioned_module
 
 
-from cuda.core import system, utils
+from cuda.core import checkpoint, system, utils
 from cuda.core._device import Device
 from cuda.core._event import Event, EventOptions
 from cuda.core._graphics import GraphicsResource
diff --git a/cuda_core/cuda/core/checkpoint.py b/cuda_core/cuda/core/checkpoint.py
new file mode 100644
index 00000000000..b5831f030ed
--- /dev/null
+++ b/cuda_core/cuda/core/checkpoint.py
@@ -0,0 +1,310 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Lock, checkpoint, restore, and unlock CUDA processes through the driver checkpoint APIs."""
+
+import ctypes as _ctypes
+from collections.abc import Mapping as _Mapping
+from typing import Any as _Any
+
+from cuda.core._utils.cuda_utils import handle_return as _handle_cuda_return
+from cuda.core._utils.version import binding_version as _binding_version
+from cuda.core._utils.version import driver_version as _driver_version
+from cuda.core.typing import ProcessStateT as _ProcessStateT
+
+try:
+    from cuda.bindings import driver as _driver
+except ImportError:
+    from cuda import cuda as _driver
+
+
+# Maps each ``CUprocessState`` member name to the string reported by ``Process.state``.
+_PROCESS_STATE_NAME_ATTRS: tuple[tuple[str, _ProcessStateT], ...] = (
+    ("CU_PROCESS_STATE_RUNNING", "running"),
+    ("CU_PROCESS_STATE_LOCKED", "locked"),
+    ("CU_PROCESS_STATE_CHECKPOINTED", "checkpointed"),
+    ("CU_PROCESS_STATE_FAILED", "failed"),
+)
+
+# Binding attributes that ``_get_driver`` requires before any checkpoint call is made.
+_REQUIRED_BINDING_ATTRS = (
+    "cuCheckpointProcessCheckpoint",
+    "cuCheckpointProcessGetRestoreThreadId",
+    "cuCheckpointProcessGetState",
+    "cuCheckpointProcessLock",
+    "cuCheckpointProcessRestore",
+    "cuCheckpointProcessUnlock",
+    "CUcheckpointGpuPair",
+    "CUcheckpointLockArgs",
+    "CUprocessState",
+    "CUcheckpointRestoreArgs",
+)
+_REQUIRED_DRIVER_VERSION = (12, 8, 0)
+# Set to True once ``_get_driver`` has validated the bindings and the driver.
+_driver_capability_checked = False
+
+
+class Process:
+    """
+    CUDA process that can be locked, checkpointed, restored, and unlocked.
+
+    Parameters
+    ----------
+    pid : int
+        Process ID of the CUDA process.
+    """
+
+    __slots__ = ("_pid",)
+
+    def __init__(self, pid: int):
+        self._pid = _check_pid(pid)
+
+    def __repr__(self) -> str:
+        return f"{type(self).__name__}(pid={self._pid})"
+
+    @property
+    def pid(self) -> int:
+        """
+        Process ID of the CUDA process.
+        """
+        return self._pid
+
+    @property
+    def state(self) -> _ProcessStateT:
+        """
+        CUDA checkpoint state for this process.
+
+        Raises
+        ------
+        RuntimeError
+            If the driver reports a state that has no known string name.
+        """
+        driver = _get_driver()
+        state = _call_driver(driver, driver.cuCheckpointProcessGetState, self._pid)
+        state_names = _get_process_state_names(driver)
+        try:
+            return state_names[state]
+        except KeyError as e:
+            state_value = int(state)
+            raise RuntimeError(f"Unknown CUDA checkpoint process state: {state_value}") from e
+
+    @property
+    def restore_thread_id(self) -> int:
+        """
+        CUDA restore thread ID for this process.
+        """
+        driver = _get_driver()
+        return _call_driver(driver, driver.cuCheckpointProcessGetRestoreThreadId, self._pid)
+
+    def lock(self, timeout_ms: int = 0) -> None:
+        """
+        Lock this process, blocking further CUDA API calls.
+
+        Parameters
+        ----------
+        timeout_ms : int, optional
+            Timeout in milliseconds. A value of 0 indicates no timeout.
+        """
+        driver = _get_driver()
+        args = driver.CUcheckpointLockArgs()
+        args.timeoutMs = _check_timeout_ms(timeout_ms)
+        _call_driver(driver, driver.cuCheckpointProcessLock, self._pid, args)
+
+    def checkpoint(self) -> None:
+        """
+        Checkpoint the GPU memory contents of this locked process.
+        """
+        driver = _get_driver()
+        _call_driver(driver, driver.cuCheckpointProcessCheckpoint, self._pid, None)
+
+    def restore(self, gpu_mapping: _Mapping[_Any, _Any] | None = None) -> None:
+        """
+        Restore this checkpointed process.
+
+        Parameters
+        ----------
+        gpu_mapping : mapping, optional
+            GPU UUID remapping from each checkpointed GPU UUID to the GPU UUID
+            to restore onto. For migration workflows, provide mappings for
+            every GPU visible to the kernel-mode driver. User-space masking
+            such as ``CUDA_VISIBLE_DEVICES`` does not reduce this mapping
+            requirement.
+        """
+        driver = _get_driver()
+        args = _make_restore_args(driver, gpu_mapping)
+        _call_driver(driver, driver.cuCheckpointProcessRestore, self._pid, args)
+
+    def unlock(self) -> None:
+        """
+        Unlock this locked process so it can resume CUDA API calls.
+        """
+        driver = _get_driver()
+        _call_driver(driver, driver.cuCheckpointProcessUnlock, self._pid, None)
+
+
+def _get_driver():
+    """Return the driver bindings module after verifying checkpoint support.
+
+    The checks run until they pass once; later calls return immediately.
+
+    Raises
+    ------
+    RuntimeError
+        If the bindings version, the binding attributes, or the installed
+        driver version do not meet the checkpoint requirements.
+    """
+    global _driver_capability_checked
+    if _driver_capability_checked:
+        return _driver
+
+    binding_ver = _binding_version()
+    if not _binding_version_supports_checkpoint(binding_ver):
+        raise RuntimeError(
+            "CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. "
+            f"Found cuda.bindings {'.'.join(str(part) for part in binding_ver[:3])}."
+        )
+
+    missing = [name for name in _REQUIRED_BINDING_ATTRS if not hasattr(_driver, name)]
+    if missing:
+        raise RuntimeError(
+            f"CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. Missing: {', '.join(missing)}"
+        )
+
+    driver_ver = _driver_version()
+    if driver_ver < _REQUIRED_DRIVER_VERSION:
+        raise RuntimeError(
+            "CUDA checkpointing is not supported by the installed NVIDIA driver. "
+            "Upgrade to a driver version with CUDA checkpoint API support."
+        )
+
+    _driver_capability_checked = True
+    return _driver
+
+
+def _binding_version_supports_checkpoint(version) -> bool:
+    """Return True for bindings 12.8+ on the 12.x line, 13.0.2+ on 13.x, or any later major."""
+    major, minor, patch = version[:3]
+    return (major == 12 and (minor, patch) >= (8, 0)) or (major == 13 and (minor, patch) >= (0, 2)) or major > 13
+
+
+def _get_process_state_names(driver) -> dict[_Any, _ProcessStateT]:
+    """Build the lookup from ``driver.CUprocessState`` members to public state strings."""
+    return {getattr(driver.CUprocessState, attr): state_name for attr, state_name in _PROCESS_STATE_NAME_ATTRS}
+
+
+def _call_driver(driver, func, *args):
+    """Call a checkpoint driver entry point and unwrap its result.
+
+    A ``RuntimeError`` reporting a ``cuCheckpointProcess`` function as not
+    found, ``CUDA_ERROR_NOT_FOUND``, and ``CUDA_ERROR_NOT_SUPPORTED`` are all
+    reported as an unsupported-driver ``RuntimeError``; any other result
+    is passed to ``handle_return``.
+    """
+    try:
+        result = func(*args)
+    except RuntimeError as e:
+        if "cuCheckpointProcess" in str(e) and "not found" in str(e):
+            raise RuntimeError(
+                "CUDA checkpointing is not supported by the installed NVIDIA driver. "
+                "Upgrade to a driver version with CUDA checkpoint API support."
+            ) from e
+        raise
+
+    err = result[0]
+    # getattr with a None default tolerates bindings whose CUresult lacks a member.
+    not_supported_errors = (
+        getattr(driver.CUresult, "CUDA_ERROR_NOT_FOUND", None),
+        getattr(driver.CUresult, "CUDA_ERROR_NOT_SUPPORTED", None),
+    )
+    if err in not_supported_errors:
+        raise RuntimeError(
+            "CUDA checkpointing is not supported by the installed NVIDIA driver. "
+            "Upgrade to a driver version with CUDA checkpoint API support."
+        )
+
+    return _handle_cuda_return(result)
+
+
+def _check_pid(pid: int) -> int:
+    """Return *pid* if it is a positive, non-bool int; raise TypeError or ValueError otherwise."""
+    if isinstance(pid, bool) or not isinstance(pid, int):
+        raise TypeError("pid must be an int")
+    if pid <= 0:
+        raise ValueError("pid must be a positive int")
+    return pid
+
+
+def _check_timeout_ms(timeout_ms: int) -> int:
+    """Return *timeout_ms* if it is a non-negative, non-bool int; raise TypeError or ValueError otherwise."""
+    if isinstance(timeout_ms, bool) or not isinstance(timeout_ms, int):
+        raise TypeError("timeout_ms must be an int")
+    if timeout_ms < 0:
+        raise ValueError("timeout_ms must be >= 0")
+    return timeout_ms
+
+
+def _make_restore_args(driver, gpu_mapping: _Mapping[_Any, _Any] | None):
+    """Build ``CUcheckpointRestoreArgs`` from *gpu_mapping*.
+
+    Returns None when *gpu_mapping* is None or empty; raises TypeError when
+    it is not a mapping.
+    """
+    if gpu_mapping is None:
+        return None
+    if not isinstance(gpu_mapping, _Mapping):
+        raise TypeError("gpu_mapping must be a mapping from checkpointed GPU UUID to restore GPU UUID")
+
+    # One keep-alive list for the whole call: every ctypes buffer backing a
+    # string-derived CUuuid stays referenced until the args are fully built.
+    buffers = []
+    pairs = []
+    for old_uuid, new_uuid in gpu_mapping.items():
+        pair = driver.CUcheckpointGpuPair()
+        pair.oldUuid = _as_cuuuid(driver, old_uuid, buffers)
+        pair.newUuid = _as_cuuuid(driver, new_uuid, buffers)
+        pairs.append(pair)
+
+    if not pairs:
+        return None
+
+    args = driver.CUcheckpointRestoreArgs()
+    args.gpuPairs = pairs
+    args.gpuPairsCount = len(pairs)
+    return args
+
+
+def _as_cuuuid(driver, value, buffers):
+    """Convert *value* to a ``CUuuid``.
+
+    Accepts a ``CUuuid`` instance (returned as-is) or a UUID string in
+    the ``"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"`` format returned by
+    :attr:`Device.uuid`.
+
+    The ctypes buffer created for a string is appended to *buffers* so the
+    caller controls how long the backing memory stays referenced.
+
+    Raises
+    ------
+    ValueError
+        If a string does not decode to exactly 16 bytes of hexadecimal data
+        once hyphens are removed.
+    """
+    if isinstance(value, str):
+        error = f"GPU UUID string must be 32 hex characters (with optional hyphens), got {value!r}"
+        try:
+            raw = bytes.fromhex(value.replace("-", ""))
+        except ValueError as e:
+            # Report malformed hex with the same message as a wrong-length UUID.
+            raise ValueError(error) from e
+        if len(raw) != 16:
+            raise ValueError(error)
+        buf = _ctypes.create_string_buffer(raw, 16)
+        buffers.append(buf)
+        return driver.CUuuid(_ctypes.addressof(buf))
+    return value
+
+
+__all__ = [
+    "Process",
+]
diff --git a/cuda_core/cuda/core/typing.py b/cuda_core/cuda/core/typing.py
index a66ab1881fb..e95331d463f 100644
--- a/cuda_core/cuda/core/typing.py
+++ b/cuda_core/cuda/core/typing.py
@@ -4,10 +4,15 @@
 
 """Public type aliases and protocols used in cuda.core API signatures."""
 
+from typing import Literal as _Literal
+
 from cuda.core._memory._buffer import DevicePointerT
 from cuda.core._stream import IsStreamT
 
+ProcessStateT = _Literal["running", "locked", "checkpointed", "failed"]
+
 __all__ = [
     "DevicePointerT",
     "IsStreamT",
+    "ProcessStateT",
 ]
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 88780732d54..667417bce28 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -174,6 +174,68 @@ CUDA compilation toolchain
    LinkerOptions
 
 
+CUDA process checkpointing
+--------------------------
+
+The :mod:`cuda.core.checkpoint` module wraps the CUDA driver process
+checkpoint APIs. These APIs are intended for Linux process checkpoint and
+restore workflows, and require a CUDA driver with checkpoint API support and
+a ``cuda-bindings`` version that exposes those driver entry points.
+
+Checkpointing is typically driven by a coordinator process acting on a target
+CUDA process, similar to attaching a debugger or sending a signal. The target
+process is identified by process ID. Linux and the CUDA driver enforce process
+permissions; checkpointing another user's process may require elevated
+permissions such as ``CAP_SYS_PTRACE`` or administrator privileges.
+
+The CUDA checkpoint APIs prepare CUDA-managed GPU state for process-level
+checkpoint and restore.
They do not capture the CPU process image by +themselves; full process checkpoint workflows still need a CPU-side process +checkpointing tool such as CRIU. A minimal coordinator-side sequence looks like +this: + +.. code-block:: python + + import os + + from cuda.core import checkpoint + + target_pid = os.getpid() # or the PID of another CUDA process + process = checkpoint.Process(target_pid) + process.lock(timeout_ms=5000) + process.checkpoint() + + # Capture or restore the CPU process image outside cuda.core. + + process.restore() + process.unlock() + +``Process.state`` returns one of ``"running"``, ``"locked"``, +``"checkpointed"``, or ``"failed"``. Restore may optionally remap GPUs by +passing ``gpu_mapping`` from each checkpointed GPU UUID to the GPU UUID that +should be used during restore. For migration workflows, provide mappings for +every GPU visible to the NVIDIA kernel-mode driver at checkpoint time. +User-space masking such as ``CUDA_VISIBLE_DEVICES`` does not reduce this +mapping requirement, so applications that rely on user-space GPU masking may +not be valid migration targets. The mapping may use ``CUuuid`` objects or the +UUID strings returned by :attr:`Device.uuid`. A successful restore returns the +process to the locked state; call ``Process.unlock`` after restore to allow +CUDA API calls to resume. + +The CUDA driver requires restore to run from the process restore thread. +Use ``Process.restore_thread_id`` to discover that thread before calling +``Process.restore`` from a checkpoint coordinator. Restore also requires +persistence mode to be enabled or ``cuInit`` to have been called before +execution. + +.. 
autosummary::
+   :toctree: generated/
+
+   :template: class.rst
+
+   checkpoint.Process
+
+
 CUDA system information and NVIDIA Management Library (NVML)
 ------------------------------------------------------------
 
diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst
index 141773967e8..3db572d619d 100644
--- a/cuda_core/docs/source/api_private.rst
+++ b/cuda_core/docs/source/api_private.rst
@@ -17,6 +17,7 @@ CUDA runtime
    :toctree: generated/
 
    typing.DevicePointerT
+   typing.ProcessStateT
    _memory._virtual_memory_resource.VirtualMemoryAllocationTypeT
    _memory._virtual_memory_resource.VirtualMemoryLocationTypeT
    _memory._virtual_memory_resource.VirtualMemoryGranularityT
diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst
index 34eff571005..f5d3645c3d6 100644
--- a/cuda_core/docs/source/release/1.0.0-notes.rst
+++ b/cuda_core/docs/source/release/1.0.0-notes.rst
@@ -16,7 +16,10 @@ Highlights
 New features
 ------------
 
-- TBD
+- Added the :mod:`cuda.core.checkpoint` module for CUDA process checkpointing,
+  including string process state queries, lock/checkpoint/restore/unlock
+  operations, and GPU UUID remapping support for restore.
+  (`#1343 <https://github.com/NVIDIA/cuda-python/issues/1343>`__)
 
 
 Fixes and enhancements
diff --git a/cuda_core/tests/test_checkpoint.py b/cuda_core/tests/test_checkpoint.py
new file mode 100644
index 00000000000..d3adb87deb6
--- /dev/null
+++ b/cuda_core/tests/test_checkpoint.py
@@ -0,0 +1,430 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Real GPU tests for cuda.core.checkpoint — no mocks.
+#
+# Driver-backed lifecycle tests run through an isolated coordinator/target
+# process pair so hangs can be timed out without wedging the pytest process.
+#
+# Migration tests attempt GPU UUID remapping following the pattern from
+# NVIDIA/cuda-checkpoint r580-migration-api.c. They require ≥2 GPUs of
+# the same chip type, an unmasked CUDA device view, and a driver that supports
+# migration; the tests skip gracefully when the hardware or driver cannot
+# satisfy this.
+
+import os
+import signal
+import subprocess
+import sys
+import textwrap
+from contextlib import suppress
+
+import pytest
+
+from cuda.core import checkpoint
+
+# -- Skip condition -------------------------------------------------------
+
+
+def _checkpoint_available():
+    """Return True if the checkpoint API is usable on this system."""
+    try:
+        checkpoint._get_driver()
+        return True
+    except RuntimeError:
+        return False
+
+
+needs_checkpoint = pytest.mark.skipif(
+    sys.platform != "linux" or not _checkpoint_available(),
+    reason="CUDA checkpoint API requires Linux and a supported driver/bindings",
+)
+
+
+# -- Helpers ---------------------------------------------------------------
+
+
+_SCENARIO_SKIP_EXIT_CODE = 77
+
+_SCENARIO_COMMON = r"""
+import os
+import subprocess
+import sys
+from contextlib import suppress
+
+from cuda.core import Device, checkpoint
+from cuda.core._utils.cuda_utils import CUDAError
+
+EXIT_SKIP = 77
+
+TARGET_SCRIPT = r'''
+import sys
+
+from cuda.core import Device
+
+device_index = int(sys.argv[1])
+Device(device_index).set_current()
+print(f"READY:{Device().uuid}", flush=True)
+
+for line in sys.stdin:
+    command = line.strip()
+    if command == "uuid":
+        print(f"UUID:{Device().uuid}", flush=True)
+    elif command == "exit":
+        break
+'''
+
+
+def skip(reason):
+    print(f"SKIP: {reason}", flush=True)
+    raise SystemExit(EXIT_SKIP)
+
+
+def run_or_skip_unsupported(func, *args, **kwargs):
+    try:
+        return func(*args, **kwargs)
+    except RuntimeError as exc:
+        if "CUDA checkpointing is not supported" in str(exc):
+            skip(str(exc))
+        raise
+
+
+def build_rotation_mapping(devices):
+    n = len(devices)
+    return {devices[i].uuid: devices[(i + 1) % n].uuid for i in range(n)}
+
+
+def find_same_chip_pair(devices):
+    seen = {}
+    for i, dev in enumerate(devices):
+        name = dev.name
+        if name in seen:
+            return (seen[name], i)
+        seen[name] = i
+    return None
+
+
+def read_prefixed(target, prefix):
+    line = target.stdout.readline()
+    if not line:
+        stderr = target.stderr.read()
+        raise RuntimeError(f"checkpoint target exited before {prefix!r}; stderr:\n{stderr}")
+    line = line.strip()
+    if not line.startswith(prefix):
+        raise RuntimeError(f"expected target output prefix {prefix!r}, got {line!r}")
+    return line[len(prefix):]
+
+
+def start_target(device_index=0):
+    target = subprocess.Popen(
+        [sys.executable, "-c", TARGET_SCRIPT, str(device_index)],
+        stdin=subprocess.PIPE,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+    try:
+        ready_uuid = read_prefixed(target, "READY:")
+    except Exception:
+        stop_target(target)
+        raise
+    return target, ready_uuid
+
+
+def stop_target(target):
+    if target.poll() is None:
+        with suppress(Exception):
+            target.stdin.write("exit\n")
+            target.stdin.flush()
+        try:
+            target.wait(timeout=5)
+        except subprocess.TimeoutExpired:
+            target.kill()
+            target.wait()
+
+
+def target_uuid(target):
+    target.stdin.write("uuid\n")
+    target.stdin.flush()
+    return read_prefixed(target, "UUID:")
+
+
+def checkpoint_restore(proc, gpu_mapping=None):
+    run_or_skip_unsupported(proc.lock, timeout_ms=5000)
+    run_or_skip_unsupported(proc.checkpoint)
+    try:
+        run_or_skip_unsupported(proc.restore, gpu_mapping=gpu_mapping)
+    except (CUDAError, RuntimeError) as exc:
+        with suppress(Exception):
+            proc.restore()
+        with suppress(Exception):
+            proc.unlock()
+        if "INVALID_VALUE" in str(exc):
+            skip(
+                "Driver does not support GPU migration on this hardware "
+                "(CUDA_ERROR_INVALID_VALUE; see NVBug 5437334)"
+            )
+        raise
+    proc.unlock()
+"""
+
+
+def _run_checkpoint_scenario_or_skip(body: str, *, timeout: int = 90) -> None:
+    """Run mutating checkpoint/restore scenarios out-of-process.
+
+    The CUDA checkpoint APIs can block inside the driver when a runner exposes
+    symbols but the platform path cannot complete checkpoint/restore. Running
+    the scenario in its own process group lets the parent test skip that runner
+    cleanly instead of hanging the entire CI job.
+    """
+    script = _SCENARIO_COMMON + "\n" + textwrap.dedent(body)
+    proc = subprocess.Popen(  # noqa: S603 - controlled test subprocess using this Python executable.
+        [sys.executable, "-c", script],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        start_new_session=True,
+    )
+    try:
+        stdout, stderr = proc.communicate(timeout=timeout)
+    except subprocess.TimeoutExpired:
+        with suppress(ProcessLookupError):
+            os.killpg(proc.pid, signal.SIGKILL)
+        stdout, stderr = proc.communicate()
+        pytest.skip(
+            f"CUDA checkpoint scenario timed out after {timeout}s; driver/hardware did not complete "
+            f"checkpoint/restore.\nstdout:\n{stdout}\nstderr:\n{stderr}"
+        )
+
+    if proc.returncode == _SCENARIO_SKIP_EXIT_CODE:
+        reason = stdout.strip() or stderr.strip() or "CUDA checkpoint scenario skipped"
+        pytest.skip(reason)
+    if proc.returncode != 0:
+        pytest.fail(
+            f"CUDA checkpoint scenario failed with exit code {proc.returncode}.\nstdout:\n{stdout}\nstderr:\n{stderr}"
+        )
+
+
+# -- Input validation (no GPU / driver needed) -----------------------------
+
+
+class TestInputValidation:
+    @pytest.mark.parametrize(
+        ("args", "error_type", "match"),
+        [
+            (("abc",), TypeError, "pid must be an int"),
+            ((True,), TypeError, "pid must be an int"),
+            ((0,), ValueError, "pid must be a positive int"),
+            ((-1,), ValueError, "pid must be a positive int"),
+        ],
+    )
+    def test_process_rejects_invalid_pid(self, args, error_type, match):
+        with pytest.raises(error_type, match=match):
+            checkpoint.Process(*args)
+
+    def test_public_symbols(self):
+        assert checkpoint.__all__ == ["Process"]
+        assert not hasattr(checkpoint, "ProcessStateT")
+
+    def test_pid_is_read_only(self):
+        proc = checkpoint.Process(1)
+        assert proc.pid == 1
+        with pytest.raises(AttributeError):
+            proc.pid = 2
+
+
+# -- Lifecycle (single GPU, real driver) -----------------------------------
+
+
+@needs_checkpoint
+class TestCheckpointLifecycle:
+    def test_initial_state_is_running(self):
+        _run_checkpoint_scenario_or_skip(
+            """
+            target, _ = start_target()
+            proc = checkpoint.Process(target.pid)
+            try:
+                assert proc.state == "running"
+            finally:
+                stop_target(target)
+            """
+        )
+
+    def test_restore_thread_id_is_positive(self):
+        _run_checkpoint_scenario_or_skip(
+            """
+            target, _ = start_target()
+            proc = checkpoint.Process(target.pid)
+            try:
+                tid = proc.restore_thread_id
+                assert isinstance(tid, int)
+                assert tid > 0
+            finally:
+                stop_target(target)
+            """
+        )
+
+    def test_lock_unlock(self):
+        _run_checkpoint_scenario_or_skip(
+            """
+            target, _ = start_target()
+            proc = checkpoint.Process(target.pid)
+            try:
+                run_or_skip_unsupported(proc.lock)
+                assert proc.state == "locked"
+                proc.unlock()
+                assert proc.state == "running"
+            finally:
+                stop_target(target)
+            """
+        )
+
+    def test_lock_default_timeout(self):
+        """lock() with the default timeout_ms=0 (no timeout)."""
+        _run_checkpoint_scenario_or_skip(
+            """
+            target, _ = start_target()
+            proc = checkpoint.Process(target.pid)
+            try:
+                run_or_skip_unsupported(proc.lock)
+                assert proc.state == "locked"
+                proc.unlock()
+            finally:
+                stop_target(target)
+            """
+        )
+
+    def test_lock_with_timeout(self):
+        _run_checkpoint_scenario_or_skip(
+            """
+            target, _ = start_target()
+            proc = checkpoint.Process(target.pid)
+            try:
+                run_or_skip_unsupported(proc.lock, timeout_ms=5000)
+                assert proc.state == "locked"
+                proc.unlock()
+            finally:
+                stop_target(target)
+            """
+        )
+
+    def test_full_cycle_no_migration(self):
+        """lock -> checkpoint -> restore -> unlock, verify state at each step."""
+        _run_checkpoint_scenario_or_skip(
+            """
+            target, _ = start_target()
+            proc = checkpoint.Process(target.pid)
+            try:
+                run_or_skip_unsupported(proc.lock, timeout_ms=5000)
+                assert proc.state == "locked"
+
+                run_or_skip_unsupported(proc.checkpoint)
+                assert proc.state == "checkpointed"
+
+                run_or_skip_unsupported(proc.restore)
+                assert proc.state == "locked"  # restore leaves process locked
+
+                proc.unlock()
+                assert proc.state == "running"
+            finally:
+                stop_target(target)
+            """
+        )
+
+
+# -- GPU migration (>= 2 same-chip GPUs, real driver) ---------------------
+
+
+@needs_checkpoint
+class TestCheckpointGpuMigration:
+    """GPU UUID remapping tests following the r580-migration-api.c pattern.
+
+    These tests require at least two GPUs of the same chip type and a
+    driver that supports checkpoint migration. They skip when the
+    hardware cannot satisfy this (e.g. heterogeneous GPUs, or a driver
+    build where migration returns CUDA_ERROR_INVALID_VALUE — see
+    NVBug 5437334).
+    """
+
+    def test_rotation_migrates_context(self):
+        """Rotate context through all GPUs and back to the origin.
+
+        Builds a rotation mapping (device i -> device (i+1) % N) for
+        every visible device and performs N rotations. After each step
+        the context device UUID is checked. After N steps the context
+        should be back on the original device.
+        """
+        _run_checkpoint_scenario_or_skip(
+            """
+            devices = Device.get_all_devices()
+            if "CUDA_VISIBLE_DEVICES" in os.environ:
+                skip(
+                    "GPU migration tests require an unmasked CUDA device view because "
+                    "the checkpoint mapping must cover every GPU visible to the kernel-mode driver"
+                )
+            if len(devices) < 2:
+                skip("GPU migration tests require at least 2 GPUs")
+            # The rotation maps every GPU onto its successor, so all of them must
+            # share one chip type; a single matching pair is not sufficient.
+            if len({dev.name for dev in devices}) != 1:
+                skip("GPU rotation requires every visible GPU to be of the same chip type")
+
+            gpu_mapping = build_rotation_mapping(devices)
+            target, uuid_origin = start_target(0)
+            proc = checkpoint.Process(target.pid)
+            try:
+                for step in range(len(devices)):
+                    expected_uuid = devices[(step + 1) % len(devices)].uuid
+                    checkpoint_restore(proc, gpu_mapping=gpu_mapping)
+                    observed_uuid = target_uuid(target)
+                    assert observed_uuid == expected_uuid, (
+                        f"Step {step}: expected UUID {expected_uuid}, got {observed_uuid}"
+                    )
+
+                assert target_uuid(target) == uuid_origin
+            finally:
+                stop_target(target)
+            """,
+            timeout=180,
+        )
+
+    def test_swap_identical_gpus(self):
+        """Swap context between two GPUs of the same chip type.
+
+        Sets the context on one of the pair members so that a successful
+        migration is observable (the context UUID changes).
+        """
+        _run_checkpoint_scenario_or_skip(
+            """
+            devices = Device.get_all_devices()
+            if "CUDA_VISIBLE_DEVICES" in os.environ:
+                skip(
+                    "GPU migration tests require an unmasked CUDA device view because "
+                    "the checkpoint mapping must cover every GPU visible to the kernel-mode driver"
+                )
+            pair = find_same_chip_pair(devices)
+            if pair is None:
+                skip("No two GPUs of the same chip type found")
+
+            i, j = pair
+            gpu_mapping = {d.uuid: d.uuid for d in devices}
+            gpu_mapping[devices[i].uuid] = devices[j].uuid
+            gpu_mapping[devices[j].uuid] = devices[i].uuid
+
+            target, uuid_before = start_target(i)
+            proc = checkpoint.Process(target.pid)
+            try:
+                assert uuid_before == devices[i].uuid
+
+                checkpoint_restore(proc, gpu_mapping=gpu_mapping)
+                uuid_after = target_uuid(target)
+
+                if uuid_after == devices[i].uuid:
+                    skip("Driver accepted GPU swap but migration is a no-op on this hardware/driver version")
+                assert uuid_after == devices[j].uuid
+            finally:
+                stop_target(target)
+            """,
+            timeout=120,
+        )
diff --git a/cuda_core/tests/test_typing_imports.py b/cuda_core/tests/test_typing_imports.py
index c05e3ae3b37..2e207d55d8b 100644
--- a/cuda_core/tests/test_typing_imports.py
+++ b/cuda_core/tests/test_typing_imports.py
@@ -10,10 +10,12 @@ def test_typing_module_imports():
     from cuda.core.typing import (
         DevicePointerT,
         IsStreamT,
+        ProcessStateT,
     )
 
     assert DevicePointerT is not None
     assert IsStreamT is not None
+    assert set(ProcessStateT.__args__) == {"running", "locked", "checkpointed", "failed"}
 
 
 def test_typing_matches_private_definitions():
@@ -23,7 +25,9 @@ def test_typing_matches_private_definitions():
     from cuda.core.typing import (
         DevicePointerT,
         IsStreamT,
+        ProcessStateT,
     )
 
     assert DevicePointerT is _DevicePointerT
     assert IsStreamT is _IsStreamT
+    assert set(ProcessStateT.__args__) == {"running", "locked", "checkpointed", "failed"}