From 6fa4ca38b56f8c4351619730b42bae0deb970dea Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Tue, 28 Apr 2026 15:20:42 +0000 Subject: [PATCH] Kubernetes: configure `imagePullSecrets` New permissions required: rules: resources: ["secrets"] verbs: ["create", "delete"] Fixes: https://github.com/dstackai/dstack/issues/3829 --- docs/docs/concepts/backends.md | 3 + .../core/backends/kubernetes/compute.py | 32 ++++++++ .../core/backends/kubernetes/resources.py | 18 +++++ .../core/backends/vastai/api_client.py | 2 +- .../server/services/backends/provisioning.py | 3 +- .../_internal/server/services/docker.py | 78 +++---------------- src/dstack/_internal/utils/docker.py | 45 +++++++++++ src/tests/_internal/utils/test_docker.py | 60 ++++++++++++++ 8 files changed, 172 insertions(+), 69 deletions(-) create mode 100644 src/dstack/_internal/utils/docker.py create mode 100644 src/tests/_internal/utils/test_docker.py diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index a57ca08fbe..5bded6ba03 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -1145,6 +1145,9 @@ projects: - apiGroups: [""] resources: ["persistentvolumeclaims"] verbs: ["get", "create", "delete"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["create", "delete"] ``` Ensure you've created a ClusterRoleBinding to grant the role to the user or the service account you're using. diff --git a/src/dstack/_internal/core/backends/kubernetes/compute.py b/src/dstack/_internal/core/backends/kubernetes/compute.py index 4ea833bd08..bbde8d42e2 100644 --- a/src/dstack/_internal/core/backends/kubernetes/compute.py +++ b/src/dstack/_internal/core/backends/kubernetes/compute.py @@ -43,6 +43,7 @@ OBJECT_NAME_MAX_LENGTH, PodPhase, TaintEffect, + build_dockerconfigjson, filter_invalid_labels, format_dstack_label_key, format_memory, @@ -162,6 +163,25 @@ def run_job( project_ssh_public_key=project_ssh_public_key.strip(), ) + image_pull_secrets: Optional[list[client.V1LocalObjectReference]] = None + if job.job_spec.registry_auth is not None: + registry_auth_secret_name = _get_registry_auth_secret_name(instance_name) + dockerconfigjson = build_dockerconfigjson( + image_name=job.job_spec.image_name, + username=job.job_spec.registry_auth.username, + password=job.job_spec.registry_auth.password, + ) + registry_auth_secret = client.V1Secret( + metadata=client.V1ObjectMeta(name=registry_auth_secret_name), + type="kubernetes.io/dockerconfigjson", + string_data={".dockerconfigjson": dockerconfigjson}, + ) + self.api.create_namespaced_secret( + namespace=self.config.namespace, + body=registry_auth_secret, + ) + image_pull_secrets = [client.V1LocalObjectReference(name=registry_auth_secret_name)] + resources_requests: dict[str, str] = {} resources_limits: dict[str, str] = {} node_affinity: Optional[client.V1NodeAffinity] = None @@ -311,6 +331,7 @@ def run_job( volume_mounts=volume_mounts, ) ], + image_pull_secrets=image_pull_secrets, affinity=client.V1Affinity( node_affinity=node_affinity, ), @@ -437,6 +458,13 @@ def terminate_instance( namespace=self.config.namespace, body=client.V1DeleteOptions(), ) + call_api_method( + self.api.delete_namespaced_secret, + expected=404, + name=_get_registry_auth_secret_name(instance_id), + namespace=self.config.namespace, + body=client.V1DeleteOptions(), + ) def create_gateway( self, @@ -1108,3 +1136,7 @@ def _run_ssh_command( def _get_pod_service_name(pod_name: str) -> str: return f"{pod_name}-service" + + +def _get_registry_auth_secret_name(pod_name: str) -> str: + return f"{pod_name}-registry-auth" diff --git a/src/dstack/_internal/core/backends/kubernetes/resources.py b/src/dstack/_internal/core/backends/kubernetes/resources.py index 0a3907a984..fa29d7b513 100644 --- a/src/dstack/_internal/core/backends/kubernetes/resources.py +++ b/src/dstack/_internal/core/backends/kubernetes/resources.py @@ -1,4 +1,6 @@ +import base64 import dataclasses +import json import re from collections.abc import Mapping from decimal import Decimal @@ -26,6 +28,7 @@ ) from dstack._internal.core.models.resources import CPUSpec, GPUSpec, Memory from dstack._internal.core.models.runs import Requirements +from dstack._internal.utils import docker as docker_utils from dstack._internal.utils.common import get_or_error from dstack._internal.utils.logging import get_logger @@ -179,6 +182,21 @@ def format_dstack_label_key(name: str) -> str: return f"k8s.dstack.ai/{name}" +def build_dockerconfigjson(image_name: str, username: str, password: str) -> str: + registry = docker_utils.parse_image_name(image_name).registry + if registry is None or docker_utils.is_default_registry(registry): + # https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ + # > Use https://index.docker.io/v1/ for DockerHub + registry = "https://index.docker.io/v1/" + auth = base64.b64encode(f"{username}:{password}".encode()).decode() + entry = { + "username": username, + "password": password, + "auth": auth, + } + return json.dumps({"auths": {registry: entry}}) + + parse_quantity = cast( Callable[[Union[str, int, float, Decimal]], Decimal], _kubernetes_utils.parse_quantity ) diff --git a/src/dstack/_internal/core/backends/vastai/api_client.py b/src/dstack/_internal/core/backends/vastai/api_client.py index 9646c5fa5c..1d8e4d8f36 100644 --- a/src/dstack/_internal/core/backends/vastai/api_client.py +++ b/src/dstack/_internal/core/backends/vastai/api_client.py @@ -5,7 +5,7 @@ import requests from requests.adapters import HTTPAdapter, Retry -import dstack._internal.server.services.docker as docker +import dstack._internal.utils.docker as docker from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT from dstack._internal.core.errors import NoCapacityError from dstack._internal.core.models.common import RegistryAuth diff --git a/src/dstack/_internal/server/services/backends/provisioning.py b/src/dstack/_internal/server/services/backends/provisioning.py index 0ae3187a03..b02deb8ade 100644 --- a/src/dstack/_internal/server/services/backends/provisioning.py +++ b/src/dstack/_internal/server/services/backends/provisioning.py @@ -7,7 +7,8 @@ from dstack._internal.core.models.runs import JobProvisioningData from dstack._internal.core.models.volumes import InstanceMountPoint from dstack._internal.server.schemas.runner import GPUDevice -from dstack._internal.server.services.docker import apply_server_docker_defaults, parse_image_name +from dstack._internal.server.services.docker import apply_server_docker_defaults +from dstack._internal.utils.docker import parse_image_name # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types _AWS_EFA_ENABLED_INSTANCE_TYPE_PATTERNS = [ diff --git a/src/dstack/_internal/server/services/docker.py b/src/dstack/_internal/server/services/docker.py index 5aead476b6..41580e473b 100644 --- a/src/dstack/_internal/server/services/docker.py +++ b/src/dstack/_internal/server/services/docker.py @@ -9,16 +9,16 @@ from typing_extensions import Annotated from dstack._internal.core.errors import DockerRegistryError -from dstack._internal.core.models.common import ( - CoreModel, - FrozenCoreModel, - RegistryAuth, -) +from dstack._internal.core.models.common import CoreModel, RegistryAuth from dstack._internal.server import settings as server_settings from dstack._internal.server.utils.common import join_byte_stream_checked +from dstack._internal.utils.docker import ( + LEGACY_DEFAULT_REGISTRY, + is_default_registry, + parse_image_name, +) DEFAULT_PLATFORM = "linux/amd64" -DEFAULT_REGISTRY = "index.docker.io" MAX_CONFIG_OBJECT_SIZE = 2**22 # 4 MiB REGISTRY_REQUEST_TIMEOUT = 20 @@ -35,14 +35,6 @@ def __call__(self, dxf: DXF, response: requests.Response) -> None: ) -class DockerImage(FrozenCoreModel): - image: str - registry: Optional[str] = None - repo: str - tag: str - digest: Optional[str] = None - - class ImageConfig(CoreModel): user: Annotated[Optional[str], Field(alias="User")] = None entrypoint: Annotated[Optional[List[str]], Field(alias="Entrypoint")] = None @@ -75,8 +67,12 @@ class ImageManifest(CoreModel): def get_image_config(image_name: str, registry_auth: Optional[RegistryAuth]) -> ImageConfigObject: image = parse_image_name(image_name) + registry = image.registry + if registry is None or is_default_registry(registry): + registry = LEGACY_DEFAULT_REGISTRY + registry_client = DXF( - host=image.registry or DEFAULT_REGISTRY, + host=registry, repo=image.repo, auth=DXFAuthAdapter(registry_auth), # type: ignore[assignment] timeout=REGISTRY_REQUEST_TIMEOUT, @@ -100,58 +96,6 @@ def get_image_config(image_name: str, registry_auth: Optional[RegistryAuth]) -> raise DockerRegistryError(e) -def parse_image_name(image: str) -> DockerImage: - """ - :param image: docker image name - :return: registry host, repo, tag, digest - - >>> parse_image_name("ubuntu:22.04") - DockerImage(registry=None, repo='library/ubuntu', tag='22.04', digest=None) - >>> parse_image_name("dstackai/miniforge:py3.9-0.2") - DockerImage(registry=None, repo='dstackai/miniforge', tag='py3.9-0.2', digest=None) - >>> parse_image_name("ghcr.io/dstackai/miniforge") - DockerImage(registry='ghcr.io', repo='dstackai/miniforge', tag='latest', digest=None) - >>> parse_image_name("dstackai/miniforge@sha256:a4ba18a847a172a248d68faf6689e69fae4779b90b250211b79a26d21ddd6a15") - DockerImage(registry=None, repo='dstackai/miniforge', tag='latest', digest='sha256:a4ba18a847a172a248d68faf6689e69fae4779b90b250211b79a26d21ddd6a15') - """ - - digest = None - if "@" in image.split("/")[-1]: - image, digest = image.rsplit("@", maxsplit=1) - - tag = "latest" - if ":" in image.split("/")[-1]: # avoid detecting port as a tag - image, tag = image.rsplit(":", maxsplit=1) - - registry = None - components = image.split("/") - if len(components) == 1: # default registry, official image - repo = "library/" + components[0] - elif not is_host(components[0]): # default registry, custom image - repo = "/".join(components) - else: # custom registry - registry = components[0] - repo = "/".join(components[1:]) - - return DockerImage(image=image, registry=registry, repo=repo, tag=tag, digest=digest) - - -def is_host(s: str) -> bool: - """ - >>> is_host("localhost") - True - >>> is_host("localhost:5000") - True - >>> is_host("ghcr.io") - True - >>> is_host("127.0.0.1") - True - >>> is_host("dstackai") - False - """ - return s == "localhost" or ":" in s or "." in s - - def apply_server_docker_defaults( image_name: str, registry_auth: Optional[RegistryAuth], diff --git a/src/dstack/_internal/utils/docker.py b/src/dstack/_internal/utils/docker.py new file mode 100644 index 0000000000..780f37e86d --- /dev/null +++ b/src/dstack/_internal/utils/docker.py @@ -0,0 +1,45 @@ +from dataclasses import dataclass +from typing import Optional + +# https://github.com/distribution/reference/blob/0965666a6ade2e06035fe352e38344be1e68951a/normalize.go#L11-L31 +DEFAULT_REGISTRY = "docker.io" +LEGACY_DEFAULT_REGISTRY = "index.docker.io" + + +@dataclass(kw_only=True) +class DockerImage: + image: str + registry: Optional[str] = None + repo: str + tag: str + digest: Optional[str] = None + + +def parse_image_name(image: str) -> DockerImage: + digest = None + if "@" in image.split("/")[-1]: + image, digest = image.rsplit("@", maxsplit=1) + + tag = "latest" + if ":" in image.split("/")[-1]: # avoid detecting port as a tag + image, tag = image.rsplit(":", maxsplit=1) + + registry = None + components = image.split("/") + if len(components) == 1: # default registry, official image + repo = "library/" + components[0] + elif not _is_host(components[0]): # default registry, custom image + repo = "/".join(components) + else: # custom registry + registry = components[0] + repo = "/".join(components[1:]) + + return DockerImage(image=image, registry=registry, repo=repo, tag=tag, digest=digest) + + +def is_default_registry(registry: str) -> bool: + return registry in [DEFAULT_REGISTRY, LEGACY_DEFAULT_REGISTRY] + + +def _is_host(s: str) -> bool: + return s == "localhost" or ":" in s or "." in s diff --git a/src/tests/_internal/utils/test_docker.py b/src/tests/_internal/utils/test_docker.py new file mode 100644 index 0000000000..b2a16e4dc2 --- /dev/null +++ b/src/tests/_internal/utils/test_docker.py @@ -0,0 +1,60 @@ +import pytest + +from dstack._internal.utils.docker import DockerImage, _is_host, parse_image_name + + +class TestParseImageName: + @pytest.mark.parametrize( + ["image", "expected"], + [ + ( + "ubuntu:22.04", + DockerImage(image="ubuntu", registry=None, repo="library/ubuntu", tag="22.04"), + ), + ( + "dstackai/miniforge:py3.9-0.2", + DockerImage( + image="dstackai/miniforge", + registry=None, + repo="dstackai/miniforge", + tag="py3.9-0.2", + ), + ), + ( + "ghcr.io/dstackai/miniforge", + DockerImage( + image="ghcr.io/dstackai/miniforge", + registry="ghcr.io", + repo="dstackai/miniforge", + tag="latest", + ), + ), + ( + "dstackai/miniforge@sha256:a4ba18a847a172a248d68faf6689e69fae4779b90b250211b79a26d21ddd6a15", + DockerImage( + image="dstackai/miniforge", + registry=None, + repo="dstackai/miniforge", + tag="latest", + digest="sha256:a4ba18a847a172a248d68faf6689e69fae4779b90b250211b79a26d21ddd6a15", + ), + ), + ], + ) + def test_parse(self, image: str, expected: DockerImage) -> None: + assert parse_image_name(image) == expected + + +class TestIsHost: + @pytest.mark.parametrize( + ["value", "expected"], + [ + ("localhost", True), + ("localhost:5000", True), + ("ghcr.io", True), + ("127.0.0.1", True), + ("dstackai", False), + ], + ) + def test_is_host(self, value: str, expected: bool) -> None: + assert _is_host(value) is expected