Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 8 additions & 21 deletions src/dstack/_internal/core/backends/runpod/api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def create_pod(
network_volume_id: Optional[str] = None,
allowed_cuda_versions: Optional[List[str]] = None,
bid_per_gpu: Optional[float] = None,
container_registry_auth_id: Optional[str] = None,
) -> Dict:
resp = self._make_request(
{
Expand All @@ -95,6 +96,7 @@ def create_pod(
network_volume_id=network_volume_id,
allowed_cuda_versions=allowed_cuda_versions,
bid_per_gpu=bid_per_gpu,
container_registry_auth_id=container_registry_auth_id,
)
}
)
Expand Down Expand Up @@ -142,26 +144,18 @@ def create_cpu_pod(
)
return resp.json()["data"]["deployCpuPod"]

def edit_pod(
def update_pod_container_registry_auth(
self,
pod_id: str,
image_name: str,
container_disk_in_gb: int,
container_registry_auth_id: str,
# Default pod volume is 20GB.
# Runpod errors if it's not specified for podEditJob.
volume_in_gb: int = 20,
) -> str:
resp = self._make_request(
{
"query": f"""
mutation {{
podEditJob(input: {{
podId: "{pod_id}"
imageName: "{image_name}"
containerDiskInGb: {container_disk_in_gb}
containerRegistryAuthId: "{container_registry_auth_id}"
volumeInGb: {volume_in_gb}
}}) {{
id
}}
Expand Down Expand Up @@ -454,29 +448,24 @@ def _generate_pod_deployment_mutation(
network_volume_id: Optional[str] = None,
allowed_cuda_versions: Optional[List[str]] = None,
bid_per_gpu: Optional[float] = None,
container_registry_auth_id: Optional[str] = None,
) -> str:
"""
Generates a mutation to deploy pod.
"""
input_fields = []

# ------------------------------ Required Fields ----------------------------- #
input_fields.append(f'name: "{name}"')
input_fields.append(f'imageName: "{image_name}"')
input_fields.append(f'gpuTypeId: "{gpu_type_id}"')

# ------------------------------ Default Fields ------------------------------ #
input_fields.append(f"cloudType: {cloud_type}")
input_fields.append(f'minCudaVersion: "{RunpodProvider.MIN_CUDA_VERSION}"')

if start_ssh:
input_fields.append("startSsh: true")

if support_public_ip:
input_fields.append("supportPublicIp: true")
else:
input_fields.append("supportPublicIp: false")

# ------------------------------ Optional Fields ----------------------------- #
if bid_per_gpu is not None:
input_fields.append(f"bidPerGpu: {bid_per_gpu}")
if data_center_id is not None:
Expand Down Expand Up @@ -507,20 +496,18 @@ def _generate_pod_deployment_mutation(
input_fields.append(f"env: [{env_string}]")
if template_id is not None:
input_fields.append(f'templateId: "{template_id}"')

if network_volume_id is not None:
input_fields.append(f'networkVolumeId: "{network_volume_id}"')

if allowed_cuda_versions is not None:
allowed_cuda_versions_string = ", ".join(
[f'"{version}"' for version in allowed_cuda_versions]
)
input_fields.append(f"allowedCudaVersions: [{allowed_cuda_versions_string}]")

input_fields.append(f'minCudaVersion: "{RunpodProvider.MIN_CUDA_VERSION}"')
if container_registry_auth_id is not None:
input_fields.append(f'containerRegistryAuthId: "{container_registry_auth_id}"')

pod_deploy = "podFindAndDeployOnDemand" if bid_per_gpu is None else "podRentInterruptable"
# Format input fields

input_string = ", ".join(input_fields)
return f"""
mutation {{
Expand Down
24 changes: 8 additions & 16 deletions src/dstack/_internal/core/backends/runpod/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ def run_job(
resp = self.api_client.create_cpu_pod(
name=pod_name,
image_name=job.job_spec.image_name,
container_registry_auth_id=container_registry_auth_id,
instance_id=instance_offer.instance.name,
cloud_type="SECURE",
deploy_cost=instance_offer.price,
Expand Down Expand Up @@ -193,6 +194,7 @@ def run_job(
resp = self.api_client.create_pod(
name=pod_name,
image_name=job.job_spec.image_name,
container_registry_auth_id=container_registry_auth_id,
gpu_type_id=instance_offer.instance.name,
cloud_type=cloud_type,
data_center_id=data_center_id,
Expand All @@ -212,18 +214,6 @@ def run_job(

instance_id = resp["id"]

# Call edit_pod to pass container_registry_auth_id.
# Expect a long time (~5m) for the pod to pick up the creds.
# TODO: remove editPod once Runpod's create mutations support docker's username/password
# (or a reliable containerRegistryAuthId at create time).
if container_registry_auth_id is not None:
instance_id = self.api_client.edit_pod(
pod_id=instance_id,
image_name=job.job_spec.image_name,
container_disk_in_gb=disk_size,
container_registry_auth_id=container_registry_auth_id,
)

if (
self._last_cleanup_time is None
or self._last_cleanup_time
Expand Down Expand Up @@ -316,13 +306,15 @@ def run_jobs(
env={"RUNPOD_POD_USER": "0"},
)

# An "edit pod" trick to pass container registry creds.
# Unlike create mutations for individual pods, createCluster mutation doesn't accept
# containerRegistryAuthId.
# The workaround is to inject containerRegistryAuthId into already created pods.
# Expect a long time (~5m) for the pods to pick up the creds.
# TODO: remove once createCluster supports containerRegistryAuthId
if container_registry_auth_id is not None:
for pod in resp["pods"]:
self.api_client.edit_pod(
self.api_client.update_pod_container_registry_auth(
pod_id=pod["id"],
image_name=master_job.job_spec.image_name,
container_disk_in_gb=disk_size,
container_registry_auth_id=container_registry_auth_id,
)

Expand Down
Loading