diff --git a/src/dstack/_internal/core/backends/runpod/api_client.py b/src/dstack/_internal/core/backends/runpod/api_client.py index 11eca19e72..d7441963aa 100644 --- a/src/dstack/_internal/core/backends/runpod/api_client.py +++ b/src/dstack/_internal/core/backends/runpod/api_client.py @@ -70,6 +70,7 @@ def create_pod( network_volume_id: Optional[str] = None, allowed_cuda_versions: Optional[List[str]] = None, bid_per_gpu: Optional[float] = None, + container_registry_auth_id: Optional[str] = None, ) -> Dict: resp = self._make_request( { @@ -95,6 +96,7 @@ def create_pod( network_volume_id=network_volume_id, allowed_cuda_versions=allowed_cuda_versions, bid_per_gpu=bid_per_gpu, + container_registry_auth_id=container_registry_auth_id, ) } ) @@ -142,15 +144,10 @@ def create_cpu_pod( ) return resp.json()["data"]["deployCpuPod"] - def edit_pod( + def update_pod_container_registry_auth( self, pod_id: str, - image_name: str, - container_disk_in_gb: int, container_registry_auth_id: str, - # Default pod volume is 20GB. - # Runpod errors if it's not specified for podEditJob. - volume_in_gb: int = 20, ) -> str: resp = self._make_request( { @@ -158,10 +155,7 @@ def edit_pod( mutation {{ podEditJob(input: {{ podId: "{pod_id}" - imageName: "{image_name}" - containerDiskInGb: {container_disk_in_gb} containerRegistryAuthId: "{container_registry_auth_id}" - volumeInGb: {volume_in_gb} }}) {{ id }} @@ -454,29 +448,24 @@ def _generate_pod_deployment_mutation( network_volume_id: Optional[str] = None, allowed_cuda_versions: Optional[List[str]] = None, bid_per_gpu: Optional[float] = None, + container_registry_auth_id: Optional[str] = None, ) -> str: """ Generates a mutation to deploy pod. """ input_fields = [] - - # ------------------------------ Required Fields ----------------------------- # input_fields.append(f'name: "{name}"') input_fields.append(f'imageName: "{image_name}"') input_fields.append(f'gpuTypeId: "{gpu_type_id}"') - - # ------------------------------ Default Fields ------------------------------ # input_fields.append(f"cloudType: {cloud_type}") + input_fields.append(f'minCudaVersion: "{RunpodProvider.MIN_CUDA_VERSION}"') if start_ssh: input_fields.append("startSsh: true") - if support_public_ip: input_fields.append("supportPublicIp: true") else: input_fields.append("supportPublicIp: false") - - # ------------------------------ Optional Fields ----------------------------- # if bid_per_gpu is not None: input_fields.append(f"bidPerGpu: {bid_per_gpu}") if data_center_id is not None: @@ -507,20 +496,18 @@ def _generate_pod_deployment_mutation( input_fields.append(f"env: [{env_string}]") if template_id is not None: input_fields.append(f'templateId: "{template_id}"') - if network_volume_id is not None: input_fields.append(f'networkVolumeId: "{network_volume_id}"') - if allowed_cuda_versions is not None: allowed_cuda_versions_string = ", ".join( [f'"{version}"' for version in allowed_cuda_versions] ) input_fields.append(f"allowedCudaVersions: [{allowed_cuda_versions_string}]") - - input_fields.append(f'minCudaVersion: "{RunpodProvider.MIN_CUDA_VERSION}"') + if container_registry_auth_id is not None: + input_fields.append(f'containerRegistryAuthId: "{container_registry_auth_id}"') pod_deploy = "podFindAndDeployOnDemand" if bid_per_gpu is None else "podRentInterruptable" - # Format input fields + input_string = ", ".join(input_fields) return f""" mutation {{ diff --git a/src/dstack/_internal/core/backends/runpod/compute.py b/src/dstack/_internal/core/backends/runpod/compute.py index 5a3a23210a..d312db5f2e 100644 --- a/src/dstack/_internal/core/backends/runpod/compute.py +++ b/src/dstack/_internal/core/backends/runpod/compute.py @@ -165,6 +165,7 @@ def run_job( resp = self.api_client.create_cpu_pod( name=pod_name, image_name=job.job_spec.image_name, + container_registry_auth_id=container_registry_auth_id, instance_id=instance_offer.instance.name, cloud_type="SECURE", deploy_cost=instance_offer.price, @@ -193,6 +194,7 @@ def run_job( resp = self.api_client.create_pod( name=pod_name, image_name=job.job_spec.image_name, + container_registry_auth_id=container_registry_auth_id, gpu_type_id=instance_offer.instance.name, cloud_type=cloud_type, data_center_id=data_center_id, @@ -212,18 +214,6 @@ def run_job( instance_id = resp["id"] - # Call edit_pod to pass container_registry_auth_id. - # Expect a long time (~5m) for the pod to pick up the creds. - # TODO: remove editPod once Runpod's create mutations support docker's username/password - # (or a reliable containerRegistryAuthId at create time). - if container_registry_auth_id is not None: - instance_id = self.api_client.edit_pod( - pod_id=instance_id, - image_name=job.job_spec.image_name, - container_disk_in_gb=disk_size, - container_registry_auth_id=container_registry_auth_id, - ) - if ( self._last_cleanup_time is None or self._last_cleanup_time @@ -316,13 +306,15 @@ def run_jobs( env={"RUNPOD_POD_USER": "0"}, ) - # An "edit pod" trick to pass container registry creds. + # Unlike create mutations for individual pods, createCluster mutation doesn't accept + # containerRegistryAuthId. + # The workaround is to inject containerRegistryAuthId into already created pods. + # Expect a long time (~5m) for the pods to pick up the creds. + # TODO: remove once createCluster supports containerRegistryAuthId if container_registry_auth_id is not None: for pod in resp["pods"]: - self.api_client.edit_pod( + self.api_client.update_pod_container_registry_auth( pod_id=pod["id"], - image_name=master_job.job_spec.image_name, - container_disk_in_gb=disk_size, container_registry_auth_id=container_registry_auth_id, )