From ceb0531af442cf18f919d08084c0b7468f17cc4f Mon Sep 17 00:00:00 2001 From: HR Wu <5631010+heiruwu@users.noreply.github.com> Date: Thu, 11 Apr 2024 04:10:54 +0800 Subject: [PATCH] chore(helm): mark GPU node as no CPU resource available (#474) Because - In the current setup, GPU nodes advertise available CPU resources, so pure-CPU models are sometimes scheduled onto GPU nodes. This is undesirable: CPU models should only be scheduled on CPU nodes. This commit - marks GPU nodes as having no CPU resources available for scheduling --- charts/core/templates/ray-service/ray-service.yaml | 3 +++ charts/core/values.yaml | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/charts/core/templates/ray-service/ray-service.yaml b/charts/core/templates/ray-service/ray-service.yaml index 5846acd6..b2e8f38d 100644 --- a/charts/core/templates/ray-service/ray-service.yaml +++ b/charts/core/templates/ray-service/ray-service.yaml @@ -114,6 +114,9 @@ spec: maxReplicas: {{ $workerGroupSpecs.maxReplicas }} groupName: {{ $workerGroupSpecs.groupName }} rayStartParams: + {{- if $workerGroupSpecs.gpuWorkerGroup.enabled }} + num-cpus: "0" + {{- end }} {{- if $workerGroupSpecs.gpuWorkerGroup.customResource }} resources: {{ $workerGroupSpecs.gpuWorkerGroup.customResource }} {{- end }} diff --git a/charts/core/values.yaml b/charts/core/values.yaml index 7901cf11..2a4a21ae 100644 --- a/charts/core/values.yaml +++ b/charts/core/values.yaml @@ -687,11 +687,11 @@ rayService: headGroupSpec: resources: limits: - cpu: "0" + cpu: "2" memory: "4Gi" nvidia.com/gpu: 0 requests: - cpu: "0" + cpu: "2" memory: "4Gi" nvidia.com/gpu: 0 affinity: {}