From d042a00e27058aa6c6e8f68b182d4e98cd43bcfc Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Sun, 18 Aug 2024 00:27:09 +0530 Subject: [PATCH 01/24] enable istio support for the ray Signed-off-by: hansinikarunarathne --- contrib/ray/raycluster_example.yaml | 140 ------------------ contrib/ray/raycluster_istio.yaml | 104 +++++++++++++ .../ray/raycluster_istio_headless_svc.yaml | 64 ++++++++ 3 files changed, 168 insertions(+), 140 deletions(-) delete mode 100644 contrib/ray/raycluster_example.yaml create mode 100644 contrib/ray/raycluster_istio.yaml create mode 100644 contrib/ray/raycluster_istio_headless_svc.yaml diff --git a/contrib/ray/raycluster_example.yaml b/contrib/ray/raycluster_example.yaml deleted file mode 100644 index 0de4047263..0000000000 --- a/contrib/ray/raycluster_example.yaml +++ /dev/null @@ -1,140 +0,0 @@ -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: kubeflow-raycluster -spec: - rayVersion: '2.23.0' - # If `enableInTreeAutoscaling` is true, the Autoscaler sidecar will be added to the Ray head pod. - enableInTreeAutoscaling: true - # `autoscalerOptions` is an OPTIONAL field specifying configuration overrides for the Ray Autoscaler. - # The example configuration shown below below represents the DEFAULT values. - # (You may delete autoscalerOptions if the defaults are suitable.) - autoscalerOptions: - # Default: Upscaling is not rate-limited. This mode adds new worker pods to handle increased workload as quiclky as possible. - upscalingMode: Default - # `idleTimeoutSeconds` is the number of seconds to wait before scaling down a worker pod which is not using Ray resources. - idleTimeoutSeconds: 60 - # Ray head pod configuration - headGroupSpec: - # Kubernetes Service Type. - serviceType: ClusterIP - # The following params are used to complete the ray start: ray start --head --block --dashboard-host: '0.0.0.0' ... - rayStartParams: - # Setting "num-cpus: 0" to avoid any Ray actors or tasks being scheduled on the Ray head Pod. 
- num-cpus: "0" - dashboard-host: '0.0.0.0' - block: 'true' - # pod template - template: - metadata: - # Custom labels. NOTE: To avoid conflicts with KubeRay operator, do not define custom labels start with `raycluster`. - # Refer to https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ - # The ray head must not have an Istio sidecar - # TODO add an authorizationpolicy in the future for the ray head - labels: - sidecar.istio.io/inject: "false" - spec: - containers: - - name: ray-head - image: rayproject/ray:2.23.0-py311-cpu - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - lifecycle: - preStop: - exec: - command: ["/bin/sh","-c","ray stop"] - volumeMounts: - - mountPath: /tmp/ray - name: ray-logs - # The resource requests and limits in this config are too small for production! - # It is better to use a few large Ray pod than many small ones. - # For production, it is ideal to size each Ray pod to take up the - # entire Kubernetes node on which it is scheduled. - resources: - limits: - cpu: "1" - memory: "2G" - requests: - cpu: "100m" - memory: "2G" - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: ["ALL"] - runAsNonRoot: true - seccompProfile: - type: RuntimeDefault - volumes: - - name: ray-logs - emptyDir: {} - workerGroupSpecs: - # the pod replicas in this group typed worker - - replicas: 1 - minReplicas: 1 - maxReplicas: 10 - # logical group name, for this called small-group, also can be functional - groupName: small-group - rayStartParams: - block: 'true' - #pod template - template: - metadata: - labels: - # Disable the sidecars for the ray wokers - # TODO add an authorizationpolicy in the future for the ray worker - sidecar.istio.io/inject: "false" - spec: - containers: - - name: ray-worker - image: rayproject/ray:2.23.0-py311-cpu - lifecycle: - preStop: - exec: - command: ["/bin/sh","-c","ray stop"] - # use volumeMounts.Optional. 
- # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ - volumeMounts: - - mountPath: /tmp/ray - name: ray-logs - # The resource requests and limits in this config are too small for production! - # It is better to use a few large Ray pod than many small ones. - # For production, it is ideal to size each Ray pod to take up the - # entire Kubernetes node on which it is scheduled. - resources: - limits: - cpu: "1" - memory: "1G" - requests: - cpu: "300m" - memory: "1G" - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: ["ALL"] - runAsNonRoot: true - seccompProfile: - type: RuntimeDefault - initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name - - name: init - image: busybox:1.36 - # Change the cluster postfix if you don't have a default setting - command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for K8s Service $RAY_IP; sleep 2; done"] - securityContext: - runAsUser: 1000 - allowPrivilegeEscalation: false - capabilities: - drop: ["ALL"] - runAsNonRoot: true - seccompProfile: - type: RuntimeDefault - # use volumes - # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ - volumes: - - name: ray-logs - emptyDir: {} diff --git a/contrib/ray/raycluster_istio.yaml b/contrib/ray/raycluster_istio.yaml new file mode 100644 index 0000000000..308aa24362 --- /dev/null +++ b/contrib/ray/raycluster_istio.yaml @@ -0,0 +1,104 @@ +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: raycluster-istio +spec: + rayVersion: '2.23.0' + enableInTreeAutoscaling: true + autoscalerOptions: + upscalingMode: Default + idleTimeoutSeconds: 60 + headGroupSpec: + rayStartParams: + num-cpus: '1' + node-manager-port: '6380' + object-manager-port: '6381' + runtime-env-agent-port: '6382' + dashboard-agent-grpc-port: '6383' + dashboard-agent-listen-port: '52365' + metrics-export-port: '8080' + 
max-worker-port: '10012' + node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.default.svc.cluster.local + template: + spec: + containers: + - name: ray-head + image: rayproject/ray:2.23.0-py311-cpu + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # The resource requests and limits in this config are too small for production! + # It is better to use a few large Ray pod than many small ones. + # For production, it is ideal to size each Ray pod to take up the + # entire Kubernetes node on which it is scheduled. + resources: + limits: + cpu: "2" + memory: "2G" + requests: + cpu: "100m" + memory: "2G" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + volumes: + - name: ray-logs + emptyDir: {} + workerGroupSpecs: + - replicas: 1 + minReplicas: 1 + maxReplicas: 1 + groupName: small-group + rayStartParams: + num-cpus: '1' + node-manager-port: '6380' + object-manager-port: '6381' + runtime-env-agent-port: '6382' + dashboard-agent-grpc-port: '6383' + dashboard-agent-listen-port: '52365' + metrics-export-port: '8080' + max-worker-port: '10012' + node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.default.svc.cluster.local + template: + spec: + containers: + - name: ray-worker + image: rayproject/ray:2.23.0-py311-cpu + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + # use volumeMounts.Optional. + # Refer to https://kubernetes.io/docs/concepts/storage/volumes/ + volumeMounts: + - mountPath: /tmp/ray + name: ray-logs + # The resource requests and limits in this config are too small for production! + # It is better to use a few large Ray pod than many small ones. + # For production, it is ideal to size each Ray pod to take up the + # entire Kubernetes node on which it is scheduled. 
+ resources: + limits: + cpu: "2" + memory: "1G" + requests: + cpu: "300m" + memory: "1G" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + volumes: + - name: ray-logs + emptyDir: {} diff --git a/contrib/ray/raycluster_istio_headless_svc.yaml b/contrib/ray/raycluster_istio_headless_svc.yaml new file mode 100644 index 0000000000..2e73971691 --- /dev/null +++ b/contrib/ray/raycluster_istio_headless_svc.yaml @@ -0,0 +1,64 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + ray.io/headless-worker-svc: raycluster-istio + name: raycluster-istio-headless-svc + namespace: default +spec: + clusterIP: None + selector: + ray.io/cluster: raycluster-istio + publishNotReadyAddresses: true + ports: + - name: node-manager-port + port: 6380 + appProtocol: grpc + - name: object-manager-port + port: 6381 + appProtocol: grpc + - name: runtime-env-agent-port + port: 6382 + appProtocol: grpc + - name: dashboard-agent-grpc-port + port: 6383 + appProtocol: grpc + - name: dashboard-agent-listen-port + port: 52365 + appProtocol: http + - name: metrics-export-port + port: 8080 + appProtocol: http + - name: p10002 + port: 10002 + appProtocol: grpc + - name: p10003 + port: 10003 + appProtocol: grpc + - name: p10004 + port: 10004 + appProtocol: grpc + - name: p10005 + port: 10005 + appProtocol: grpc + - name: p10006 + port: 10006 + appProtocol: grpc + - name: p10007 + port: 10007 + appProtocol: grpc + - name: p10008 + port: 10008 + appProtocol: grpc + - name: p10009 + port: 10009 + appProtocol: grpc + - name: p10010 + port: 10010 + appProtocol: grpc + - name: p10011 + port: 10011 + appProtocol: grpc + - name: p10012 + port: 10012 + appProtocol: grpc \ No newline at end of file From 29312320c678e9d783616a40223fc04bb21f8bc6 Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Sun, 18 Aug 2024 00:37:07 +0530 Subject: [PATCH 02/24] change the yaml file name Signed-off-by: hansinikarunarathne 
--- contrib/ray/{raycluster_istio.yaml => raycluster_example.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename contrib/ray/{raycluster_istio.yaml => raycluster_example.yaml} (100%) diff --git a/contrib/ray/raycluster_istio.yaml b/contrib/ray/raycluster_example.yaml similarity index 100% rename from contrib/ray/raycluster_istio.yaml rename to contrib/ray/raycluster_example.yaml From bfafe2aa32fa8f0777247ebbf723a23a70c83fd3 Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Thu, 22 Aug 2024 21:23:58 +0530 Subject: [PATCH 03/24] made changes in in raycluster test Signed-off-by: hansinikarunarathne --- contrib/ray/raycluster_example.yaml | 2 +- contrib/ray/raycluster_istio_headless_svc.yaml | 2 +- contrib/ray/test.sh | 6 ++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/contrib/ray/raycluster_example.yaml b/contrib/ray/raycluster_example.yaml index 308aa24362..dffe810cca 100644 --- a/contrib/ray/raycluster_example.yaml +++ b/contrib/ray/raycluster_example.yaml @@ -1,7 +1,7 @@ apiVersion: ray.io/v1 kind: RayCluster metadata: - name: raycluster-istio + name: kubeflow-raycluster spec: rayVersion: '2.23.0' enableInTreeAutoscaling: true diff --git a/contrib/ray/raycluster_istio_headless_svc.yaml b/contrib/ray/raycluster_istio_headless_svc.yaml index 2e73971691..c2ef229dbf 100644 --- a/contrib/ray/raycluster_istio_headless_svc.yaml +++ b/contrib/ray/raycluster_istio_headless_svc.yaml @@ -8,7 +8,7 @@ metadata: spec: clusterIP: None selector: - ray.io/cluster: raycluster-istio + ray.io/cluster: kubeflow-raycluster publishNotReadyAddresses: true ports: - name: node-manager-port diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index bca7f3b376..a676b8136d 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -34,6 +34,9 @@ function trap_handler { trap trap_handler EXIT +# Install Istio +./tests/gh-actions/install_istio.sh + # Install KubeRay operator kustomize build kuberay-operator/overlays/standalone | kubectl -n $NAMESPACE 
apply --server-side -f - @@ -41,6 +44,9 @@ kustomize build kuberay-operator/overlays/standalone | kubectl -n $NAMESPACE app kubectl -n $NAMESPACE wait --for=condition=available --timeout=600s deploy/kuberay-operator kubectl -n $NAMESPACE get pod -l app.kubernetes.io/component=kuberay-operator +# Create a RayCluster Headless serivice +kubectl -n $NAMESPACE apply -f raycluster_istio_headless_svc.yaml + # Create a RayCluster custom resource. kubectl -n $NAMESPACE apply -f raycluster_example.yaml From adc3aec7454ae9941add6ae7c51b0e1f6fd6e701 Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Thu, 22 Aug 2024 21:27:39 +0530 Subject: [PATCH 04/24] made a fix Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index a676b8136d..f43a36b3fb 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -34,9 +34,6 @@ function trap_handler { trap trap_handler EXIT -# Install Istio -./tests/gh-actions/install_istio.sh - # Install KubeRay operator kustomize build kuberay-operator/overlays/standalone | kubectl -n $NAMESPACE apply --server-side -f - From c16a65ae1a8d266d0abf80702c02a5a615bda346 Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Thu, 22 Aug 2024 21:31:46 +0530 Subject: [PATCH 05/24] Changed the namespace default to kubeflow in the headless service of raycluster Signed-off-by: hansinikarunarathne --- contrib/ray/raycluster_istio_headless_svc.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/ray/raycluster_istio_headless_svc.yaml b/contrib/ray/raycluster_istio_headless_svc.yaml index c2ef229dbf..61b2c4103b 100644 --- a/contrib/ray/raycluster_istio_headless_svc.yaml +++ b/contrib/ray/raycluster_istio_headless_svc.yaml @@ -4,7 +4,7 @@ metadata: labels: ray.io/headless-worker-svc: raycluster-istio name: raycluster-istio-headless-svc - namespace: default + namespace: kubeflow spec: clusterIP: None selector: From 
08dee52b1d7c127639c8b03644136481a6b858ae Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Thu, 22 Aug 2024 22:19:18 +0530 Subject: [PATCH 06/24] add additional step Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index f43a36b3fb..16c75d534e 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -47,6 +47,8 @@ kubectl -n $NAMESPACE apply -f raycluster_istio_headless_svc.yaml # Create a RayCluster custom resource. kubectl -n $NAMESPACE apply -f raycluster_example.yaml +kubectl get pods -n $NAMESPACE --watch + # Wait for the RayCluster to be ready. sleep 5 kubectl -n $NAMESPACE wait --for=condition=ready pod -l ray.io/cluster=kubeflow-raycluster --timeout=900s From 6d20065dacb47a4820f352cc9bd34f05c37fd575 Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Thu, 22 Aug 2024 22:53:40 +0530 Subject: [PATCH 07/24] enable istio in kubelow namesapce Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index 16c75d534e..7b6c113ae0 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -37,6 +37,8 @@ trap trap_handler EXIT # Install KubeRay operator kustomize build kuberay-operator/overlays/standalone | kubectl -n $NAMESPACE apply --server-side -f - +kubectl label namespace kubeflow istio-injection=enabled + # Wait for the operator to be ready. kubectl -n $NAMESPACE wait --for=condition=available --timeout=600s deploy/kuberay-operator kubectl -n $NAMESPACE get pod -l app.kubernetes.io/component=kuberay-operator @@ -47,8 +49,6 @@ kubectl -n $NAMESPACE apply -f raycluster_istio_headless_svc.yaml # Create a RayCluster custom resource. kubectl -n $NAMESPACE apply -f raycluster_example.yaml -kubectl get pods -n $NAMESPACE --watch - # Wait for the RayCluster to be ready. 
sleep 5 kubectl -n $NAMESPACE wait --for=condition=ready pod -l ray.io/cluster=kubeflow-raycluster --timeout=900s From 95b3728a9117049a7948e344cd15e7325cafda77 Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Thu, 22 Aug 2024 23:01:36 +0530 Subject: [PATCH 08/24] Chnage namespace to install istio Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index 7b6c113ae0..31cc43a99a 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -34,6 +34,32 @@ function trap_handler { trap trap_handler EXIT +# Download Istioctl and its manifests. +export ISTIO_VERSION=1.21.1 +curl -L https://istio.io/downloadIstio | sh - +cd istio-1.21.1 +export PATH=$PWD/bin:$PATH + +# Install Istio with: +# 1. 100% trace sampling for demo purposes. +# 2. "sanitize_te" disabled for proper gRPC interception. This is required by Istio 1.21.0 (https://github.com/istio/istio/issues/49685). +# 3. TLS 1.3 enabled. 
+istioctl install -y -f - < Date: Thu, 22 Aug 2024 23:05:41 +0530 Subject: [PATCH 09/24] fix a issue Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index 31cc43a99a..6aef176803 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -60,6 +60,8 @@ spec: minProtocolVersion: TLSV1_3 EOF +cd + # Install KubeRay operator kustomize build kuberay-operator/overlays/standalone | kubectl -n $NAMESPACE apply --server-side -f - From eb75babe9c8a9ee34bac3aac7a035e510455f4b2 Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Thu, 22 Aug 2024 23:10:30 +0530 Subject: [PATCH 10/24] fix a issue Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 52 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index 6aef176803..d4f1b298a9 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -35,32 +35,32 @@ function trap_handler { trap trap_handler EXIT # Download Istioctl and its manifests. -export ISTIO_VERSION=1.21.1 -curl -L https://istio.io/downloadIstio | sh - -cd istio-1.21.1 -export PATH=$PWD/bin:$PATH - -# Install Istio with: -# 1. 100% trace sampling for demo purposes. -# 2. "sanitize_te" disabled for proper gRPC interception. This is required by Istio 1.21.0 (https://github.com/istio/istio/issues/49685). -# 3. TLS 1.3 enabled. -istioctl install -y -f - < Date: Thu, 22 Aug 2024 23:21:00 +0530 Subject: [PATCH 11/24] fix folder structure issue Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 52 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index d4f1b298a9..4dbb395d28 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -35,32 +35,32 @@ function trap_handler { trap trap_handler EXIT # Download Istioctl and its manifests. 
-# export ISTIO_VERSION=1.21.1 -# curl -L https://istio.io/downloadIstio | sh - -# cd istio-1.21.1 -# export PATH=$PWD/bin:$PATH - -# # Install Istio with: -# # 1. 100% trace sampling for demo purposes. -# # 2. "sanitize_te" disabled for proper gRPC interception. This is required by Istio 1.21.0 (https://github.com/istio/istio/issues/49685). -# # 3. TLS 1.3 enabled. -# istioctl install -y -f - < Date: Thu, 22 Aug 2024 23:27:42 +0530 Subject: [PATCH 12/24] enable istion labele to kubeflow Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index 4dbb395d28..de543cdae3 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -62,11 +62,14 @@ EOF cd .. +kubectl label namespace kubeflow istio-injection=enabled + +kubectl get namespaces --selector=istio-injection=enabled + + # Install KubeRay operator kustomize build kuberay-operator/overlays/standalone | kubectl -n $NAMESPACE apply --server-side -f - -kubectl label namespace kubeflow istio-injection=enabled - # Wait for the operator to be ready. kubectl -n $NAMESPACE wait --for=condition=available --timeout=600s deploy/kuberay-operator kubectl -n $NAMESPACE get pod -l app.kubernetes.io/component=kuberay-operator From 75be977486bbeecb067b666f5e92d8fe4e0e787f Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Thu, 22 Aug 2024 23:37:43 +0530 Subject: [PATCH 13/24] create kubelow namespace Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index de543cdae3..a2dbdf490e 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -40,6 +40,8 @@ curl -L https://istio.io/downloadIstio | sh - cd istio-1.21.1 export PATH=$PWD/bin:$PATH +kubectl create namespace $NAMESPACE + # Install Istio with: # 1. 100% trace sampling for demo purposes. # 2. "sanitize_te" disabled for proper gRPC interception. 
This is required by Istio 1.21.0 (https://github.com/istio/istio/issues/49685). From 83b829d03f6c2b5a7f3448c60b4c4a4f2e83b93d Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Thu, 22 Aug 2024 23:55:06 +0530 Subject: [PATCH 14/24] add pod watch Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index a2dbdf490e..c18e7aac8f 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -82,6 +82,8 @@ kubectl -n $NAMESPACE apply -f raycluster_istio_headless_svc.yaml # Create a RayCluster custom resource. kubectl -n $NAMESPACE apply -f raycluster_example.yaml +kubectl get pods -n $NAMESPACE --watch + # Wait for the RayCluster to be ready. sleep 5 kubectl -n $NAMESPACE wait --for=condition=ready pod -l ray.io/cluster=kubeflow-raycluster --timeout=900s From 29ca6bd5c3c21f6348cd00b64926e6caaeb04b99 Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Fri, 23 Aug 2024 00:43:10 +0530 Subject: [PATCH 15/24] remove watch Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index c18e7aac8f..a2dbdf490e 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -82,8 +82,6 @@ kubectl -n $NAMESPACE apply -f raycluster_istio_headless_svc.yaml # Create a RayCluster custom resource. kubectl -n $NAMESPACE apply -f raycluster_example.yaml -kubectl get pods -n $NAMESPACE --watch - # Wait for the RayCluster to be ready. 
sleep 5 kubectl -n $NAMESPACE wait --for=condition=ready pod -l ray.io/cluster=kubeflow-raycluster --timeout=900s From d4060285b3e6f4ad2ac7b55a070eac88d0513ecc Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Sun, 8 Sep 2024 12:42:21 +0530 Subject: [PATCH 16/24] Add istio authorization policy for ray_cluster Signed-off-by: hansinikarunarathne --- .github/workflows/ray_test.yaml | 3 + .../overlays/standalone/kustomization.yaml | 1 - .../overlays/standalone/namespace.yaml | 4 - contrib/ray/raycluster_example.yaml | 5 +- contrib/ray/raycluster_istio_auth_policy.yaml | 8 ++ contrib/ray/test.sh | 97 ++++++++----------- 6 files changed, 53 insertions(+), 65 deletions(-) delete mode 100644 contrib/ray/kuberay-operator/overlays/standalone/namespace.yaml create mode 100644 contrib/ray/raycluster_istio_auth_policy.yaml diff --git a/.github/workflows/ray_test.yaml b/.github/workflows/ray_test.yaml index 20e3870153..0d878927ac 100644 --- a/.github/workflows/ray_test.yaml +++ b/.github/workflows/ray_test.yaml @@ -16,6 +16,9 @@ jobs: - name: Install KinD, Create KinD cluster and Install kustomize run: ./tests/gh-actions/install_KinD_create_KinD_cluster_install_kustomize.sh + - name: Install Istio + run: ./tests/gh-actions/install_istio.sh + - name: Build & Apply manifests run: | cd contrib/ray/ diff --git a/contrib/ray/kuberay-operator/overlays/standalone/kustomization.yaml b/contrib/ray/kuberay-operator/overlays/standalone/kustomization.yaml index fb5cfeb375..5520f1897a 100644 --- a/contrib/ray/kuberay-operator/overlays/standalone/kustomization.yaml +++ b/contrib/ray/kuberay-operator/overlays/standalone/kustomization.yaml @@ -1,4 +1,3 @@ namespace: kubeflow resources: - ../../base -- namespace.yaml diff --git a/contrib/ray/kuberay-operator/overlays/standalone/namespace.yaml b/contrib/ray/kuberay-operator/overlays/standalone/namespace.yaml deleted file mode 100644 index 7a940e4673..0000000000 --- a/contrib/ray/kuberay-operator/overlays/standalone/namespace.yaml +++ 
/dev/null @@ -1,4 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: kubeflow diff --git a/contrib/ray/raycluster_example.yaml b/contrib/ray/raycluster_example.yaml index dffe810cca..59ee42a262 100644 --- a/contrib/ray/raycluster_example.yaml +++ b/contrib/ray/raycluster_example.yaml @@ -2,6 +2,7 @@ apiVersion: ray.io/v1 kind: RayCluster metadata: name: kubeflow-raycluster + namespace: kubeflow spec: rayVersion: '2.23.0' enableInTreeAutoscaling: true @@ -18,7 +19,7 @@ spec: dashboard-agent-listen-port: '52365' metrics-export-port: '8080' max-worker-port: '10012' - node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.default.svc.cluster.local + node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.kubeflow.svc.cluster.local template: spec: containers: @@ -66,7 +67,7 @@ spec: dashboard-agent-listen-port: '52365' metrics-export-port: '8080' max-worker-port: '10012' - node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.default.svc.cluster.local + node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.kubeflow.svc.cluster.local template: spec: containers: diff --git a/contrib/ray/raycluster_istio_auth_policy.yaml b/contrib/ray/raycluster_istio_auth_policy.yaml new file mode 100644 index 0000000000..0798753cd9 --- /dev/null +++ b/contrib/ray/raycluster_istio_auth_policy.yaml @@ -0,0 +1,8 @@ +apiVersion: security.istio.io/v1 +kind: AuthorizationPolicy +metadata: + name: allow-all + namespace: kubeflow +spec: + rules: + - {} diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index a2dbdf490e..5f6dcdc231 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -7,62 +7,40 @@ TIMEOUT=120 # timeout in seconds SLEEP_INTERVAL=30 # interval between checks in seconds RAY_VERSION=2.23.0 -function trap_handler { - kill $PID - # Delete RayCluster - kubectl -n $NAMESPACE delete -f raycluster_example.yaml - - # Wait for 
all Ray Pods to be deleted. - start_time=$(date +%s) - while true; do - pods=$(kubectl -n $NAMESPACE get pods -o json | jq '.items | length') - if [ "$pods" -eq 1 ]; then - break - fi - current_time=$(date +%s) - elapsed_time=$((current_time - start_time)) - if [ "$elapsed_time" -ge "$TIMEOUT" ]; then - echo "Timeout exceeded. Exiting loop." - exit 1 - fi - sleep $SLEEP_INTERVAL - done - - # Delete KubeRay operator - kustomize build kuberay-operator/base | kubectl -n $NAMESPACE delete -f - -} - -trap trap_handler EXIT - -# Download Istioctl and its manifests. -export ISTIO_VERSION=1.21.1 -curl -L https://istio.io/downloadIstio | sh - -cd istio-1.21.1 -export PATH=$PWD/bin:$PATH - -kubectl create namespace $NAMESPACE - -# Install Istio with: -# 1. 100% trace sampling for demo purposes. -# 2. "sanitize_te" disabled for proper gRPC interception. This is required by Istio 1.21.0 (https://github.com/istio/istio/issues/49685). -# 3. TLS 1.3 enabled. -istioctl install -y -f - < /dev/null 2>&1; then + kubectl create namespace $NAMESPACE +fi kubectl label namespace kubeflow istio-injection=enabled @@ -77,10 +55,13 @@ kubectl -n $NAMESPACE wait --for=condition=available --timeout=600s deploy/kuber kubectl -n $NAMESPACE get pod -l app.kubernetes.io/component=kuberay-operator # Create a RayCluster Headless serivice -kubectl -n $NAMESPACE apply -f raycluster_istio_headless_svc.yaml +kubectl apply -f raycluster_istio_headless_svc.yaml + +# Create a RayCluster AuthorizationPolicy +kubectl apply -f raycluster_istio_auth_policy.yaml # Create a RayCluster custom resource. -kubectl -n $NAMESPACE apply -f raycluster_example.yaml +kubectl apply -f raycluster_example.yaml # Wait for the RayCluster to be ready. 
sleep 5 From aac18edcc06a82f5ea938aa8fcc36c8970a38af7 Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Sun, 8 Sep 2024 13:11:22 +0530 Subject: [PATCH 17/24] fix uncommented changes Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 56 ++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index 5f6dcdc231..0590ae14a2 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -7,35 +7,33 @@ TIMEOUT=120 # timeout in seconds SLEEP_INTERVAL=30 # interval between checks in seconds RAY_VERSION=2.23.0 -# function trap_handler { -# kill $PID -# # Delete RayCluster -# kubectl -n $NAMESPACE delete -f raycluster_example.yaml - -# # Wait for all Ray Pods to be deleted. -# start_time=$(date +%s) -# while true; do -# pods=$(kubectl -n $NAMESPACE get pods -o json | jq '.items | length') -# if [ "$pods" -eq 1 ]; then -# break -# fi -# current_time=$(date +%s) -# elapsed_time=$((current_time - start_time)) -# if [ "$elapsed_time" -ge "$TIMEOUT" ]; then -# echo "Timeout exceeded. Exiting loop." -# exit 1 -# fi -# sleep $SLEEP_INTERVAL -# done - -# # Delete KubeRay operator -# kustomize build kuberay-operator/base | kubectl -n $NAMESPACE delete -f - - -# } - -# trap trap_handler EXIT - -# kubectl create namespace $NAMESPACE +function trap_handler { + kill $PID + # Delete RayCluster + kubectl -n $NAMESPACE delete -f raycluster_example.yaml + + # Wait for all Ray Pods to be deleted. + start_time=$(date +%s) + while true; do + pods=$(kubectl -n $NAMESPACE get pods -o json | jq '.items | length') + if [ "$pods" -eq 1 ]; then + break + fi + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + if [ "$elapsed_time" -ge "$TIMEOUT" ]; then + echo "Timeout exceeded. Exiting loop." 
+ exit 1 + fi + sleep $SLEEP_INTERVAL + done + + # Delete KubeRay operator + kustomize build kuberay-operator/base | kubectl -n $NAMESPACE delete -f - + +} + +trap trap_handler EXIT # Check if namespace exists, if not, create it if ! kubectl get namespace $NAMESPACE > /dev/null 2>&1; then From 32d02bcf1d3d97ca0b579ecb34861813e883c121 Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Mon, 9 Sep 2024 15:58:57 +0530 Subject: [PATCH 18/24] made namespace change Signed-off-by: hansinikarunarathne --- .../overlays/standalone/kustomization.yaml | 1 + .../overlays/standalone/namespace.yaml | 6 ++++++ contrib/ray/raycluster_example.yaml | 1 - contrib/ray/raycluster_istio_headless_svc.yaml | 1 - contrib/ray/test.sh | 11 ++--------- 5 files changed, 9 insertions(+), 11 deletions(-) create mode 100644 contrib/ray/kuberay-operator/overlays/standalone/namespace.yaml diff --git a/contrib/ray/kuberay-operator/overlays/standalone/kustomization.yaml b/contrib/ray/kuberay-operator/overlays/standalone/kustomization.yaml index 5520f1897a..fb5cfeb375 100644 --- a/contrib/ray/kuberay-operator/overlays/standalone/kustomization.yaml +++ b/contrib/ray/kuberay-operator/overlays/standalone/kustomization.yaml @@ -1,3 +1,4 @@ namespace: kubeflow resources: - ../../base +- namespace.yaml diff --git a/contrib/ray/kuberay-operator/overlays/standalone/namespace.yaml b/contrib/ray/kuberay-operator/overlays/standalone/namespace.yaml new file mode 100644 index 0000000000..56e1932179 --- /dev/null +++ b/contrib/ray/kuberay-operator/overlays/standalone/namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: kubeflow + labels: + istio-injection: enabled \ No newline at end of file diff --git a/contrib/ray/raycluster_example.yaml b/contrib/ray/raycluster_example.yaml index 59ee42a262..57fbd4fad5 100644 --- a/contrib/ray/raycluster_example.yaml +++ b/contrib/ray/raycluster_example.yaml @@ -2,7 +2,6 @@ apiVersion: ray.io/v1 kind: RayCluster metadata: name: 
kubeflow-raycluster - namespace: kubeflow spec: rayVersion: '2.23.0' enableInTreeAutoscaling: true diff --git a/contrib/ray/raycluster_istio_headless_svc.yaml b/contrib/ray/raycluster_istio_headless_svc.yaml index 61b2c4103b..fe89f81a73 100644 --- a/contrib/ray/raycluster_istio_headless_svc.yaml +++ b/contrib/ray/raycluster_istio_headless_svc.yaml @@ -4,7 +4,6 @@ metadata: labels: ray.io/headless-worker-svc: raycluster-istio name: raycluster-istio-headless-svc - namespace: kubeflow spec: clusterIP: None selector: diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index 0590ae14a2..d8bd955bb9 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -35,13 +35,6 @@ function trap_handler { trap trap_handler EXIT -# Check if namespace exists, if not, create it -if ! kubectl get namespace $NAMESPACE > /dev/null 2>&1; then - kubectl create namespace $NAMESPACE -fi - -kubectl label namespace kubeflow istio-injection=enabled - kubectl get namespaces --selector=istio-injection=enabled @@ -53,13 +46,13 @@ kubectl -n $NAMESPACE wait --for=condition=available --timeout=600s deploy/kuber kubectl -n $NAMESPACE get pod -l app.kubernetes.io/component=kuberay-operator # Create a RayCluster Headless serivice -kubectl apply -f raycluster_istio_headless_svc.yaml +kubectl -n $NAMESPACE apply -f raycluster_istio_headless_svc.yaml # Create a RayCluster AuthorizationPolicy kubectl apply -f raycluster_istio_auth_policy.yaml # Create a RayCluster custom resource. -kubectl apply -f raycluster_example.yaml +kubectl -n $NAMESPACE apply -f raycluster_example.yaml # Wait for the RayCluster to be ready. 
sleep 5 From 98a70d88d405973fca16e25bbf816ce3149e644d Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Tue, 10 Sep 2024 00:36:54 +0530 Subject: [PATCH 19/24] Added user namespace and multitenancy Signed-off-by: hansinikarunarathne --- .github/workflows/ray_test.yaml | 17 +++- contrib/ray/Makefile | 3 +- .../overlays/standalone/kustomization.yaml | 3 +- .../overlays/standalone/namespace.yaml | 6 -- contrib/ray/raycluster_example.yaml | 92 ++++++++++++++++--- contrib/ray/raycluster_istio_auth_policy.yaml | 8 -- .../ray/raycluster_istio_headless_svc.yaml | 63 ------------- contrib/ray/test.sh | 24 ++--- 8 files changed, 107 insertions(+), 109 deletions(-) delete mode 100644 contrib/ray/kuberay-operator/overlays/standalone/namespace.yaml delete mode 100644 contrib/ray/raycluster_istio_auth_policy.yaml delete mode 100644 contrib/ray/raycluster_istio_headless_svc.yaml diff --git a/.github/workflows/ray_test.yaml b/.github/workflows/ray_test.yaml index 0d878927ac..5c3eb72c0b 100644 --- a/.github/workflows/ray_test.yaml +++ b/.github/workflows/ray_test.yaml @@ -16,10 +16,23 @@ jobs: - name: Install KinD, Create KinD cluster and Install kustomize run: ./tests/gh-actions/install_KinD_create_KinD_cluster_install_kustomize.sh - - name: Install Istio - run: ./tests/gh-actions/install_istio.sh + - name: Install Istio with external authentication + run: ./tests/gh-actions/install_istio_with_ext_auth.sh + + - name: Install cert-manager + run: ./tests/gh-actions/install_cert_manager.sh + + - name: Create kubeflow namespace + run: kustomize build common/kubeflow-namespace/base | kubectl apply -f - + + - name: Install KF Multi Tenancy + run: ./tests/gh-actions/install_multi_tenancy.sh + + - name: Create KF Profile + run: kustomize build common/user-namespace/base | kubectl apply -f - - name: Build & Apply manifests run: | cd contrib/ray/ + export KF_PROFILE=kubeflow-user-example-com make test diff --git a/contrib/ray/Makefile b/contrib/ray/Makefile index 
1480d0d327..bfbcd1deeb 100644 --- a/contrib/ray/Makefile +++ b/contrib/ray/Makefile @@ -8,5 +8,4 @@ kuberay-operator/base: .PHONY: test test: - ./test.sh - + ./test.sh ${KF_PROFILE} \ No newline at end of file diff --git a/contrib/ray/kuberay-operator/overlays/standalone/kustomization.yaml b/contrib/ray/kuberay-operator/overlays/standalone/kustomization.yaml index fb5cfeb375..9637061216 100644 --- a/contrib/ray/kuberay-operator/overlays/standalone/kustomization.yaml +++ b/contrib/ray/kuberay-operator/overlays/standalone/kustomization.yaml @@ -1,4 +1,3 @@ namespace: kubeflow resources: -- ../../base -- namespace.yaml +- ../../base \ No newline at end of file diff --git a/contrib/ray/kuberay-operator/overlays/standalone/namespace.yaml b/contrib/ray/kuberay-operator/overlays/standalone/namespace.yaml deleted file mode 100644 index 56e1932179..0000000000 --- a/contrib/ray/kuberay-operator/overlays/standalone/namespace.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: kubeflow - labels: - istio-injection: enabled \ No newline at end of file diff --git a/contrib/ray/raycluster_example.yaml b/contrib/ray/raycluster_example.yaml index 57fbd4fad5..268edfbe18 100644 --- a/contrib/ray/raycluster_example.yaml +++ b/contrib/ray/raycluster_example.yaml @@ -1,3 +1,79 @@ +apiVersion: security.istio.io/v1beta1 +kind: AuthorizationPolicy +metadata: + name: allow-ray-workers-head +spec: + action: ALLOW + rules: + - to: + - operation: + ports: + - "6379" +--- +apiVersion: v1 +kind: Service +metadata: + labels: + ray.io/headless-worker-svc: raycluster-istio + name: raycluster-istio-headless-svc +spec: + clusterIP: None + selector: + ray.io/cluster: kubeflow-raycluster + publishNotReadyAddresses: true + ports: + - name: node-manager-port + port: 6380 + appProtocol: grpc + - name: object-manager-port + port: 6381 + appProtocol: grpc + - name: runtime-env-agent-port + port: 6382 + appProtocol: grpc + - name: dashboard-agent-grpc-port + port: 6383 + 
appProtocol: grpc + - name: dashboard-agent-listen-port + port: 52365 + appProtocol: http + - name: metrics-export-port + port: 8080 + appProtocol: http + - name: p10002 + port: 10002 + appProtocol: grpc + - name: p10003 + port: 10003 + appProtocol: grpc + - name: p10004 + port: 10004 + appProtocol: grpc + - name: p10005 + port: 10005 + appProtocol: grpc + - name: p10006 + port: 10006 + appProtocol: grpc + - name: p10007 + port: 10007 + appProtocol: grpc + - name: p10008 + port: 10008 + appProtocol: grpc + - name: p10009 + port: 10009 + appProtocol: grpc + - name: p10010 + port: 10010 + appProtocol: grpc + - name: p10011 + port: 10011 + appProtocol: grpc + - name: p10012 + port: 10012 + appProtocol: grpc +--- apiVersion: ray.io/v1 kind: RayCluster metadata: @@ -18,7 +94,7 @@ spec: dashboard-agent-listen-port: '52365' metrics-export-port: '8080' max-worker-port: '10012' - node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.kubeflow.svc.cluster.local + node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.kubeflow-user-example-com.svc.cluster.local template: spec: containers: @@ -31,13 +107,9 @@ spec: volumeMounts: - mountPath: /tmp/ray name: ray-logs - # The resource requests and limits in this config are too small for production! - # It is better to use a few large Ray pod than many small ones. - # For production, it is ideal to size each Ray pod to take up the - # entire Kubernetes node on which it is scheduled. 
resources: limits: - cpu: "2" + cpu: "1" memory: "2G" requests: cpu: "100m" @@ -66,7 +138,7 @@ spec: dashboard-agent-listen-port: '52365' metrics-export-port: '8080' max-worker-port: '10012' - node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.kubeflow.svc.cluster.local + node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.kubeflow-user-example-com.svc.cluster.local template: spec: containers: @@ -81,13 +153,9 @@ spec: volumeMounts: - mountPath: /tmp/ray name: ray-logs - # The resource requests and limits in this config are too small for production! - # It is better to use a few large Ray pod than many small ones. - # For production, it is ideal to size each Ray pod to take up the - # entire Kubernetes node on which it is scheduled. resources: limits: - cpu: "2" + cpu: "1" memory: "1G" requests: cpu: "300m" diff --git a/contrib/ray/raycluster_istio_auth_policy.yaml b/contrib/ray/raycluster_istio_auth_policy.yaml deleted file mode 100644 index 0798753cd9..0000000000 --- a/contrib/ray/raycluster_istio_auth_policy.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: security.istio.io/v1 -kind: AuthorizationPolicy -metadata: - name: allow-all - namespace: kubeflow -spec: - rules: - - {} diff --git a/contrib/ray/raycluster_istio_headless_svc.yaml b/contrib/ray/raycluster_istio_headless_svc.yaml deleted file mode 100644 index fe89f81a73..0000000000 --- a/contrib/ray/raycluster_istio_headless_svc.yaml +++ /dev/null @@ -1,63 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - labels: - ray.io/headless-worker-svc: raycluster-istio - name: raycluster-istio-headless-svc -spec: - clusterIP: None - selector: - ray.io/cluster: kubeflow-raycluster - publishNotReadyAddresses: true - ports: - - name: node-manager-port - port: 6380 - appProtocol: grpc - - name: object-manager-port - port: 6381 - appProtocol: grpc - - name: runtime-env-agent-port - port: 6382 - appProtocol: grpc - - name: 
dashboard-agent-grpc-port - port: 6383 - appProtocol: grpc - - name: dashboard-agent-listen-port - port: 52365 - appProtocol: http - - name: metrics-export-port - port: 8080 - appProtocol: http - - name: p10002 - port: 10002 - appProtocol: grpc - - name: p10003 - port: 10003 - appProtocol: grpc - - name: p10004 - port: 10004 - appProtocol: grpc - - name: p10005 - port: 10005 - appProtocol: grpc - - name: p10006 - port: 10006 - appProtocol: grpc - - name: p10007 - port: 10007 - appProtocol: grpc - - name: p10008 - port: 10008 - appProtocol: grpc - - name: p10009 - port: 10009 - appProtocol: grpc - - name: p10010 - port: 10010 - appProtocol: grpc - - name: p10011 - port: 10011 - appProtocol: grpc - - name: p10012 - port: 10012 - appProtocol: grpc \ No newline at end of file diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index d8bd955bb9..5d8fbf4af5 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -2,13 +2,12 @@ set -euxo -NAMESPACE=kubeflow +NAMESPACE=$1 TIMEOUT=120 # timeout in seconds SLEEP_INTERVAL=30 # interval between checks in seconds RAY_VERSION=2.23.0 function trap_handler { - kill $PID # Delete RayCluster kubectl -n $NAMESPACE delete -f raycluster_example.yaml @@ -16,7 +15,8 @@ function trap_handler { start_time=$(date +%s) while true; do pods=$(kubectl -n $NAMESPACE get pods -o json | jq '.items | length') - if [ "$pods" -eq 1 ]; then + if [ "$pods" -eq 0 ]; then + kill $PID break fi current_time=$(date +%s) @@ -29,29 +29,25 @@ function trap_handler { done # Delete KubeRay operator - kustomize build kuberay-operator/base | kubectl -n $NAMESPACE delete -f - + kustomize build kuberay-operator/base | kubectl -n kubeflow delete -f - } trap trap_handler EXIT +kubectl label namespace $NAMESPACE istio-injection=enabled + kubectl get namespaces --selector=istio-injection=enabled # Install KubeRay operator -kustomize build kuberay-operator/overlays/standalone | kubectl -n $NAMESPACE apply --server-side -f - +kustomize build 
kuberay-operator/overlays/standalone | kubectl -n kubeflow apply --server-side -f - # Wait for the operator to be ready. -kubectl -n $NAMESPACE wait --for=condition=available --timeout=600s deploy/kuberay-operator -kubectl -n $NAMESPACE get pod -l app.kubernetes.io/component=kuberay-operator - -# Create a RayCluster Headless serivice -kubectl -n $NAMESPACE apply -f raycluster_istio_headless_svc.yaml - -# Create a RayCluster AuthorizationPolicy -kubectl apply -f raycluster_istio_auth_policy.yaml +kubectl -n kubeflow wait --for=condition=available --timeout=600s deploy/kuberay-operator +kubectl -n kubeflow get pod -l app.kubernetes.io/component=kuberay-operator -# Create a RayCluster custom resource. +# Install RayCluster components kubectl -n $NAMESPACE apply -f raycluster_example.yaml # Wait for the RayCluster to be ready. From 7b92d8d76f46f1128f9e1cf39c9af57228446335 Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Tue, 10 Sep 2024 01:30:01 +0530 Subject: [PATCH 20/24] fixed a issue with tets.sh Signed-off-by: hansinikarunarathne --- contrib/ray/test.sh | 57 +++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index 5d8fbf4af5..22730d6d6e 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -7,39 +7,17 @@ TIMEOUT=120 # timeout in seconds SLEEP_INTERVAL=30 # interval between checks in seconds RAY_VERSION=2.23.0 -function trap_handler { - # Delete RayCluster - kubectl -n $NAMESPACE delete -f raycluster_example.yaml - - # Wait for all Ray Pods to be deleted. - start_time=$(date +%s) - while true; do - pods=$(kubectl -n $NAMESPACE get pods -o json | jq '.items | length') - if [ "$pods" -eq 0 ]; then - kill $PID - break - fi - current_time=$(date +%s) - elapsed_time=$((current_time - start_time)) - if [ "$elapsed_time" -ge "$TIMEOUT" ]; then - echo "Timeout exceeded. Exiting loop." 
- exit 1 - fi - sleep $SLEEP_INTERVAL - done - - # Delete KubeRay operator - kustomize build kuberay-operator/base | kubectl -n kubeflow delete -f - - -} - -trap trap_handler EXIT +while [[ $(kubectl get namespace $NAMESPACE --no-headers 2>/dev/null | wc -l) -eq 0 ]]; do + echo "Waiting for namespace $NAMESPACE to be created..." + sleep 2 +done + +echo "Namespace $NAMESPACE is created!" kubectl label namespace $NAMESPACE istio-injection=enabled kubectl get namespaces --selector=istio-injection=enabled - # Install KubeRay operator kustomize build kuberay-operator/overlays/standalone | kubectl -n kubeflow apply --server-side -f - @@ -73,3 +51,26 @@ else echo "Test failed!" exit 1 fi + +# Delete RayCluster +kubectl -n $NAMESPACE delete -f raycluster_example.yaml + +# Wait for all Ray Pods to be deleted. +start_time=$(date +%s) +while true; do + pods=$(kubectl -n $NAMESPACE get pods -o json | jq '.items | length') + if [ "$pods" -eq 0 ]; then + kill $PID + break + fi + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + if [ "$elapsed_time" -ge "$TIMEOUT" ]; then + echo "Timeout exceeded. Exiting loop." 
+ exit 1 + fi + sleep $SLEEP_INTERVAL +done + +# Delete KubeRay operator +kustomize build kuberay-operator/base | kubectl -n kubeflow delete -f - \ No newline at end of file From 847d4537c58775cdf6c53243cd81afc34c52d11c Mon Sep 17 00:00:00 2001 From: hansinikarunarathne Date: Sat, 14 Sep 2024 21:05:18 +0530 Subject: [PATCH 21/24] Did the requested chnages Signed-off-by: hansinikarunarathne --- contrib/ray/raycluster_example.yaml | 49 +++++++++++------------------ contrib/ray/test.sh | 22 ++++++++++--- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/contrib/ray/raycluster_example.yaml b/contrib/ray/raycluster_example.yaml index 268edfbe18..0b64ff830e 100644 --- a/contrib/ray/raycluster_example.yaml +++ b/contrib/ray/raycluster_example.yaml @@ -5,10 +5,21 @@ metadata: spec: action: ALLOW rules: + - from: + - source: + principals: + - "cluster.local/ns/kubeflow-user-example-com/sa/kubeflow-raycluster" - to: - operation: ports: - "6379" + - "6380" + - "6381" + - "6382" + - "6383" + - "52365" + - "8080" + - "10012" --- apiVersion: v1 kind: Service @@ -40,37 +51,7 @@ spec: - name: metrics-export-port port: 8080 appProtocol: http - - name: p10002 - port: 10002 - appProtocol: grpc - - name: p10003 - port: 10003 - appProtocol: grpc - - name: p10004 - port: 10004 - appProtocol: grpc - - name: p10005 - port: 10005 - appProtocol: grpc - - name: p10006 - port: 10006 - appProtocol: grpc - - name: p10007 - port: 10007 - appProtocol: grpc - - name: p10008 - port: 10008 - appProtocol: grpc - - name: p10009 - port: 10009 - appProtocol: grpc - - name: p10010 - port: 10010 - appProtocol: grpc - - name: p10011 - port: 10011 - appProtocol: grpc - - name: p10012 + - name: max-worker-port port: 10012 appProtocol: grpc --- @@ -96,6 +77,9 @@ spec: max-worker-port: '10012' node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.kubeflow-user-example-com.svc.cluster.local template: + metadata: + labels: + sidecar.istio.io/inject: "true" spec: 
containers: - name: ray-head @@ -140,6 +124,9 @@ spec: max-worker-port: '10012' node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.kubeflow-user-example-com.svc.cluster.local template: + metadata: + labels: + sidecar.istio.io/inject: "true" spec: containers: - name: ray-worker diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index 22730d6d6e..450d31c7d7 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -7,9 +7,23 @@ TIMEOUT=120 # timeout in seconds SLEEP_INTERVAL=30 # interval between checks in seconds RAY_VERSION=2.23.0 -while [[ $(kubectl get namespace $NAMESPACE --no-headers 2>/dev/null | wc -l) -eq 0 ]]; do - echo "Waiting for namespace $NAMESPACE to be created..." - sleep 2 +start_time=$(date +%s) +for ((i=0; i/dev/null | wc -l) -eq 1 ]]; then + echo "Namespace $NAMESPACE created." + break + fi + + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + + if [ "$elapsed_time" -ge "$TIMEOUT" ]; then + echo "Timeout exceeded. Namespace $NAMESPACE not created." + exit 1 + fi + + echo "Waiting for namespace $NAMESPACE to be created..." + sleep 2 done echo "Namespace $NAMESPACE is created!" @@ -57,7 +71,7 @@ kubectl -n $NAMESPACE delete -f raycluster_example.yaml # Wait for all Ray Pods to be deleted. 
start_time=$(date +%s) -while true; do +for ((i=0; i Date: Sun, 15 Sep 2024 07:40:30 +0000 Subject: [PATCH 22/24] Changed the sa to default-editor Signed-off-by: Hansini Karunarathne <107214435+hansinikarunarathne@users.noreply.github.com> --- contrib/ray/raycluster_example.yaml | 2 +- contrib/ray/test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/ray/raycluster_example.yaml b/contrib/ray/raycluster_example.yaml index 0b64ff830e..baf226257e 100644 --- a/contrib/ray/raycluster_example.yaml +++ b/contrib/ray/raycluster_example.yaml @@ -8,7 +8,7 @@ spec: - from: - source: principals: - - "cluster.local/ns/kubeflow-user-example-com/sa/kubeflow-raycluster" + - "cluster.local/ns/kubeflow-user-example-com/sa/default-editor" - to: - operation: ports: diff --git a/contrib/ray/test.sh b/contrib/ray/test.sh index 450d31c7d7..71c8c5eaeb 100755 --- a/contrib/ray/test.sh +++ b/contrib/ray/test.sh @@ -26,7 +26,7 @@ for ((i=0; i Date: Sun, 15 Sep 2024 09:14:23 +0000 Subject: [PATCH 23/24] add sa to deployment Signed-off-by: Hansini Karunarathne <107214435+hansinikarunarathne@users.noreply.github.com> --- contrib/ray/raycluster_example.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/ray/raycluster_example.yaml b/contrib/ray/raycluster_example.yaml index baf226257e..5c7e5d3735 100644 --- a/contrib/ray/raycluster_example.yaml +++ b/contrib/ray/raycluster_example.yaml @@ -81,6 +81,7 @@ spec: labels: sidecar.istio.io/inject: "true" spec: + serviceAccountName: default-editor containers: - name: ray-head image: rayproject/ray:2.23.0-py311-cpu @@ -128,6 +129,7 @@ spec: labels: sidecar.istio.io/inject: "true" spec: + serviceAccountName: default-editor containers: - name: ray-worker image: rayproject/ray:2.23.0-py311-cpu From 4f875a37a53c922ff566ef22dcacc04899fd22f6 Mon Sep 17 00:00:00 2001 From: Hansini Karunarathne <107214435+hansinikarunarathne@users.noreply.github.com> Date: Fri, 20 Sep 2024 09:22:27 +0000 Subject: [PATCH 24/24] Update 
the raycluster Readme file

Signed-off-by: Hansini Karunarathne <107214435+hansinikarunarathne@users.noreply.github.com>
---
 contrib/ray/README.md | 24 +++++++++++++++++++++++-
 contrib/ray/raycluster_example.yaml | 3 +++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/contrib/ray/README.md b/contrib/ray/README.md
index d96a3707cf..3ae2e6f8c4 100644
--- a/contrib/ray/README.md
+++ b/contrib/ray/README.md
@@ -54,12 +54,31 @@ kubectl get pod -l app.kubernetes.io/component=kuberay-operator -n kubeflow
 # NAME READY STATUS RESTARTS AGE
 # kuberay-operator-5b8cd69758-rkpvh 1/1 Running 0 6m23s
 ```
+## Step 3: Create a namespace
+```sh
+# Create a namespace, e.g. "development"
+kubectl create ns development
+
+# Enable istio-injection for the namespace
+kubectl label namespace development istio-injection=enabled
+
+# After creating the namespace, you have to make the changes mentioned below in the raycluster_example.yaml file (the required changes are also marked as comments in the yaml file itself)
+
+# 01. Replace the namespace of the AuthorizationPolicy principal
+
+  principals:
+  - "cluster.local/ns/development/sa/default-editor"
+
+# 02. Replace the namespace of the node-ip-address of headGroupSpec and workerGroupSpec
+
+  node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.development.svc.cluster.local
+```
 ## Step 3: Install RayCluster
 ```sh
 # Create a RayCluster CR, and the KubeRay operator will reconcile a Ray cluster
 # with 1 head Pod and 1 worker Pod.
-# $MY_KUBEFLOW_USER_NAMESPACE is a proper Kubeflow user namespace with istio sidecar injection and never ever the wrong "default"
+# $MY_KUBEFLOW_USER_NAMESPACE is the namespace that has been created in the above step.
export MY_KUBEFLOW_USER_NAMESPACE=development kubectl apply -f raycluster_example.yaml -n $MY_KUBEFLOW_USER_NAMESPACE @@ -68,6 +87,9 @@ kubectl get pod -l ray.io/cluster=kubeflow-raycluster -n $MY_KUBEFLOW_USER_NAMES # NAME READY STATUS RESTARTS AGE # kubeflow-raycluster-head-p6dpk 1/1 Running 0 70s # kubeflow-raycluster-worker-small-group-l7j6c 1/1 Running 0 70s + +#Check Raycluster headless service +kubectl get svc -n $MY_KUBEFLOW_USER_NAMESPACE ``` * `raycluster_example.yaml` uses `rayproject/ray:2.23.0-py311-cpu` as its OCI image. Ray is very sensitive to the Python versions and Ray versions between the server (RayCluster) and client (JupyterLab) sides. This image uses: * Python 3.11 diff --git a/contrib/ray/raycluster_example.yaml b/contrib/ray/raycluster_example.yaml index 5c7e5d3735..5cee01bc90 100644 --- a/contrib/ray/raycluster_example.yaml +++ b/contrib/ray/raycluster_example.yaml @@ -8,6 +8,7 @@ spec: - from: - source: principals: + # kubeflow-user-example-com should be replaced with the namespace where the Ray cluster is being deployed - "cluster.local/ns/kubeflow-user-example-com/sa/default-editor" - to: - operation: @@ -75,6 +76,7 @@ spec: dashboard-agent-listen-port: '52365' metrics-export-port: '8080' max-worker-port: '10012' + # kubeflow-user-example-com should be replaced with the namespace where the Ray cluster is being deployed node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.kubeflow-user-example-com.svc.cluster.local template: metadata: @@ -123,6 +125,7 @@ spec: dashboard-agent-listen-port: '52365' metrics-export-port: '8080' max-worker-port: '10012' + # kubeflow-user-example-com should be replaced with the namespace where the Ray cluster is being deployed node-ip-address: $(hostname -I | tr -d ' ' | sed 's/\./-/g').raycluster-istio-headless-svc.kubeflow-user-example-com.svc.cluster.local template: metadata: