From 09d25127aa6a4825d772c444e74b08020497943d Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 11 Aug 2023 11:29:43 +0100 Subject: [PATCH 01/24] Add Known Issues heading to start documenting these --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2edf8a0..11fe8b8 100644 --- a/README.md +++ b/README.md @@ -171,3 +171,5 @@ and then restart the other dependent deployments to propagate changes: ```console kubectl rollout restart deployment slurmd slurmctld login slurmdbd ``` + +# Known Issues From 9979627bbe7c4a5f993a23ce5ca3ba7aacf17f21 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 11 Aug 2023 12:24:59 +0100 Subject: [PATCH 02/24] Convert Rook NFS to Helm chart - Adds Rook NFS Helm chart as dependency of Slurm cluster chart - Refactors main values file to allow additional customisation - Adds cleanup job as pre-delete hook to fix uninstall behaviour --- .gitignore | 3 + nfs/deploy-nfs.sh | 11 ---- nfs/pvc.yaml | 11 ---- nfs/sc.yaml | 13 ----- nfs/teardown-nfs.sh | 16 ------ rooknfs/Chart.yaml | 4 ++ rooknfs/README.md | 0 {nfs => rooknfs/crds}/crds.yaml | 0 {nfs => rooknfs/templates}/nfs.yaml | 18 +++--- {nfs => rooknfs/templates}/operator.yaml | 12 ++-- {nfs => rooknfs/templates}/rbac.yaml | 10 ++-- rooknfs/templates/sc.yaml | 17 ++++++ rooknfs/values.yaml | 30 ++++++++++ slurm-cluster-chart/Chart.yaml | 7 ++- .../templates/hooks/pre-delete.yaml | 55 +++++++++++++++++++ .../{login-deployment.yaml => login.yaml} | 8 +-- slurm-cluster-chart/templates/pvc.yaml | 14 +++++ ...rmctld-statefulset.yaml => slurmctld.yaml} | 6 +- .../{slurmd-deployment.yaml => slurmd.yaml} | 9 +-- slurm-cluster-chart/values.yaml | 50 +++++++++++++++-- 20 files changed, 211 insertions(+), 83 deletions(-) create mode 100644 .gitignore delete mode 100755 nfs/deploy-nfs.sh delete mode 100644 nfs/pvc.yaml delete mode 100644 nfs/sc.yaml delete mode 100755 nfs/teardown-nfs.sh create mode 100644 rooknfs/Chart.yaml create mode 100644 
rooknfs/README.md rename {nfs => rooknfs/crds}/crds.yaml (100%) rename {nfs => rooknfs/templates}/nfs.yaml (61%) rename {nfs => rooknfs/templates}/operator.yaml (91%) rename {nfs => rooknfs/templates}/rbac.yaml (88%) create mode 100644 rooknfs/templates/sc.yaml create mode 100644 rooknfs/values.yaml create mode 100644 slurm-cluster-chart/templates/hooks/pre-delete.yaml rename slurm-cluster-chart/templates/{login-deployment.yaml => login.yaml} (90%) create mode 100644 slurm-cluster-chart/templates/pvc.yaml rename slurm-cluster-chart/templates/{slurmctld-statefulset.yaml => slurmctld.yaml} (91%) rename slurm-cluster-chart/templates/{slurmd-deployment.yaml => slurmd.yaml} (88%) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0ba5327 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Build artifacts from local helm install +slurm-cluster-chart/Chart.lock +slurm-cluster-chart/charts/ diff --git a/nfs/deploy-nfs.sh b/nfs/deploy-nfs.sh deleted file mode 100755 index b2d2f75..0000000 --- a/nfs/deploy-nfs.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -# Based on https://rook.io/docs/nfs/v1.7/quickstart.html -# Manifests listed explicitly here to guarantee ordering - -kubectl create -f nfs/crds.yaml -kubectl create -f nfs/operator.yaml -kubectl create -f nfs/rbac.yaml -kubectl create -f nfs/nfs.yaml -kubectl create -f nfs/sc.yaml -kubectl create -f nfs/pvc.yaml diff --git a/nfs/pvc.yaml b/nfs/pvc.yaml deleted file mode 100644 index 7f0a3d7..0000000 --- a/nfs/pvc.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: rook-nfs-pv-claim -spec: - storageClassName: "rook-nfs-share1" - accessModes: - - ReadWriteMany - resources: - requests: - storage: 10Gi diff --git a/nfs/sc.yaml b/nfs/sc.yaml deleted file mode 100644 index 6f9e3ae..0000000 --- a/nfs/sc.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - labels: - app: rook-nfs - name: rook-nfs-share1 
-parameters: - exportName: share1 - nfsServerName: rook-nfs - nfsServerNamespace: rook-nfs -provisioner: nfs.rook.io/rook-nfs-provisioner -reclaimPolicy: Delete -volumeBindingMode: Immediate diff --git a/nfs/teardown-nfs.sh b/nfs/teardown-nfs.sh deleted file mode 100755 index 4dde364..0000000 --- a/nfs/teardown-nfs.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -kubectl delete -f web-service.yaml -kubectl delete -f web-rc.yaml -kubectl delete -f busybox-rc.yaml -kubectl delete -f pvc.yaml -kubectl delete -f pv.yaml -kubectl delete -f nfs.yaml -kubectl delete -f nfs-xfs.yaml -kubectl delete -f nfs-ceph.yaml -kubectl delete -f rbac.yaml -kubectl delete -f psp.yaml -kubectl delete -f scc.yaml # if deployed -kubectl delete -f operator.yaml -kubectl delete -f webhook.yaml # if deployed -kubectl delete -f crds.yaml diff --git a/rooknfs/Chart.yaml b/rooknfs/Chart.yaml new file mode 100644 index 0000000..83a2a11 --- /dev/null +++ b/rooknfs/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v2 +name: rooknfs +version: 0.0.1 +description: An packaged installation of Rook NFS for Kubernetes. 
\ No newline at end of file diff --git a/rooknfs/README.md b/rooknfs/README.md new file mode 100644 index 0000000..e69de29 diff --git a/nfs/crds.yaml b/rooknfs/crds/crds.yaml similarity index 100% rename from nfs/crds.yaml rename to rooknfs/crds/crds.yaml diff --git a/nfs/nfs.yaml b/rooknfs/templates/nfs.yaml similarity index 61% rename from nfs/nfs.yaml rename to rooknfs/templates/nfs.yaml index 742fa34..6fde553 100644 --- a/nfs/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -1,32 +1,36 @@ +{{- if .Values.enabled }} --- # A default storageclass must be present apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: nfs-default-claim - namespace: rook-nfs + name: {{ .Values.claimName}} + namespace: {{ .Values.serverNamespace }} spec: accessModes: - ReadWriteMany resources: requests: - storage: 1Gi + storage: {{ .Values.storageCapacity }} --- apiVersion: nfs.rook.io/v1alpha1 kind: NFSServer metadata: - name: rook-nfs - namespace: rook-nfs + name: {{ .Values.serverName }} + namespace: {{ .Values.serverNamespace }} spec: replicas: 1 exports: - - name: share1 + - name: {{ .Values.shareName }} server: accessMode: ReadWrite squash: "none" # A Persistent Volume Claim must be created before creating NFS CRD instance. 
persistentVolumeClaim: - claimName: nfs-default-claim + claimName: {{ .Values.claimName }} # A key/value list of annotations annotations: rook: nfs +--- +{{- end }} + diff --git a/nfs/operator.yaml b/rooknfs/templates/operator.yaml similarity index 91% rename from nfs/operator.yaml rename to rooknfs/templates/operator.yaml index b289909..4a1d542 100644 --- a/nfs/operator.yaml +++ b/rooknfs/templates/operator.yaml @@ -1,13 +1,15 @@ +{{- if .Values.enabled }} +--- apiVersion: v1 kind: Namespace metadata: - name: rook-nfs-system # namespace:operator + name: {{ .Values.systemNamespace }} --- apiVersion: v1 kind: ServiceAccount metadata: name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 @@ -20,7 +22,7 @@ roleRef: subjects: - kind: ServiceAccount name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -106,7 +108,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} labels: app: rook-nfs-operator spec: @@ -134,3 +136,5 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace +--- +{{- end}} diff --git a/nfs/rbac.yaml b/rooknfs/templates/rbac.yaml similarity index 88% rename from nfs/rbac.yaml rename to rooknfs/templates/rbac.yaml index 8e3d9f7..b327740 100644 --- a/nfs/rbac.yaml +++ b/rooknfs/templates/rbac.yaml @@ -1,14 +1,15 @@ +{{- if .Values.enabled }} --- apiVersion: v1 kind: Namespace metadata: - name: rook-nfs + name: {{ .Values.serverNamespace }} --- apiVersion: v1 kind: ServiceAccount metadata: name: rook-nfs-server - namespace: rook-nfs + namespace: {{ .Values.serverNamespace }} --- kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 @@ -51,9 +52,10 @@ metadata: subjects: - kind: 
ServiceAccount name: rook-nfs-server - # replace with namespace where provisioner is deployed - namespace: rook-nfs + namespace: {{ .Values.serverNamespace }} roleRef: kind: ClusterRole name: rook-nfs-provisioner-runner apiGroup: rbac.authorization.k8s.io +--- +{{- end }} \ No newline at end of file diff --git a/rooknfs/templates/sc.yaml b/rooknfs/templates/sc.yaml new file mode 100644 index 0000000..0ad75fe --- /dev/null +++ b/rooknfs/templates/sc.yaml @@ -0,0 +1,17 @@ +{{- if .Values.enabled }} +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + labels: + app: rook-nfs + name: {{ .Values.storageClassName }} +parameters: + exportName: {{ .Values.shareName }} + nfsServerName: {{ .Values.serverName }} + nfsServerNamespace: {{ .Values.serverNamespace }} +provisioner: nfs.rook.io/rook-nfs-provisioner +reclaimPolicy: Delete +volumeBindingMode: Immediate +--- +{{- end }} \ No newline at end of file diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml new file mode 100644 index 0000000..1961fa6 --- /dev/null +++ b/rooknfs/values.yaml @@ -0,0 +1,30 @@ +# Global flag for enabling/disabling all chart resources +# This is useful for allowing charts which use this chart +# as a dependency to toggle usage of this chart based on +# values in the parent chart +enabled: true + +# Name for the NFSServer resource created by rook +serverName: rook-nfs + +# Name for the created storage class +storageClassName: rook-nfs + +# Name for the Read-Write-Once backing PVC created by Rook +claimName: rook-nfs-backing-pv + +# Name for the NFS share within the NFS Resource instance +shareName: share-1 + +# Size of the Read-Write-Once backing storage volume +storageCapacity: 10Gi + +# Image to use for the Rook NFS operator +operatorImage: rook/nfs:master + +# NOTE: For some reason deploying everything in the default +# namespace leads to R-W-M PVCs getting stuck in 'pending' +# state indefinitely, so here we separate out namespaces as +# of various components in the same way 
as the Rook docs +serverNamespace: rook-nfs +systemNamespace: rook-nfs-system \ No newline at end of file diff --git a/slurm-cluster-chart/Chart.yaml b/slurm-cluster-chart/Chart.yaml index 9e592c0..4dad59b 100644 --- a/slurm-cluster-chart/Chart.yaml +++ b/slurm-cluster-chart/Chart.yaml @@ -21,4 +21,9 @@ version: 0.1.0 # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.16.0" \ No newline at end of file +appVersion: "1.16.0" + +dependencies: + - name: rooknfs + version: 0.0.1 + repository: file://../rooknfs \ No newline at end of file diff --git a/slurm-cluster-chart/templates/hooks/pre-delete.yaml b/slurm-cluster-chart/templates/hooks/pre-delete.yaml new file mode 100644 index 0000000..8cdb1f3 --- /dev/null +++ b/slurm-cluster-chart/templates/hooks/pre-delete.yaml @@ -0,0 +1,55 @@ +{{- if .Values.rooknfs.enabled }} +# NOTE: The cleanup jobs defined here are required to ensure that things which +# Rook NFS is responsible for cleaning up are deleted before deleting the Rook +# pods which do the actual clean up of NFS resources. For example, the RWM PVC +# must be deleted before the Rook StorageClass and provisioner pod. However, +# the PVC cannot be deleted until the pods which are using it are deleted, so +# the various Slurm node pods must actually be the first resources deleted. 
+--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: rook-nfs-cleanup +--- +# TODO: Create a job-specific ClusterRole for the ServiceAccount +# instead of using the cluster-admin role here +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: rook-nfs-cleanup +subjects: +- kind: ServiceAccount + name: rook-nfs-cleanup + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: cluster-admin +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: rook-nfs-pre-delete-cleanup + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": hook-succeeded +spec: + template: + metadata: + name: rook-nfs-pre-delete-cleanup + spec: + serviceAccountName: rook-nfs-cleanup + containers: + - name: tester + image: bitnami/kubectl + command: + - "bin/bash" + - "-c" + - | + kubectl delete -n {{ .Release.Namespace }} deployment {{ .Values.login.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmctld.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmd.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} pvc {{ .Values.storage.claimName }} --wait + kubectl delete -n {{ .Values.rooknfs.serverNamespace }} nfsservers {{ .Values.rooknfs.serverName }} --wait + restartPolicy: Never +--- +{{- end }} diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login.yaml similarity index 90% rename from slurm-cluster-chart/templates/login-deployment.yaml rename to slurm-cluster-chart/templates/login.yaml index 48f8f17..ca63392 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login.yaml @@ -5,9 +5,9 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: login - name: login + name: {{ .Values.login.name }} spec: - replicas: {{ .Values.replicas.login }} + replicas: {{ 
.Values.login.replicas }} selector: matchLabels: app.kubernetes.io/name: slurm @@ -29,7 +29,7 @@ spec: ports: - containerPort: 22 volumeMounts: - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /etc/slurm/ name: slurm-config-volume @@ -51,7 +51,7 @@ spec: volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/templates/pvc.yaml b/slurm-cluster-chart/templates/pvc.yaml new file mode 100644 index 0000000..c5d5955 --- /dev/null +++ b/slurm-cluster-chart/templates/pvc.yaml @@ -0,0 +1,14 @@ +{{- if .Values.rooknfs.enabled }} +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.storage.claimName }} +spec: + storageClassName: {{ .Values.storageClassName }} + accessModes: + - ReadWriteMany + resources: + requests: + storage: {{ .Values.storage.capacity }} +{{- end }} \ No newline at end of file diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld.yaml similarity index 91% rename from slurm-cluster-chart/templates/slurmctld-statefulset.yaml rename to slurm-cluster-chart/templates/slurmctld.yaml index dc0bf90..f919c5f 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld.yaml @@ -5,7 +5,7 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmctld - name: slurmctld + name: {{ .Values.slurmctld.name }} spec: replicas: 1 selector: @@ -29,7 +29,7 @@ spec: - containerPort: 6817 resources: {} volumeMounts: - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /etc/slurm/ name: slurm-config-volume @@ -45,7 +45,7 @@ spec: volumes: - name: slurm-jobdir persistentVolumeClaim: - 
claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurmctld-state persistentVolumeClaim: claimName: var-spool-slurmctld diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd.yaml similarity index 88% rename from slurm-cluster-chart/templates/slurmd-deployment.yaml rename to slurm-cluster-chart/templates/slurmd.yaml index 4c2396e..4775748 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -5,9 +5,9 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd - name: slurmd + name: {{ .Values.slurmd.name }} spec: - replicas: {{ .Values.replicas.slurmd }} + replicas: {{ .Values.slurmd.replicas }} selector: matchLabels: app.kubernetes.io/name: slurm @@ -41,7 +41,8 @@ spec: volumeMounts: - mountPath: /etc/slurm/ name: slurm-config-volume - - mountPath: {{ .Values.nfs.mountPath }} + subPath: slurm.conf + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /tmp/munge.key name: munge-key-secret @@ -55,7 +56,7 @@ spec: volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 7873e5c..eb9501c 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,12 +1,52 @@ slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7f4d64e -replicas: - slurmd: 2 - login: 1 +login: + # Deployment resource name + name: login + replicas: 1 -nfs: +slurmd: + # StatefulSet resource name + name: slurmd + replicas: 2 + +slurmctld: + # StatefulSet resource name + name: slurmctld + # NOTE: We don't include a replicas field here because + # replicas > 1 for slurmctld needs extra Slurm config + +storage: mountPath: /home - 
claimName: rook-nfs-pv-claim + # The name of a Read-Write-Many StorageClass to use for + # the persistent volume which is shared across Slurm nodes + # Note: If using the default value then you must set + # rooknfs.enabled = true below to ensure that Rook NFS is + # installed on the cluster as a dependency of this Slurm + # chart. If you are using a separate RWM StorageClass, then + # set rooknfs.enabled = false + storageClassName: &storageclassname slurm-rook-nfs + # Name for the R-W-M volume to provision + claimName: slurm-shared-storage + # Capacite of the R-W-M volume + capacity: &capacity 10Gi + + +# Values to be passed to the rook-nfs sub-chart +# See rook-nfs sub-chart for full set of available config values +rooknfs: + enabled: true + storageClassName: *storageclassname + # Name for the NFSServer resource created by Rook + serverName: rook-nfs + # Capacity for the backing Read-Write-*Once* volume + # than Rook will create to provide the actual storage to + # the NFS server. Since we're using the Rook NFS in a + # slightly unconventional way here, we just want to anchor + # this value to the requested storage capacity for the RWM + # volume specified in storage.capacity + storageCapacity: *capacity + sqlImage: mariadb:10.10 From edfdd7c1fe8e14e889f7632249c16b3bb580dcf3 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 11 Aug 2023 14:26:24 +0100 Subject: [PATCH 03/24] Fix storageClassName templating typo --- slurm-cluster-chart/templates/pvc.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/templates/pvc.yaml b/slurm-cluster-chart/templates/pvc.yaml index c5d5955..5e934ef 100644 --- a/slurm-cluster-chart/templates/pvc.yaml +++ b/slurm-cluster-chart/templates/pvc.yaml @@ -5,7 +5,7 @@ kind: PersistentVolumeClaim metadata: name: {{ .Values.storage.claimName }} spec: - storageClassName: {{ .Values.storageClassName }} + storageClassName: {{ .Values.storage.storageClassName }} accessModes: - ReadWriteMany resources: 
From 4407fbe486a3b78bda85f93ac39fc9adda94d0f6 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 11 Aug 2023 14:55:20 +0100 Subject: [PATCH 04/24] Remove broken subPath spec --- slurm-cluster-chart/templates/slurmd.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/slurm-cluster-chart/templates/slurmd.yaml b/slurm-cluster-chart/templates/slurmd.yaml index 4775748..ff13019 100644 --- a/slurm-cluster-chart/templates/slurmd.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -41,7 +41,6 @@ spec: volumeMounts: - mountPath: /etc/slurm/ name: slurm-config-volume - subPath: slurm.conf - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /tmp/munge.key From 2ac2fd5aae4a3cd7fb824662e87fdf9b6071c384 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 14 Aug 2023 13:57:03 +0100 Subject: [PATCH 05/24] Working Helm chart publisher workflow (#25) * Added custom packaging to workflow * Trying adding charts to cr packages * Now publishes rook chart * Temporarily removed slurm chart from publisher to publish initial rook chart to repo * Trying with new workflow and temporarily removing dependency * Re-added rook dependency * Added upterm debugging * Changed rooknfs version * Removed debug --- .github/workflows/publish-helm-chart.yml | 47 +++++++------------ rooknfs/values.yaml | 2 +- slurm-cluster-chart/Chart.yaml | 4 +- .../{ => hooks}/check-jobs-finished-hook.yaml | 0 4 files changed, 21 insertions(+), 32 deletions(-) rename slurm-cluster-chart/templates/{ => hooks}/check-jobs-finished-hook.yaml (100%) diff --git a/.github/workflows/publish-helm-chart.yml b/.github/workflows/publish-helm-chart.yml index 8ce0698..516e388 100644 --- a/.github/workflows/publish-helm-chart.yml +++ b/.github/workflows/publish-helm-chart.yml @@ -1,37 +1,26 @@ -name: Release Charts - -on: - push: - branches: - - main - +name: Publish charts +# Run the tasks on every push +on: push jobs: - release: - # depending on 
default permission settings for your org (contents being read-only or read-write for workloads), you will have to add permissions - # see: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token - permissions: - contents: write + publish_charts: + name: Build and push Helm charts runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v3 + - name: Check out the repository + uses: actions/checkout@v2 with: + # This is important for the semver action to work correctly + # when determining the number of commits since the last tag fetch-depth: 0 + submodules: true - - name: Configure Git - run: | - git config user.name "$GITHUB_ACTOR" - git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - - - name: Install Helm - uses: azure/setup-helm@v3 - env: - GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + - name: Get SemVer version for current commit + id: semver + uses: stackhpc/github-actions/semver@master - - name: Run chart-releaser - uses: helm/chart-releaser-action@v1.5.0 + - name: Publish Helm charts + uses: stackhpc/github-actions/helm-publish@master with: - charts_dir: . 
- env: - CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - + token: ${{ secrets.GITHUB_TOKEN }} + version: ${{ steps.semver.outputs.version }} + app-version: ${{ steps.semver.outputs.short-sha }} diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml index 1961fa6..00a3e7f 100644 --- a/rooknfs/values.yaml +++ b/rooknfs/values.yaml @@ -27,4 +27,4 @@ operatorImage: rook/nfs:master # state indefinitely, so here we separate out namespaces as # of various components in the same way as the Rook docs serverNamespace: rook-nfs -systemNamespace: rook-nfs-system \ No newline at end of file +systemNamespace: rook-nfs-system diff --git a/slurm-cluster-chart/Chart.yaml b/slurm-cluster-chart/Chart.yaml index 4dad59b..0177e24 100644 --- a/slurm-cluster-chart/Chart.yaml +++ b/slurm-cluster-chart/Chart.yaml @@ -25,5 +25,5 @@ appVersion: "1.16.0" dependencies: - name: rooknfs - version: 0.0.1 - repository: file://../rooknfs \ No newline at end of file + version: ">=0-0" + repository: file://../rooknfs diff --git a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml b/slurm-cluster-chart/templates/hooks/check-jobs-finished-hook.yaml similarity index 100% rename from slurm-cluster-chart/templates/check-jobs-finished-hook.yaml rename to slurm-cluster-chart/templates/hooks/check-jobs-finished-hook.yaml From af39470ad767c002050cc6be9dce364f0da7eb2f Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:31:04 +0100 Subject: [PATCH 06/24] Fix typo Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- rooknfs/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rooknfs/Chart.yaml b/rooknfs/Chart.yaml index 83a2a11..b8abd25 100644 --- a/rooknfs/Chart.yaml +++ b/rooknfs/Chart.yaml @@ -1,4 +1,4 @@ apiVersion: v2 name: rooknfs version: 0.0.1 -description: An packaged installation of Rook NFS for Kubernetes. 
\ No newline at end of file +description: A packaged installation of Rook NFS for Kubernetes. \ No newline at end of file From 336f95f01c26924faf2c51c8864f1b656df10dcc Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:31:52 +0100 Subject: [PATCH 07/24] Remove yaml anchor Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index eb9501c..e8e6e09 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -25,7 +25,7 @@ storage: # installed on the cluster as a dependency of this Slurm # chart. If you are using a separate RWM StorageClass, then # set rooknfs.enabled = false - storageClassName: &storageclassname slurm-rook-nfs + storageClassName: slurm-rook-nfs # Name for the R-W-M volume to provision claimName: slurm-shared-storage # Capacite of the R-W-M volume From 5f121966277344c7fe0834c4895cf2ac4f50c9d3 Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:32:29 +0100 Subject: [PATCH 08/24] Remove anchor ref and add explanatory comment Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index e8e6e09..98fe170 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -36,7 +36,7 @@ storage: # See rook-nfs sub-chart for full set of available config values rooknfs: enabled: true - storageClassName: *storageclassname + storageClassName: slurm-rook-nfs # NB this must match storage.storageClassName when using rook # Name for the NFSServer resource created by Rook serverName: rook-nfs # Capacity for the backing Read-Write-*Once* 
volume From 350d39b4b9a6fe56a7daa0c217156e026f4a16cd Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:33:06 +0100 Subject: [PATCH 09/24] Add yaml anchor explanation Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 98fe170..2a9eaf8 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -29,7 +29,7 @@ storage: # Name for the R-W-M volume to provision claimName: slurm-shared-storage # Capacite of the R-W-M volume - capacity: &capacity 10Gi + capacity: &capacity 10Gi # NB yaml anchor used so this value is also set for `rooknfs.storageCapacity` if necessary. # Values to be passed to the rook-nfs sub-chart From 58a89d4b27e7cabf5d5203ab9b0d3294a08c1b15 Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:33:36 +0100 Subject: [PATCH 10/24] Add comment about name constraints Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 2a9eaf8..b89ca85 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -7,7 +7,7 @@ login: slurmd: # StatefulSet resource name - name: slurmd + name: slurmd # NB this must match NodeName= in slurm-cluster-chart/files/slurm.conf replicas: 2 slurmctld: From 908f808efd07c1c66653a53b78c8e2d1ca7d9a6a Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Mon, 14 Aug 2023 17:06:32 +0100 Subject: [PATCH 11/24] Add namespace as command line arg --- publish-keys.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/publish-keys.sh b/publish-keys.sh index d293e81..bdd4e0f 
100755 --- a/publish-keys.sh +++ b/publish-keys.sh @@ -1,3 +1,8 @@ -kubectl create configmap authorized-keys-configmap \ +NAMESPACE="$1" +if [[ -z $1 ]]; then + NAMESPACE=default +fi +echo Installing in namespace $NAMESPACE +kubectl -n $NAMESPACE create configmap authorized-keys-configmap \ "--from-literal=authorized_keys=$(cat ~/.ssh/*.pub)" --dry-run=client -o yaml | \ -kubectl apply -f - \ No newline at end of file +kubectl -n $NAMESPACE apply -f - \ No newline at end of file From 925ad806fe072878206310db0422f34039723b91 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 11:12:44 +0100 Subject: [PATCH 12/24] Add namespace as script arg --- generate-secrets.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index db64a53..10b7f98 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -1,13 +1,17 @@ #!/bin/bash +NAMESPACE="$1" +if [[ -z $1 ]]; then + NAMESPACE=default +fi -kubectl create secret generic database-auth-secret \ +kubectl -n $NAMESPACE create secret generic database-auth-secret \ --dry-run=client \ --from-literal=password=$(tr -dc 'A-Za-z0-9' /dev/null | base64 -w 0) \ -o yaml | \ -kubectl apply -f - \ No newline at end of file +kubectl -n $NAMESPACE apply -f - \ No newline at end of file From f32b4f1fdfeb830569ba63446de22fec6db3ac98 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 11:36:52 +0100 Subject: [PATCH 13/24] Fix dnsConfig namespace --- slurm-cluster-chart/templates/login.yaml | 2 +- slurm-cluster-chart/templates/slurmctld.yaml | 2 +- slurm-cluster-chart/templates/slurmd.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/slurm-cluster-chart/templates/login.yaml b/slurm-cluster-chart/templates/login.yaml index ca63392..d8a813c 100644 --- a/slurm-cluster-chart/templates/login.yaml +++ b/slurm-cluster-chart/templates/login.yaml @@ -46,7 +46,7 @@ spec: hostname: login dnsConfig: searches: - - 
slurmd.default.svc.cluster.local + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir diff --git a/slurm-cluster-chart/templates/slurmctld.yaml b/slurm-cluster-chart/templates/slurmctld.yaml index f919c5f..1644463 100644 --- a/slurm-cluster-chart/templates/slurmctld.yaml +++ b/slurm-cluster-chart/templates/slurmctld.yaml @@ -40,7 +40,7 @@ spec: name: slurmctld-state dnsConfig: searches: - - slurmd.default.svc.cluster.local + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir diff --git a/slurm-cluster-chart/templates/slurmd.yaml b/slurm-cluster-chart/templates/slurmd.yaml index ff13019..62646b7 100644 --- a/slurm-cluster-chart/templates/slurmd.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -50,7 +50,7 @@ spec: privileged: true dnsConfig: searches: - - slurmd.default.svc.cluster.local + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir From a33790b2a35cf4b94aebd4bcaa977e69269d2d89 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 12:06:50 +0100 Subject: [PATCH 14/24] Use builtin Helm optional dependency feature --- rooknfs/templates/nfs.yaml | 3 --- rooknfs/templates/operator.yaml | 2 -- rooknfs/templates/rbac.yaml | 4 +--- rooknfs/templates/sc.yaml | 4 +--- rooknfs/values.yaml | 5 ----- slurm-cluster-chart/Chart.yaml | 1 + slurm-cluster-chart/templates/pvc.yaml | 4 +--- 7 files changed, 4 insertions(+), 19 deletions(-) diff --git a/rooknfs/templates/nfs.yaml b/rooknfs/templates/nfs.yaml index 6fde553..1da86bc 100644 --- a/rooknfs/templates/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} --- # A default storageclass must be present apiVersion: v1 @@ -32,5 +31,3 @@ spec: annotations: rook: nfs --- -{{- end }} - diff --git a/rooknfs/templates/operator.yaml b/rooknfs/templates/operator.yaml index 4a1d542..56318f6 100644 --- 
a/rooknfs/templates/operator.yaml +++ b/rooknfs/templates/operator.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} --- apiVersion: v1 kind: Namespace @@ -137,4 +136,3 @@ spec: fieldRef: fieldPath: metadata.namespace --- -{{- end}} diff --git a/rooknfs/templates/rbac.yaml b/rooknfs/templates/rbac.yaml index b327740..422a43b 100644 --- a/rooknfs/templates/rbac.yaml +++ b/rooknfs/templates/rbac.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} --- apiVersion: v1 kind: Namespace @@ -57,5 +56,4 @@ roleRef: kind: ClusterRole name: rook-nfs-provisioner-runner apiGroup: rbac.authorization.k8s.io ---- -{{- end }} \ No newline at end of file +--- \ No newline at end of file diff --git a/rooknfs/templates/sc.yaml b/rooknfs/templates/sc.yaml index 0ad75fe..505bd44 100644 --- a/rooknfs/templates/sc.yaml +++ b/rooknfs/templates/sc.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} --- apiVersion: storage.k8s.io/v1 kind: StorageClass @@ -13,5 +12,4 @@ parameters: provisioner: nfs.rook.io/rook-nfs-provisioner reclaimPolicy: Delete volumeBindingMode: Immediate ---- -{{- end }} \ No newline at end of file +--- \ No newline at end of file diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml index 00a3e7f..4150967 100644 --- a/rooknfs/values.yaml +++ b/rooknfs/values.yaml @@ -1,8 +1,3 @@ -# Global flag for enabling/disabling all chart resources -# This is useful for allowing charts which use this chart -# as a dependency to toggle usage of this chart based on -# values in the parent chart -enabled: true # Name for the NFSServer resource created by rook serverName: rook-nfs diff --git a/slurm-cluster-chart/Chart.yaml b/slurm-cluster-chart/Chart.yaml index 0177e24..e3d003c 100644 --- a/slurm-cluster-chart/Chart.yaml +++ b/slurm-cluster-chart/Chart.yaml @@ -27,3 +27,4 @@ dependencies: - name: rooknfs version: ">=0-0" repository: file://../rooknfs + condition: rooknfs.enabled diff --git a/slurm-cluster-chart/templates/pvc.yaml b/slurm-cluster-chart/templates/pvc.yaml index 5e934ef..aab0856 
100644 --- a/slurm-cluster-chart/templates/pvc.yaml +++ b/slurm-cluster-chart/templates/pvc.yaml @@ -1,4 +1,3 @@ -{{- if .Values.rooknfs.enabled }} --- apiVersion: v1 kind: PersistentVolumeClaim @@ -10,5 +9,4 @@ spec: - ReadWriteMany resources: requests: - storage: {{ .Values.storage.capacity }} -{{- end }} \ No newline at end of file + storage: {{ .Values.storage.capacity }} \ No newline at end of file From f86952f405ee251f212024a58d8ec6dd75e40314 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 12:22:21 +0100 Subject: [PATCH 15/24] Separate Rook cleanup into correct chart --- generate-secrets.sh | 4 +- rooknfs/templates/hooks/pre-delete.yaml | 50 +++++++++++++++++++ .../templates/hooks/pre-delete.yaml | 14 +++--- 3 files changed, 59 insertions(+), 9 deletions(-) create mode 100644 rooknfs/templates/hooks/pre-delete.yaml diff --git a/generate-secrets.sh b/generate-secrets.sh index 10b7f98..5956181 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -6,12 +6,12 @@ fi kubectl -n $NAMESPACE create secret generic database-auth-secret \ --dry-run=client \ ---from-literal=password=$(tr -dc 'A-Za-z0-9' /dev/null | base64 -w 0) \ +--from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64) \ -o yaml | \ kubectl -n $NAMESPACE apply -f - \ No newline at end of file diff --git a/rooknfs/templates/hooks/pre-delete.yaml b/rooknfs/templates/hooks/pre-delete.yaml new file mode 100644 index 0000000..2c75c89 --- /dev/null +++ b/rooknfs/templates/hooks/pre-delete.yaml @@ -0,0 +1,50 @@ +# NOTE: The cleanup jobs defined here are required to ensure that things which +# Rook NFS is responsible for cleaning up are deleted before deleting the Rook +# pods which do the actual clean up of NFS resources. For example, the RWM PVC +# must be deleted before the Rook StorageClass and provisioner pod. 
However, +# the PVC cannot be deleted until the pods which are using it are deleted, so +# the various Slurm node pods must actually be the first resources deleted. +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: rook-nfs-cleanup +--- +# TODO: Create a job-specific ClusterRole for the ServiceAccount +# instead of using the cluster-admin role here +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: rook-nfs-cleanup +subjects: +- kind: ServiceAccount + name: rook-nfs-cleanup + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: cluster-admin +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: rook-nfs-pre-delete-cleanup + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "10" +spec: + template: + metadata: + name: rook-nfs-pre-delete-cleanup + spec: + serviceAccountName: rook-nfs-cleanup + containers: + - name: tester + image: bitnami/kubectl + command: + - "bin/bash" + - "-c" + - | + kubectl delete -n {{ .Values.serverNamespace }} nfsservers {{ .Values.serverName }} --wait + restartPolicy: Never +--- \ No newline at end of file diff --git a/slurm-cluster-chart/templates/hooks/pre-delete.yaml b/slurm-cluster-chart/templates/hooks/pre-delete.yaml index 8cdb1f3..868cbbd 100644 --- a/slurm-cluster-chart/templates/hooks/pre-delete.yaml +++ b/slurm-cluster-chart/templates/hooks/pre-delete.yaml @@ -9,17 +9,17 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: rook-nfs-cleanup + name: slurm-k8s-cleanup --- # TODO: Create a job-specific ClusterRole for the ServiceAccount # instead of using the cluster-admin role here apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: rook-nfs-cleanup + name: slurm-k8s-cleanup subjects: - kind: ServiceAccount - name: rook-nfs-cleanup + name: slurm-k8s-cleanup namespace: {{ .Release.Namespace }} roleRef: kind: ClusterRole @@ -28,16 +28,17 @@ roleRef: apiVersion: 
batch/v1 kind: Job metadata: - name: rook-nfs-pre-delete-cleanup + name: slurm-k8s-pre-delete-cleanup annotations: "helm.sh/hook": pre-delete "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "1" spec: template: metadata: - name: rook-nfs-pre-delete-cleanup + name: slurm-k8s-pre-delete-cleanup spec: - serviceAccountName: rook-nfs-cleanup + serviceAccountName: slurm-k8s-cleanup containers: - name: tester image: bitnami/kubectl @@ -49,7 +50,6 @@ spec: kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmctld.name }} --wait --cascade=foreground kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmd.name }} --wait --cascade=foreground kubectl delete -n {{ .Release.Namespace }} pvc {{ .Values.storage.claimName }} --wait - kubectl delete -n {{ .Values.rooknfs.serverNamespace }} nfsservers {{ .Values.rooknfs.serverName }} --wait restartPolicy: Never --- {{- end }} From 1371681210c766da9871ec90ba2e140f645522be Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 13:40:08 +0100 Subject: [PATCH 16/24] Update docs --- README.md | 30 +++++++++++++++++------------- rooknfs/README.md | 3 +++ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 11fe8b8..7411656 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ # Slurm Docker Cluster -This is a multi-container Slurm cluster using Kubernetes. The Helm chart -creates a named volume for persistent storage of MySQL data files as well as -an NFS volume for shared storage. +This is a multi-container Slurm cluster using Kubernetes. The Slurm cluster Helm chart creates a named volume for persistent storage of MySQL data files. By default, it also installs the +RookNFS Helm chart (also in this repo) to provide shared storage across the Slurm cluster nodes. 
## Dependencies @@ -27,12 +26,11 @@ The Helm chart will create the following named volumes: * var_lib_mysql ( -> /var/lib/mysql ) -A named ReadWriteMany (RWX) volume mounted to `/home` is also expected, this can be external or can be deployed using the scripts in the `/nfs` directory (See "Deploying the Cluster") +A named ReadWriteMany (RWX) volume mounted to `/home` is also expected, this can be external or can be deployed using the provided `rooknfs` chart directory (See "Deploying the Cluster") ## Configuring the Cluster -All config files in `slurm-cluster-chart/files` will be mounted into the container to configure their respective services on startup. Note that changes to these files will not all be propagated to existing deployments (see "Reconfiguring the Cluster"). -Additional parameters can be found in the `values.yaml` file, which will be applied on a Helm chart deployment. Note that some of these values will also not propagate until the cluster is restarted (see "Reconfiguring the Cluster"). +All config files in `slurm-cluster-chart/files` will be mounted into the container to configure their respective services on startup. Note that changes to these files will not all be propagated to existing deployments (see "Reconfiguring the Cluster"). Additional parameters can be found in the `values.yaml` file for the Helm chart. Note that some of these values will also not propagate until the cluster is restarted (see "Reconfiguring the Cluster"). ## Deploying the Cluster @@ -44,21 +42,20 @@ On initial deployment ONLY, run ``` This generates a set of secrets. If these need to be regenerated, see "Reconfiguring the Cluster" -### Connecting RWX Volume +### Connecting a RWX Volume -A ReadWriteMany (RWX) volume is required, if a named volume exists, set `nfs.claimName` in the `values.yaml` file to its name. If not, manifests to deploy a Rook NFS volume are provided in the `/nfs` directory. 
You can deploy this by running -```console -./nfs/deploy-nfs.sh -``` -and leaving `nfs.claimName` as the provided value. +A ReadWriteMany (RWX) volume is required for shared storage across cluster nodes. By default, the Rook NFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide a RWX capable Storage Class for the required shared volume. If the target Kubernetes cluster has an existing storage class which should be used instead, then `storageClass` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. + +See the separate RookNFS chart [values.yaml](./rooknfs/values.yaml) for further configuration options when using the RookNFS to provide the shared storage volume. ### Supplying Public Keys To access the cluster via `ssh`, you will need to make your public keys available. All your public keys from localhost can be added by running ```console -./publish-keys.sh +./publish-keys.sh <target-namespace> ``` +where `<target-namespace>` is the namespace in which the Slurm cluster chart will be deployed (i.e. using `helm install -n <target-namespace> ...`). This will create a Kubernetes Secret in the appropriate namespace for the Slurm cluster to use. Omitting the namespace arg will install the secrets in the default namespace. ### Deploying with Helm @@ -66,6 +63,12 @@ After configuring `kubectl` with the appropriate `kubeconfig` file, deploy the c ```console helm install <deployment-name> slurm-cluster-chart ``` + +NOTE: If using the RookNFS dependency, then the following must be run before installing the Slurm cluster chart +```console +helm dependency update slurm-cluster-chart +``` + Subsequent releases can be deployed using: ```console @@ -128,6 +131,7 @@ srun singularity exec docker://ghcr.io/stackhpc/mpitests-container:${MPI_CONTAIN ``` Note: The mpirun script assumes you are running as user 'rocky'.
If you are running as root, you will need to include the --allow-run-as-root argument + ## Reconfiguring the Cluster ### Changes to config files diff --git a/rooknfs/README.md b/rooknfs/README.md index e69de29..5b7ad6d 100644 --- a/rooknfs/README.md +++ b/rooknfs/README.md @@ -0,0 +1,3 @@ +# RookNFS Helm Chart + +See `values.yaml` for available config options. \ No newline at end of file From fe58891e7ccd9de6cb87f92083db59841f98e7e1 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 13:54:30 +0100 Subject: [PATCH 17/24] Make backing RWO storage class configurable --- rooknfs/templates/nfs.yaml | 1 + rooknfs/values.yaml | 3 +++ slurm-cluster-chart/values.yaml | 7 ++++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/rooknfs/templates/nfs.yaml b/rooknfs/templates/nfs.yaml index 1da86bc..a88fb6f 100644 --- a/rooknfs/templates/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -6,6 +6,7 @@ metadata: name: {{ .Values.claimName}} namespace: {{ .Values.serverNamespace }} spec: + storageClassName: {{ .Values.backingStorageClass }} accessModes: - ReadWriteMany resources: diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml index 4150967..4ada627 100644 --- a/rooknfs/values.yaml +++ b/rooknfs/values.yaml @@ -8,6 +8,9 @@ storageClassName: rook-nfs # Name for the Read-Write-Once backing PVC created by Rook claimName: rook-nfs-backing-pv +# Storage class to use for the Read-Write-Once backing PVC +backingStorageClass: + # Name for the NFS share within the NFS Resource instance shareName: share-1 diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index b89ca85..1f59a5a 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -36,7 +36,9 @@ storage: # See rook-nfs sub-chart for full set of available config values rooknfs: enabled: true - storageClassName: slurm-rook-nfs # NB this must match storage.storageClassName when using rook + # Name given to the RWM StorageClass created by Rook + # NB 
this must match storage.storageClassName when using Rook + storageClassName: slurm-rook-nfs # Name for the NFSServer resource created by Rook serverName: rook-nfs # Capacity for the backing Read-Write-*Once* volume @@ -46,6 +48,9 @@ rooknfs: # this value to the requested storage capacity for the RWM # volume specified in storage.capacity storageCapacity: *capacity + # Storage class to use for the Read-Write-Once backing PVC + # backingStorageClass: + sqlImage: mariadb:10.10 From 303d156f78087eefa761aab7746fd6fafbab5399 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 13:57:00 +0100 Subject: [PATCH 18/24] Mention storage capacity config --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7411656..5ac48f2 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ This generates a set of secrets. If these need to be regenerated, see "Reconfigu ### Connecting a RWX Volume -A ReadWriteMany (RWX) volume is required for shared storage across cluster nodes. By default, the Rook NFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide a RWX capable Storage Class for the required shared volume. If the target Kubernetes cluster has an existing storage class which should be used instead, then `storageClass` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. +A ReadWriteMany (RWX) volume is required for shared storage across cluster nodes. By default, the Rook NFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide a RWX capable Storage Class for the required shared volume. If the target Kubernetes cluster has an existing storage class which should be used instead, then `storageClass` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. 
In either case, the storage capacity of the provisioned RWX volume can be configured by setting the value of `storage.capacity`. See the separate RookNFS chart [values.yaml](./rooknfs/values.yaml) for further configuration options when using the RookNFS to provide the shared storage volume. From 1debdedcd97a78f17ab5dc6884ce1c26400cf624 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 14:00:44 +0100 Subject: [PATCH 19/24] Add note on target namespace --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5ac48f2..4e21c3d 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,9 @@ All config files in `slurm-cluster-chart/files` will be mounted into the contain On initial deployment ONLY, run ```console -./generate-secrets.sh +./generate-secrets.sh <target-namespace> ``` -This generates a set of secrets. If these need to be regenerated, see "Reconfiguring the Cluster" +This generates a set of secrets in the target namespace to be used by the Slurm cluster.
If these need to be regenerated, see "Reconfiguring the Cluster" ### Connecting a RWX Volume From 8818a94a30df63d312b645b0450e090c6f9f1587 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 14:20:56 +0100 Subject: [PATCH 20/24] Revert to randomly generated DB password --- generate-secrets.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index 261f3be..dab0688 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -6,7 +6,7 @@ fi kubectl -n $NAMESPACE create secret generic database-auth-secret \ --dry-run=client \ ---from-literal=password=abcdefghijklmnopqrstuvwxyz123456 \ +--from-literal=password=$(tr -dc 'A-Za-z0-9' Date: Tue, 15 Aug 2023 14:49:06 +0100 Subject: [PATCH 21/24] Conditionally include backing storage class field --- rooknfs/templates/nfs.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rooknfs/templates/nfs.yaml b/rooknfs/templates/nfs.yaml index a88fb6f..cf7b1de 100644 --- a/rooknfs/templates/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -3,10 +3,12 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: {{ .Values.claimName}} + name: {{ .Values.claimName }} namespace: {{ .Values.serverNamespace }} spec: + {{- if .Values.backingStorageClass }} storageClassName: {{ .Values.backingStorageClass }} + {{- end }} accessModes: - ReadWriteMany resources: From 50e728515c3f2416a508d121ca6ed180278cab43 Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Wed, 16 Aug 2023 10:13:36 +0100 Subject: [PATCH 22/24] Punctuation Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2458e39..aad9b4b 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ The Helm chart will create the following named volumes: * var_lib_mysql ( -> /var/lib/mysql ) -A named ReadWriteMany (RWX) volume mounted 
to `/home` is also expected, this can be external or can be deployed using the provided `rooknfs` chart directory (See "Deploying the Cluster") +A named ReadWriteMany (RWX) volume mounted to `/home` is also expected, this can be external or can be deployed using the provided `rooknfs` chart directory (See "Deploying the Cluster"). ## Configuring the Cluster From 729e43c0f07aad5f114131be8dbc5e0096b0cb76 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Wed, 16 Aug 2023 10:17:10 +0100 Subject: [PATCH 23/24] Clarify namespace arg as optional --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aad9b4b..c0b7d61 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ All config files in `slurm-cluster-chart/files` will be mounted into the contain On initial deployment ONLY, run ```console -./generate-secrets.sh <target-namespace> +./generate-secrets.sh [<target-namespace>] ``` This generates a set of secrets in the target namespace to be used by the Slurm cluster. If these need to be regenerated, see "Reconfiguring the Cluster" @@ -55,7 +55,7 @@ See the separate RookNFS chart [values.yaml](./rooknfs/values.yaml) for further To access the cluster via `ssh`, you will need to make your public keys available. All your public keys from localhost can be added by running ```console -./publish-keys.sh <target-namespace> +./publish-keys.sh [<target-namespace>] ``` where `<target-namespace>` is the namespace in which the Slurm cluster chart will be deployed (i.e. using `helm install -n <target-namespace> ...`). This will create a Kubernetes Secret in the appropriate namespace for the Slurm cluster to use. Omitting the namespace arg will install the secrets in the default namespace.
From 43a5dd7232c5bc149a232bfe9acb71f732b08f40 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Wed, 16 Aug 2023 10:18:07 +0100 Subject: [PATCH 24/24] Re-disable line wrapping --- generate-secrets.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index dab0688..a49ede2 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -12,7 +12,7 @@ kubectl -n $NAMESPACE apply -f - kubectl -n $NAMESPACE create secret generic munge-key-secret \ --dry-run=client \ ---from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64) \ +--from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64 -w 0) \ -o yaml | \ kubectl -n $NAMESPACE apply -f -