diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index e430ae3..9390ce2 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -150,6 +150,18 @@ RUN url="https://raw.githubusercontent.com/python-poetry/poetry/master/install-p
     wget -q -O- "$url" | POETRY_VERSION=1.1.13 python3 -; \
     poetry --version;
 
+# Install kind
+RUN url="https://kind.sigs.k8s.io/dl/v0.13.0/kind-linux-amd64"; \
+    sha256="c80c6d1013337cbbe226c2eda0a3dc2d75af16e5fa8af4ce3fc9fedcf1f9d2dc"; \
+    \
+    wget -O kind "$url" --progress=dot:giga; \
+    echo "$sha256 kind" | sha256sum --strict --check -; \
+    \
+    chmod +x kind; \
+    mv kind /usr/local/bin; \
+    \
+    kind version;
+
 WORKDIR /workspace
 
 CMD [ "/bin/bash" ]
diff --git a/.devcontainer/targets.mk b/.devcontainer/targets.mk
index 339efa2..ab779df 100644
--- a/.devcontainer/targets.mk
+++ b/.devcontainer/targets.mk
@@ -9,6 +9,8 @@ DEV_CONTAINER_WORKING_DIR = /workspace$(CURDIR:$(DEV_CONTAINER_MOUNT)%=%)
 DOCKER_CONFIG_JSON = $(HOME)/.docker/config.json
 DOCKER_LINUX_OPTS = -u $(shell id -u):$(shell id -g) --group-add=$(shell getent group docker | cut -d: -f3)
 DOCKER_OTHER_OS_OPTS =
+DOCKER_NETWORK_NAME ?=
+DOCKER_NETWORK_OPTS =
 
 ifeq ($(shell uname), Linux)
 	DOCKER_OS_OPTS = $(DOCKER_LINUX_OPTS)
@@ -16,6 +18,9 @@ else
 	DOCKER_OS_OPTS = $(DOCKER_OTHER_OS_OPTS)
 endif
 
+ifneq ($(DOCKER_NETWORK_NAME),)
+	DOCKER_NETWORK_OPTS = --network=$(DOCKER_NETWORK_NAME)
+endif
 
 --pull-devcontainer:
 	@docker pull ${MAKEVAR_REGISTRY}/${DEV_CONTAINER_IMAGE_NAME}:${DEV_CONTAINER_TAG} || true
@@ -35,7 +40,10 @@ devcontainer-%: ENVFILE := $(shell mktemp)
 devcontainer-%: devcontainer
 	env | grep -e 'AWS_' -e 'ARM_' -e 'GITHUB_' -e 'MAKEVAR_' -e 'SKIP_' -e 'TF_' >> ${ENVFILE} || true
 	echo MAKEVAR_DIND=true >> ${ENVFILE}
-	docker run --rm $(DOCKER_OS_OPTS) \
+	@if [ -n "$(DOCKER_NETWORK_NAME)" ]; then \
+		docker network create $(DOCKER_NETWORK_NAME) || true; \
+	fi
+	docker run --rm $(DOCKER_OS_OPTS) $(DOCKER_NETWORK_OPTS) \
 		-v ~/.kube/config:$(DEV_CONTAINER_WORKING_DIR)/.kube/config \
 		-v /var/run/docker.sock:/var/run/docker.sock \
 		-v $(DEV_CONTAINER_MOUNT):/workspace \
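The `DOCKER_NETWORK_NAME` variable added above lets any `devcontainer-%` run join a named Docker network, creating it first if needed. A minimal sketch of how a caller would use it (the target name is only an example):

```shell
# Attach the devcontainer to the "kind" Docker network so it can reach the
# kind control-plane and the local registry by container name.
make DOCKER_NETWORK_NAME=kind devcontainer-build
```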
diff --git a/README.md b/README.md
index 6f1cb00..85daf62 100644
--- a/README.md
+++ b/README.md
@@ -16,3 +16,7 @@ Already cloned without recursive options? Run the following command to initialize them:
 ```sh
 git submodule update --init --recursive
 ```
+
+### Local kubernetes cluster
+
+For instructions on how to spin up a local kubernetes environment, please see the [develop README.md](develop/README.md).
diff --git a/components/fl-operator/kustomize/crd/bases/fl.katulu.io_floperators.yaml b/components/fl-operator/kustomize/crd/bases/fl.katulu.io_floperators.yaml
index 3ce2f75..7de1e98 100644
--- a/components/fl-operator/kustomize/crd/bases/fl.katulu.io_floperators.yaml
+++ b/components/fl-operator/kustomize/crd/bases/fl.katulu.io_floperators.yaml
@@ -44,11 +44,6 @@ spec:
             type: object
           status:
             description: FlOperatorStatus defines the observed state of FlOperator
-            properties:
-              running-servers:
-                items:
-                  type: object
-                type: array
             type: object
         type: object
     served: true
diff --git a/components/fl-operator/pkg/resources/resources.go b/components/fl-operator/pkg/resources/resources.go
index 93af84e..d4e6599 100644
--- a/components/fl-operator/pkg/resources/resources.go
+++ b/components/fl-operator/pkg/resources/resources.go
@@ -55,6 +55,7 @@ func NewDeployment(task *pb.OrchestratorMessage_TaskSpec, name types.NamespacedN
 	labels := map[string]string{
 		FlClientDeploymentLabelKey: FlClientDeploymentLabelValue,
 		"run-id":                   string(task.ID),
+		"spire-workload":           "flower-client",
 	}
 
 	envoyConfigVolumeKey := "envoy-config"
@@ -63,6 +64,7 @@ func NewDeployment(task *pb.OrchestratorMessage_TaskSpec, name types.NamespacedN
 		ObjectMeta: metav1.ObjectMeta{
 			Name:      name.Name,
 			Namespace: name.Namespace,
+			Labels:    labels,
 		},
 		Spec: appsv1.DeploymentSpec{
 			Replicas: utils.Int32Ptr(1),
@@ -158,7 +160,8 @@ func NewDeployment(task *pb.OrchestratorMessage_TaskSpec, name types.NamespacedN
 // Creates a new envoy proxy deployment
 func NewEnvoyproxyDeployment(name types.NamespacedName) *appsv1.Deployment {
 	labels := map[string]string{
-		"app": name.Name,
+		"app":            name.Name,
+		"spire-workload": "fl-operator",
 	}
 
 	const envoyConfigVolumeKey = "envoy-config"
@@ -167,6 +170,7 @@ func NewEnvoyproxyDeployment(name types.NamespacedName) *appsv1.Deployment {
 		ObjectMeta: metav1.ObjectMeta{
 			Name:      name.Name,
 			Namespace: name.Namespace,
+			Labels:    labels,
 		},
 		Spec: appsv1.DeploymentSpec{
 			Replicas: utils.Int32Ptr(1),
@@ -244,7 +248,7 @@ func NewEnvoyproxyService(name types.NamespacedName) *corev1.Service {
 		Spec: corev1.ServiceSpec{
 			Ports: []corev1.ServicePort{
 				{
-					Name:     "http",
+					Name:     "grpc",
 					Port:     9080,
 					Protocol: "TCP",
 				},
diff --git a/components/spire/kustomize/spire-agent/agent-daemonset.yaml b/components/spire/kustomize/spire-agent/agent-daemonset.yaml
index a6c9b92..f186a57 100644
--- a/components/spire/kustomize/spire-agent/agent-daemonset.yaml
+++ b/components/spire/kustomize/spire-agent/agent-daemonset.yaml
@@ -27,7 +27,7 @@ spec:
         args: ["-t", "30", "spire-server:8081"]
       containers:
         - name: spire-agent
-          image: gcr.io/spiffe-io/spire-agent:1.1.2
+          image: gcr.io/spiffe-io/spire-agent:1.3.0
           args:
             - -expandEnv
            - -config
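The `spire-workload` pod labels introduced in resources.go give SPIRE a stable selector for the federated-learning workloads. As an illustration only, a manual registration entry keyed on that label could look like the sketch below; the SPIFFE ID paths are assumptions, and only the `katulu.io` trust domain is taken from the spire-agent.conf added later in this diff:

```shell
# Sketch: register the flower-client workload via its new pod label.
spire-server entry create \
  -parentID spiffe://katulu.io/spire-agent \
  -spiffeID spiffe://katulu.io/flower-client \
  -selector k8s:pod-label:spire-workload:flower-client
```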
diff --git a/components/spire/kustomize/spire-server/server-statefulset.yaml b/components/spire/kustomize/spire-server/server-statefulset.yaml
index 48f8422..4c100a0 100644
--- a/components/spire/kustomize/spire-server/server-statefulset.yaml
+++ b/components/spire/kustomize/spire-server/server-statefulset.yaml
@@ -20,7 +20,7 @@ spec:
       shareProcessNamespace: true
       containers:
         - name: spire-server
-          image: gcr.io/spiffe-io/spire-server:1.1.2
+          image: gcr.io/spiffe-io/spire-server:1.3.0
           args:
             - -config
             - /run/spire/config/server.conf
@@ -53,7 +53,7 @@ spec:
             readOnly: false
         - name: k8s-workload-registrar
-          image: gcr.io/spiffe-io/k8s-workload-registrar:1.1.0
+          image: gcr.io/spiffe-io/k8s-workload-registrar:1.3.0
           args:
             - -config
             - /run/spire/config/k8s-workload-registrar.conf
diff --git a/develop/.gitignore b/develop/.gitignore
new file mode 100644
index 0000000..2d66dfc
--- /dev/null
+++ b/develop/.gitignore
@@ -0,0 +1,3 @@
+# KUBECONFIG
+/local.fl-suite.kubeconfig.yaml
+/.local.fl-suite-internal.kubeconfig.yaml
diff --git a/develop/Makefile b/develop/Makefile
new file mode 100644
index 0000000..596dd32
--- /dev/null
+++ b/develop/Makefile
@@ -0,0 +1,20 @@
+DOCKER_NETWORK_NAME = kind
+
+-include ../.devcontainer/targets.mk
+
+dependencies lint test build dist push:
+	@echo "$@ not implemented"
+.PHONY: dependencies lint test build dist push
+
+local-registry:
+	docker network create kind || true
+	docker run -d --name registry --restart=always -p 5000:5000 --net=kind registry:2
+.PHONY: local-registry
+
+provision:
+	@./provision.sh
+.PHONY: provision
+
+teardown:
+	@./teardown.sh
+.PHONY: teardown
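A quick way to verify that the `local-registry` target worked, assuming the default `-p 5000:5000` mapping above, is to query the registry's standard catalog endpoint:

```shell
# A freshly started registry answers with an empty repository list.
curl http://localhost:5000/v2/_catalog
# {"repositories":[]}
```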
diff --git a/develop/README.md b/develop/README.md
new file mode 100644
index 0000000..6222b54
--- /dev/null
+++ b/develop/README.md
@@ -0,0 +1,104 @@
+# Local kubernetes cluster
+
+## Requirements
+
+* [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl).
+* [kustomize](https://kubectl.docs.kubernetes.io/installation/kustomize/).
+* [kind](https://kind.sigs.k8s.io/docs/user/quick-start/#installation).
+
+> 🌻 Add `devcontainer-` to any of the make targets to use a container image with the requirements already pre-installed.
+
+## Deploy
+
+To deploy a local kubernetes cluster we use kind (Kubernetes In Docker).
+
+### 1. Setup a local container registry
+
+The kind cluster uses a local registry to host the fl-suite container images. To deploy it, run:
+
+```shell
+make local-registry
+```
+
+### 2. Push the fl-suite images to the local container registry
+
+Build, dist, and push the fl-suite's container images. This needs to be done from the root of the project.
+
+On Linux (with all the tools to build all the components of the fl-suite):
+
+```
+cd /path/to/katulu-io/fl-suite/
+
+export MAKEVAR_REGISTRY=localhost:5000
+make build dist push
+```
+
+On any other platform:
+
+```
+cd /path/to/katulu-io/fl-suite/
+
+export MAKEVAR_REGISTRY=localhost:5000
+make devcontainer-build devcontainer-dist push
+```
+
+### 3. Provision the kind cluster
+
+```shell
+make provision
+```
+
+This step will show some errors like:
+
+```
+Error from server (NotFound): error when creating "STDIN": namespaces "katulu-fl" not found
+```
+
+This and other CRD-related errors are expected. The namespace "katulu-fl" gets created once a Kubeflow Profile is reconciled in kubernetes, which can take some time. The other CRD errors (e.g. cert-manager's Certificate CRDs) have the same cause. The `provision` target retries as many times as needed; provisioning normally takes around 20 minutes, depending on local resources (CPU, network, etc.).
+
+> 🌻 The same make-target can be used to update the cluster with the latest kustomize changes.
+
+A kubeconfig file is generated that can be used to configure `kubectl` and access the kind cluster:
+
+```shell
+export KUBECONFIG=local.fl-suite.kubeconfig.yaml
+kubectl get nodes
+NAME                           STATUS   ROLES                  AGE     VERSION
+local.fl-suite-control-plane   Ready    control-plane,master   5m00s   v1.21.10
+```
+
+### 4. Wait for all the pods to be ready
+
+```shell
+export KUBECONFIG=local.fl-suite.kubeconfig.yaml
+kubectl get pods -n cert-manager
+kubectl get pods -n istio-system
+kubectl get pods -n auth
+kubectl get pods -n knative-eventing
+kubectl get pods -n knative-serving
+kubectl get pods -n kubeflow
+kubectl get pods -n katulu-fl
+kubectl get pods -n spire
+kubectl get pods -n container-registry
+```
+
+### 5. Log in to the fl-suite central dashboard
+
+Once all pods are ready, you can access the fl-suite via:
+
+* On Linux: http://localhost
+* On MacOS: http://docker.for.mac.localhost
+
+The credentials are:
+
+```
+Username: user@example.com
+Password: 12341234
+```
+
+## Teardown
+
+```shell
+make teardown
+```
diff --git a/develop/kind-config.yaml b/develop/kind-config.yaml
new file mode 100644
index 0000000..217084e
--- /dev/null
+++ b/develop/kind-config.yaml
@@ -0,0 +1,25 @@
+---
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+name: local.fl-suite
+containerdConfigPatches:
+  - |-
+    [plugins."io.containerd.grpc.v1.cri".registry.mirrors."localhost:5000"]
+      endpoint = ["http://registry:5000"]
+    [plugins."io.containerd.grpc.v1.cri".registry.mirrors."container-registry.container-registry:5000"]
+      endpoint = ["http://local.fl-suite-control-plane:30080"]
+nodes:
+  - role: control-plane
+    # Using kubernetes version 1.21 to avoid "no matches for kind "CustomResourceDefinition"" error (related issue: https://github.com/kubeflow/manifests/issues/2028)
+    image: kindest/node:v1.21.10@sha256:84709f09756ba4f863769bdcabe5edafc2ada72d3c8c44d6515fc581b66b029c
+    extraPortMappings:
+      - containerPort: 30080
+        hostPort: 80
+      - containerPort: 30443
+        hostPort: 443
+    # TODO: Remove this extra mount. The FLOperator pods expect a /dataset directory in the kubernetes node. To
+    # force-create one, we let kind do it.
+    extraMounts:
+      - hostPath: dataset/
+        containerPath: /dataset
+        readOnly: true
diff --git a/develop/kustomize/allow-fl-operator-access.yaml b/develop/kustomize/allow-fl-operator-access.yaml
new file mode 100644
index 0000000..6bd7ab6
--- /dev/null
+++ b/develop/kustomize/allow-fl-operator-access.yaml
@@ -0,0 +1,11 @@
+---
+apiVersion: security.istio.io/v1beta1
+kind: AuthorizationPolicy
+metadata:
+  name: allow-fl-operator-access
+  namespace: katulu-fl
+spec:
+  rules:
+    - from:
+        - source:
+            principals: ["cluster.local/ns/fl-operator-system/sa/fl-operator-controller-manager"]
diff --git a/develop/kustomize/config/internal-registry-credentials.json b/develop/kustomize/config/internal-registry-credentials.json
new file mode 100644
index 0000000..e0f2c5d
--- /dev/null
+++ b/develop/kustomize/config/internal-registry-credentials.json
@@ -0,0 +1,7 @@
+{
+  "auths":{
+    "container-registry.container-registry:5000": {
+      "auth":"cmVnaXN0cnk6cmVnaXN0cnk="
+    }
+  }
+}
diff --git a/develop/kustomize/config/spire-agent.conf b/develop/kustomize/config/spire-agent.conf
new file mode 100644
index 0000000..8bf14e1
--- /dev/null
+++ b/develop/kustomize/config/spire-agent.conf
@@ -0,0 +1,33 @@
+agent {
+  data_dir = "/run/spire"
+  log_level = "DEBUG"
+  server_address = "spire-server"
+  server_port = "8081"
+  socket_path = "/run/spire/sockets/agent.sock"
+  trust_bundle_path = "/run/spire/bundle/bundle.crt"
+  trust_domain = "katulu.io"
+}
+
+plugins {
+  NodeAttestor "k8s_psat" {
+    plugin_data {
+      cluster = "local-k8s"
+    }
+  }
+
+  KeyManager "memory" {
+    plugin_data {
+    }
+  }
+
+  WorkloadAttestor "k8s" {
+    plugin_data {
+      skip_kubelet_verification = true
+    }
+  }
+
+  WorkloadAttestor "unix" {
+    plugin_data {
+    }
+  }
+}
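For reference, the `auth` value in internal-registry-credentials.json above is the usual base64-encoded `user:password` pair of a dockerconfigjson secret. With the `registry:registry` credentials it reproduces exactly:

```shell
printf 'registry:registry' | base64
# cmVnaXN0cnk6cmVnaXN0cnk=
```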
diff --git a/develop/kustomize/kustomization.yaml b/develop/kustomize/kustomization.yaml
new file mode 100644
index 0000000..ae6eb08
--- /dev/null
+++ b/develop/kustomize/kustomization.yaml
@@ -0,0 +1,135 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../../components/fl-operator/kustomize/samples
+  - ../../components/fl-operator
+  - ../../kustomize/fl-suite/overlays/standalone
+  - allow-fl-operator-access.yaml
+
+secretGenerator:
+  - name: internal-registry-credentials
+    namespace: katulu-fl
+    type: kubernetes.io/dockerconfigjson
+    files:
+      - .dockerconfigjson=config/internal-registry-credentials.json
+
+  # Used by the fl-operator's pods to pull the internal-registry container images
+  - name: regcred
+    namespace: katulu-fl
+    type: kubernetes.io/dockerconfigjson
+    files:
+      - .dockerconfigjson=config/internal-registry-credentials.json
+
+configMapGenerator:
+  # Kind does not have a cert in the cluster CA bundle that can authenticate the kubelet cert, so we skip the kubelet verification
+  - name: spire-agent
+    namespace: spire
+    behavior: replace
+    files:
+      - agent.conf=config/spire-agent.conf
+
+patches:
+  # In this setup, the FLOperator is running inside the same cluster as the FLOrchestrator, so we can
+  # communicate through the istio-ingressgateway directly. But the SNI is needed to do the routing in Istio.
+  - patch: |-
+      ---
+      apiVersion: fl.katulu.io/v1alpha1
+      kind: FlOperator
+      metadata:
+        name: floperator-sample
+        namespace: katulu-fl
+      spec:
+        orchestrator-url: istio-ingressgateway.istio-system
+        orchestrator-sni: fl-orchestrator.fl-suite
+        orchestrator-port: 443
+
+  # The fl-operator needs an (Istio) sidecar to get access to the fl-operator-envoyproxy running in the katulu-fl ns.
+  # The allow-fl-operator-access (Istio) authorization policy is required to grant the access.
+  - patch: |-
+      ---
+      apiVersion: v1
+      kind: Namespace
+      metadata:
+        name: fl-operator-system
+        labels:
+          istio-injection: enabled
+
+  # Set the container-registry to listen on its own container name: local.fl-suite-control-plane
+  - patch: |-
+      - op: replace
+        path: /spec/hosts
+        value:
+          - local.fl-suite-control-plane
+    target:
+      group: networking.istio.io
+      name: container-registry
+      namespace: container-registry
+      kind: VirtualService
+  - patch: |-
+      - op: replace
+        path: /spec/servers/0/hosts/0
+        value: local.fl-suite-control-plane
+    target:
+      group: networking.istio.io
+      name: container-registry
+      namespace: container-registry
+      kind: Gateway
+  - patch: |-
+      - op: replace
+        path: /spec/configPatches/0/match/routeConfiguration/vhost/name
+        value: local.fl-suite-control-plane:80
+    target:
+      group: networking.istio.io
+      name: remove-kubeflow-authz
+      namespace: istio-system
+      kind: EnvoyFilter
+
+  # Set the fl-orchestrator SNI (VirtualService and Gateway) to fl-orchestrator.fl-suite.
+  # This SNI is accessed via the envoyproxies, so it doesn't need a matching DNS record.
+  - patch: |-
+      - op: replace
+        path: /spec/tls/0/match/0/sniHosts/0
+        value: fl-orchestrator.fl-suite
+      - op: replace
+        path: /spec/hosts/0
+        value: fl-orchestrator.fl-suite
+    target:
+      group: networking.istio.io
+      name: fl-orchestrator-envoyproxy
+      namespace: katulu-fl
+      kind: VirtualService
+  - patch: |-
+      - op: replace
+        path: /spec/servers/0/hosts/0
+        value: fl-orchestrator.fl-suite
+    target:
+      group: networking.istio.io
+      name: fl-orchestrator-envoyproxy
+      namespace: katulu-fl
+      kind: Gateway
+
+  # Expose the istio-ingressgateway on static node-ports (http=30080 and https=30443) to be able to port-forward in kind
+  - patch: |-
+      ---
+      apiVersion: v1
+      kind: Service
+      metadata:
+        name: istio-ingressgateway
+        namespace: istio-system
+      spec:
+        ports:
+          - name: http2
+            port: 80
+            protocol: TCP
+            targetPort: 8080
+            nodePort: 30080
+          - name: https
+            port: 443
+            protocol: TCP
+            targetPort: 8443
+            nodePort: 30443
+
+generatorOptions:
+  disableNameSuffixHash: true
diff --git a/develop/provision.sh b/develop/provision.sh
new file mode 100755
index 0000000..798bf36
--- /dev/null
+++ b/develop/provision.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+set -e
+
+CLUSTER_NAME=local.fl-suite
+kind create cluster --config=kind-config.yaml --kubeconfig "$CLUSTER_NAME.kubeconfig.yaml" || true
+
+# Export kind's internal kubeconfig to be used by the devcontainer.
+# This kubeconfig file is kept hidden because it is only used by the provision script.
+kind export kubeconfig --name "$CLUSTER_NAME" --internal --kubeconfig ".$CLUSTER_NAME-internal.kubeconfig.yaml"
+
+KUBECONFIG="./$CLUSTER_NAME.kubeconfig.yaml"
+# If this script is running inside docker, use the internal kubeconfig
+if grep -q docker /proc/self/cgroup; then
+  KUBECONFIG="./.$CLUSTER_NAME-internal.kubeconfig.yaml"
+fi
+export KUBECONFIG
+
+while ! kustomize build kustomize | kubectl apply -f -; do
+  echo "Retrying to apply resources"
+  sleep 10
+done
diff --git a/develop/teardown.sh b/develop/teardown.sh
new file mode 100755
index 0000000..4947d71
--- /dev/null
+++ b/develop/teardown.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+set -e
+
+CLUSTER_NAME=local.fl-suite
+
+kind delete cluster --name "$CLUSTER_NAME"
+
+rm -f "$CLUSTER_NAME.kubeconfig.yaml" ".$CLUSTER_NAME-internal.kubeconfig.yaml"
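A minimal post-teardown check, assuming the commands run from develop/ (illustrative, not part of this change):

```shell
# The cluster should no longer be listed and the generated kubeconfigs are gone.
kind get clusters
ls local.fl-suite.kubeconfig.yaml .local.fl-suite-internal.kubeconfig.yaml 2>/dev/null
```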