diff --git a/.gitignore b/.gitignore index ffbc882..b4efc61 100644 --- a/.gitignore +++ b/.gitignore @@ -21,5 +21,7 @@ maintenance_config.yaml # IDEs .vscode +.DS_Store + # Dev files static/api/v1/info diff --git a/README.md b/README.md index 12a6a33..b9a63b5 100644 --- a/README.md +++ b/README.md @@ -336,10 +336,18 @@ config: value: the value to set, optional remove: boolean value, if true the label is removed, if false the label is added or changed, optional ``` +__eviction:__ Cordons, uncordons or drains a node +```yaml +config: + action: one of "cordon", "uncordon" or "drain", required + deletionTimeout: how long to wait for pod removal to succeed during drain, optional + evictionTimeout: how long to retry creation of pod evictions for drain, optional +``` ## Additional integrations - Support for [VMware ESX maintenances](esx/README.md) - Support for [Kubernikus](kubernikus/README.md) +- Support for [Cluster-API](https://github.com/sapcc/runtime-extension-maintenance-controller) - The maintenance controller exports a bunch of prometheus metrics, but especially - `maintenance_controller_shuffle_count`: Counts pods in DaemonSets, Deployments and StatefulSets, that were likely shuffled by a node send into maintenance - `maintenance_controller_shuffles_per_replica`: Count of pods in DaemonSets, Deployments and StatefulSets, that were likely shuffled by a node send into maintenance, divided by the replica count when the event occurred diff --git a/common/schedule.go b/common/schedule.go index 3ad6cbf..f0e0366 100644 --- a/common/schedule.go +++ b/common/schedule.go @@ -72,9 +72,11 @@ func EnsureSchedulable(ctx context.Context, k8sClient client.Client, node *corev } type DrainParameters struct { + // how long to wait for pods to vanish AwaitDeletion WaitParameters - Eviction WaitParameters - Client client.Client + // how long to wait for eviction creation to succeed + Eviction WaitParameters + Client client.Client // for eviction API as that is not callable from client.Client Clientset kubernetes.Interface // when set to true and eviction creation fails diff --git a/controllers/config.go b/controllers/config.go index a6462c6..4ecc9fb 100644 --- a/controllers/config.go +++ b/controllers/config.go @@ -169,31 +169,33 @@ func loadPluginChains(config StateDescriptor, registry *plugin.Registry) (state. // addPluginsToRegistry adds known plugins to the registry. func addPluginsToRegistry(registry *plugin.Registry) { - addChecker := func(checker plugin.Checker) { + checkers := []plugin.Checker{ + &impl.Affinity{}, + &impl.AnyLabel{}, + &impl.ClusterSemver{}, + &impl.Condition{}, + &impl.HasAnnotation{}, + &impl.HasLabel{}, + &impl.KubernikusCount{}, + &impl.MaxMaintenance{}, + &impl.NodeCount{}, + &impl.PrometheusInstant{}, + &impl.Stagger{}, + &impl.TimeWindow{}, + &impl.Wait{}, + &impl.WaitExclude{}, + } + for _, checker := range checkers { registry.CheckPlugins[checker.ID()] = checker } - addChecker(&impl.Affinity{}) - addChecker(&impl.AnyLabel{}) - addChecker(&impl.ClusterSemver{}) - addChecker(&impl.Condition{}) - addChecker(&impl.HasAnnotation{}) - addChecker(&impl.HasLabel{}) - addChecker(&impl.KubernikusCount{}) - addChecker(&impl.MaxMaintenance{}) - addChecker(&impl.NodeCount{}) - addChecker(&impl.PrometheusInstant{}) - addChecker(&impl.Stagger{}) - addChecker(&impl.TimeWindow{}) - addChecker(&impl.Wait{}) - addChecker(&impl.WaitExclude{}) - addNotifier := func(notifier plugin.Notifier) { + notifiers := []plugin.Notifier{&impl.Mail{}, &impl.SlackThread{}, &impl.SlackWebhook{}} + for _, notifier := range notifiers { registry.NotificationPlugins[notifier.ID()] = notifier } - addNotifier(&impl.Mail{}) - addNotifier(&impl.SlackWebhook{}) - addNotifier(&impl.SlackThread{}) - registry.TriggerPlugins["alterAnnotation"] = &impl.AlterAnnotation{} - registry.TriggerPlugins["alterLabel"] = &impl.AlterLabel{} + triggers := []plugin.Trigger{&impl.AlterAnnotation{}, &impl.AlterLabel{}, &impl.Eviction{}} + for _, trigger := range triggers { + registry.TriggerPlugins[trigger.ID()] = trigger + } } diff --git a/controllers/node_controller.go b/controllers/node_controller.go index 8d0083d..6bbdb7c 100644 --- a/controllers/node_controller.go +++ b/controllers/node_controller.go @@ -35,6 +35,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -47,6 +48,7 @@ import ( // NodeReconciler reconciles a Node object. type NodeReconciler struct { client.Client + Clientset *kubernetes.Clientset Log logr.Logger Scheme *runtime.Scheme Recorder record.EventRecorder @@ -55,6 +57,7 @@ type NodeReconciler struct { type reconcileParameters struct { client client.Client + clientset kubernetes.Interface config *Config log logr.Logger recorder record.EventRecorder @@ -127,6 +130,7 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl. func (r *NodeReconciler) makeParams(config *Config, node *corev1.Node) reconcileParameters { return reconcileParameters{ client: r.Client, + clientset: r.Clientset, config: config, log: r.Log.WithValues("node", types.NamespacedName{Name: node.Name, Namespace: node.Namespace}), node: node, diff --git a/controllers/node_controller_test.go b/controllers/node_controller_test.go index 1cc768d..ada69c4 100644 --- a/controllers/node_controller_test.go +++ b/controllers/node_controller_test.go @@ -37,6 +37,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/sapcc/maintenance-controller/api" @@ -1118,3 +1119,85 @@ var _ = Describe("The AnyLabel plugin", func() { }) }) + +var _ = Describe("The eviction plugin", func() { + + var node *corev1.Node + var pod *corev1.Pod + + BeforeEach(func() { + node = &corev1.Node{} + node.Name = "evict-node" + Expect(k8sClient.Create(context.Background(), node)).To(Succeed()) + + pod = &corev1.Pod{} + pod.Name = "evict-pod" + pod.Namespace = metav1.NamespaceDefault + pod.Spec.NodeName = node.Name + pod.Spec.Containers = []corev1.Container{ + { + Name: "nginx", + Image: "nginx", + }, + } + Expect(k8sClient.Create(context.Background(), pod)).To(Succeed()) + }) + + AfterEach(func() { + Expect(k8sClient.Delete( + context.Background(), + pod, + &client.DeleteOptions{GracePeriodSeconds: ptr.To(int64(0))}, + )).To(Succeed()) + Eventually(func(g Gomega) []corev1.Pod { + pods, err := k8sClientset.CoreV1().Pods(metav1.NamespaceDefault).List(context.Background(), metav1.ListOptions{}) + g.Expect(err).To(Succeed()) + return pods.Items + }).Should(BeEmpty()) + Expect(k8sClient.Delete(context.Background(), node)).To(Succeed()) + }) + + It("should mark a node as unschedulable with cordon action", func(ctx SpecContext) { + eviction := impl.Eviction{Action: impl.Cordon} + err := eviction.Trigger(plugin.Parameters{Ctx: ctx, Client: k8sClient, Node: node}) + Expect(err).To(Succeed()) + Eventually(func(g Gomega) bool { + err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(node), node) + g.Expect(err).To(Succeed()) + return node.Spec.Unschedulable + }).Should(BeTrue()) + }) + + It("should mark a node as schedulable with uncordon action", func(ctx SpecContext) { + originalNode := node.DeepCopy() + node.Spec.Unschedulable = true + Expect(k8sClient.Patch(ctx, node, client.MergeFrom(originalNode))).To(Succeed()) + + eviction := impl.Eviction{Action: impl.Uncordon} + err := eviction.Trigger(plugin.Parameters{Ctx: ctx, Client: k8sClient, Node: node}) + Expect(err).To(Succeed()) + Eventually(func(g Gomega) bool { + err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(node), node) + g.Expect(err).To(Succeed()) + return node.Spec.Unschedulable + }).Should(BeFalse()) + }) + + It("should evict pods with the drain action", func(ctx SpecContext) { + eviction := impl.Eviction{Action: impl.Drain, DeletionTimeout: time.Second, EvictionTimeout: time.Minute} + params := plugin.Parameters{Ctx: ctx, Client: k8sClient, Clientset: k8sClientset, Node: node, Log: GinkgoLogr} + err := eviction.Trigger(params) + Expect(err).To(HaveOccurred()) // awaiting the pod deletions fails because there is no kubelet running + Eventually(func(g Gomega) bool { + err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(node), node) + g.Expect(err).To(Succeed()) + return node.Spec.Unschedulable + }).Should(BeTrue()) + Eventually(func(g Gomega) *metav1.Time { + err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(pod), pod) + g.Expect(err).To(Succeed()) + return pod.DeletionTimestamp + }).ShouldNot(BeNil()) + }) + +}) diff --git a/controllers/node_handler.go b/controllers/node_handler.go index e44c82b..ff80fe8 100644 --- a/controllers/node_handler.go +++ b/controllers/node_handler.go @@ -85,8 +85,8 @@ func ApplyProfiles(ctx context.Context, params reconcileParameters, data *state. logDetails = true } // build plugin arguments - pluginParams := plugin.Parameters{Client: params.client, Ctx: ctx, Log: params.log, - Profile: ps.Profile.Name, Node: params.node, InMaintenance: anyInMaintenance(profileStates), + pluginParams := plugin.Parameters{Client: params.client, Clientset: params.clientset, Ctx: ctx, + Log: params.log, Profile: ps.Profile.Name, Node: params.node, InMaintenance: anyInMaintenance(profileStates), State: string(ps.State), LastTransition: data.Profiles[ps.Profile.Name].Transition, Recorder: params.recorder, LogDetails: logDetails} diff --git a/controllers/suite_test.go b/controllers/suite_test.go index af14b16..899643b 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -29,6 +29,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" + "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" ctrl "sigs.k8s.io/controller-runtime" @@ -118,12 +119,15 @@ profiles: next: in-maintenance ` -var cfg *rest.Config -var k8sClient client.Client -var k8sManager ctrl.Manager -var testEnv *envtest.Environment -var stopController context.CancelFunc -var nodeInfoCache cache.NodeInfoCache +var ( + cfg *rest.Config + k8sClient client.Client + k8sClientset kubernetes.Interface + k8sManager ctrl.Manager + testEnv *envtest.Environment + stopController context.CancelFunc + nodeInfoCache cache.NodeInfoCache +) func TestAPIs(t *testing.T) { RegisterFailHandler(Fail) @@ -191,8 +195,12 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) }() + k8sClientset, err = kubernetes.NewForConfig(k8sManager.GetConfig()) + Expect(err).To(Succeed()) + Expect(k8sClientset).ToNot(BeNil()) + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) - Expect(err).ToNot(HaveOccurred()) + Expect(err).To(Succeed()) Expect(k8sClient).ToNot(BeNil()) err = os.MkdirAll("./config", 0700) diff --git a/e2e/cluster.json b/e2e/cluster.json index 523930f..d00e1c8 100644 --- a/e2e/cluster.json +++ b/e2e/cluster.json @@ -2,10 +2,11 @@ "name": "maintenance-con-e2e", "spec": { "openstack": { - "routerID": "c7a8528b-0c86-4cb7-b6e2-c258d60bed47" + "routerID": "c7a8528b-0c86-4cb7-b6e2-c258d60bed47", + "lbSubnetID": "024209a3-9134-4cd0-841d-9429339f1ae7" }, "backup": "off", - "version": "1.28.8", + "version": "1.28.10", "serviceCIDR": "198.18.192.0/18", "clusterCIDR": "100.100.0.0/17", "nodePools": [ diff --git a/go.mod b/go.mod index 859baa0..acf4488 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/go-logr/logr v1.4.2 github.com/gophercloud/gophercloud v1.11.0 github.com/gophercloud/utils v0.0.0-20231010081019-80377eca5d56 - github.com/onsi/ginkgo/v2 v2.18.0 + github.com/onsi/ginkgo/v2 v2.19.0 github.com/onsi/gomega v1.33.1 github.com/prometheus/client_golang v1.19.1 github.com/prometheus/common v0.53.0 diff --git a/go.sum b/go.sum index 8889e24..74b3e8b 100644 --- a/go.sum +++ b/go.sum @@ -97,8 +97,8 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8m github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/onsi/ginkgo/v2 v2.18.0 h1:W9Y7IWXxPUpAit9ieMOLI7PJZGaW22DTKgiVAuhDTLc= -github.com/onsi/ginkgo/v2 v2.18.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To= +github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA= +github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To= github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= diff --git a/main.go b/main.go index fa7e1a1..b7ee657 100644 --- a/main.go +++ b/main.go @@ -28,6 +28,7 @@ import ( // to ensure that exec-entrypoint and run can make use of them. "go.uber.org/zap/zapcore" v1 "k8s.io/api/core/v1" + "k8s.io/client-go/kubernetes" _ "k8s.io/client-go/plugin/pkg/client/auth" "k8s.io/client-go/rest" @@ -163,6 +164,7 @@ func setupReconcilers(mgr manager.Manager, cfg *reconcilerConfig) error { nodeInfoCache := cache.NewNodeInfoCache() if err := (&controllers.NodeReconciler{ Client: mgr.GetClient(), + Clientset: kubernetes.NewForConfigOrDie(mgr.GetConfig()), Log: ctrl.Log.WithName("controllers").WithName("maintenance"), Scheme: mgr.GetScheme(), Recorder: mgr.GetEventRecorderFor("maintenance"), diff --git a/plugin/impl/eviction.go b/plugin/impl/eviction.go new file mode 100644 index 0000000..3c1d4db --- /dev/null +++ b/plugin/impl/eviction.go @@ -0,0 +1,102 @@ +/******************************************************************************* +* +* Copyright 2020 SAP SE +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You should have received a copy of the License along with this +* program. If not, you may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +*******************************************************************************/ + +package impl + +import ( + "fmt" + "slices" + "time" + + "github.com/sapcc/ucfgwrap" + + "github.com/sapcc/maintenance-controller/common" + "github.com/sapcc/maintenance-controller/plugin" +) + +const ( + Drain EvictionAction = "drain" + Cordon EvictionAction = "cordon" + Uncordon EvictionAction = "uncordon" + + defaultPeriod time.Duration = 5 * time.Second +) + +var validActions = []EvictionAction{Drain, Cordon, Uncordon} + +type EvictionAction string + +type Eviction struct { + Action EvictionAction + DeletionTimeout time.Duration + EvictionTimeout time.Duration +} + +func (e *Eviction) New(config *ucfgwrap.Config) (plugin.Trigger, error) { + conf := struct { + Action string `config:"action" validate:"required"` + DeletionTimeout time.Duration `config:"deletionTimeout"` + EvictionTimeout time.Duration `config:"evictionTimeout"` + }{ + DeletionTimeout: 10 * time.Minute, + EvictionTimeout: 10 * time.Minute, + } + if err := config.Unpack(&conf); err != nil { + return nil, err + } + if !slices.Contains(validActions, EvictionAction(conf.Action)) { + return nil, fmt.Errorf("got invalid eviction action: %s", conf.Action) + } + return &Eviction{ + Action: EvictionAction(conf.Action), + DeletionTimeout: conf.DeletionTimeout, + EvictionTimeout: conf.EvictionTimeout, + }, nil +} + +func (e *Eviction) ID() string { + return "eviction" +} + +func (e *Eviction) Trigger(params plugin.Parameters) error { + switch e.Action { + case Cordon: + return common.EnsureSchedulable(params.Ctx, params.Client, params.Node, false) + case Uncordon: + return common.EnsureSchedulable(params.Ctx, params.Client, params.Node, true) + case Drain: + if err := common.EnsureSchedulable(params.Ctx, params.Client, params.Node, false); err != nil { + return err + } + return common.EnsureDrain(params.Ctx, params.Node, params.Log, common.DrainParameters{ + AwaitDeletion: common.WaitParameters{ + Period: defaultPeriod, + Timeout: e.DeletionTimeout, + }, + Eviction: common.WaitParameters{ + Period: defaultPeriod, + Timeout: e.EvictionTimeout, + }, + Client: params.Client, + Clientset: params.Clientset, + ForceEviction: false, + }) + } + return fmt.Errorf("invalid eviction action: %s", e.Action) +} diff --git a/plugin/impl/eviction_test.go b/plugin/impl/eviction_test.go new file mode 100644 index 0000000..1225f9c --- /dev/null +++ b/plugin/impl/eviction_test.go @@ -0,0 +1,56 @@ +/******************************************************************************* +* +* Copyright 2020 SAP SE +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You should have received a copy of the License along with this +* program. If not, you may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +*******************************************************************************/ + +package impl + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/sapcc/ucfgwrap" +) + +var _ = Describe("The eviction plugin", func() { + + It("has default timeout values", func() { + config, err := ucfgwrap.FromYAML([]byte("action: drain")) + Expect(err).To(Succeed()) + + var base Eviction + plugin, err := base.New(&config) + Expect(err).To(Succeed()) + Expect(plugin.(*Eviction).DeletionTimeout).To(Equal(10 * time.Minute)) + Expect(plugin.(*Eviction).EvictionTimeout).To(Equal(10 * time.Minute)) + }) + + It("can parse it's configuration", func() { + configStr := "action: drain\ndeletionTimeout: 11m\nevictionTimeout: 532ms" + config, err := ucfgwrap.FromYAML([]byte(configStr)) + Expect(err).To(Succeed()) + + var base Eviction + plugin, err := base.New(&config) + Expect(err).To(Succeed()) + Expect(plugin.(*Eviction).Action).To(Equal(Drain)) + Expect(plugin.(*Eviction).DeletionTimeout).To(Equal(11 * time.Minute)) + Expect(plugin.(*Eviction).EvictionTimeout).To(Equal(532 * time.Millisecond)) + }) + +}) diff --git a/plugin/plugin.go b/plugin/plugin.go index 051e791..128f6ab 100644 --- a/plugin/plugin.go +++ b/plugin/plugin.go @@ -31,6 +31,7 @@ import ( "github.com/go-logr/logr" "github.com/sapcc/ucfgwrap" corev1 "k8s.io/api/core/v1" + "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -96,6 +97,7 @@ type Parameters struct { // whether to log failing checks, notifications, ... LogDetails bool Client client.Client + Clientset kubernetes.Interface Ctx context.Context //nolint: containedctx Log logr.Logger Recorder record.EventRecorder