Skip to content

Commit

Permalink
Add eviction trigger plugin
Browse files Browse the repository at this point in the history
  • Loading branch information
Nuckal777 committed May 30, 2024
1 parent 153c005 commit b9db81b
Show file tree
Hide file tree
Showing 15 changed files with 309 additions and 37 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,7 @@ maintenance_config.yaml
# IDEs
.vscode

.DS_Store

# Dev files
static/api/v1/info
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,18 @@ config:
value: the value to set, optional
remove: boolean value, if true the label is removed, if false the label is added or changed, optional
```
__eviction:__ Cordons, uncordons or drains a node
```yaml
config:
action: one of "cordon", "uncordon" or "drain", required
deletionTimeout: how long to wait for pod removal to succeed during drain, optional
evictionTimeout: how long to retry creation of pod evictions for drain, optional
```

## Additional integrations
- Support for [VMware ESX maintenances](esx/README.md)
- Support for [Kubernikus](kubernikus/README.md)
- Support for [Cluster-API](https://github.com/sapcc/runtime-extension-maintenance-controller)
- The maintenance controller exports a bunch of prometheus metrics, but especially
- `maintenance_controller_shuffle_count`: Counts pods in DaemonSets, Deployments and StatefulSets, that were likely shuffled by a node send into maintenance
- `maintenance_controller_shuffles_per_replica`: Count of pods in DaemonSets, Deployments and StatefulSets, that were likely shuffled by a node send into maintenance, divided by the replica count when the event occurred
Expand Down
6 changes: 4 additions & 2 deletions common/schedule.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,11 @@ func EnsureSchedulable(ctx context.Context, k8sClient client.Client, node *corev
}

type DrainParameters struct {
// how long to wait for pods to vanish
AwaitDeletion WaitParameters
Eviction WaitParameters
Client client.Client
// how long to wait for eviction creation to succeed
Eviction WaitParameters
Client client.Client
// for eviction API as that is not callable from client.Client
Clientset kubernetes.Interface
// when set to true and eviction creation fails
Expand Down
44 changes: 23 additions & 21 deletions controllers/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,31 +169,33 @@ func loadPluginChains(config StateDescriptor, registry *plugin.Registry) (state.

// addPluginsToRegistry adds known plugins to the registry.
func addPluginsToRegistry(registry *plugin.Registry) {
addChecker := func(checker plugin.Checker) {
checkers := []plugin.Checker{
&impl.Affinity{},
&impl.AnyLabel{},
&impl.ClusterSemver{},
&impl.Condition{},
&impl.HasAnnotation{},
&impl.HasLabel{},
&impl.KubernikusCount{},
&impl.MaxMaintenance{},
&impl.NodeCount{},
&impl.PrometheusInstant{},
&impl.Stagger{},
&impl.TimeWindow{},
&impl.Wait{},
&impl.WaitExclude{},
}
for _, checker := range checkers {
registry.CheckPlugins[checker.ID()] = checker
}
addChecker(&impl.Affinity{})
addChecker(&impl.AnyLabel{})
addChecker(&impl.ClusterSemver{})
addChecker(&impl.Condition{})
addChecker(&impl.HasAnnotation{})
addChecker(&impl.HasLabel{})
addChecker(&impl.KubernikusCount{})
addChecker(&impl.MaxMaintenance{})
addChecker(&impl.NodeCount{})
addChecker(&impl.PrometheusInstant{})
addChecker(&impl.Stagger{})
addChecker(&impl.TimeWindow{})
addChecker(&impl.Wait{})
addChecker(&impl.WaitExclude{})

addNotifier := func(notifier plugin.Notifier) {
notifiers := []plugin.Notifier{&impl.Mail{}, &impl.SlackThread{}, &impl.SlackWebhook{}}
for _, notifier := range notifiers {
registry.NotificationPlugins[notifier.ID()] = notifier
}
addNotifier(&impl.Mail{})
addNotifier(&impl.SlackWebhook{})
addNotifier(&impl.SlackThread{})

registry.TriggerPlugins["alterAnnotation"] = &impl.AlterAnnotation{}
registry.TriggerPlugins["alterLabel"] = &impl.AlterLabel{}
triggers := []plugin.Trigger{&impl.AlterAnnotation{}, &impl.AlterLabel{}, &impl.Eviction{}}
for _, trigger := range triggers {
registry.TriggerPlugins[trigger.ID()] = trigger
}
}
4 changes: 4 additions & 0 deletions controllers/node_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -47,6 +48,7 @@ import (
// NodeReconciler reconciles a Node object.
type NodeReconciler struct {
client.Client
Clientset *kubernetes.Clientset
Log logr.Logger
Scheme *runtime.Scheme
Recorder record.EventRecorder
Expand All @@ -55,6 +57,7 @@ type NodeReconciler struct {

type reconcileParameters struct {
client client.Client
clientset kubernetes.Interface
config *Config
log logr.Logger
recorder record.EventRecorder
Expand Down Expand Up @@ -127,6 +130,7 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
func (r *NodeReconciler) makeParams(config *Config, node *corev1.Node) reconcileParameters {
return reconcileParameters{
client: r.Client,
clientset: r.Clientset,
config: config,
log: r.Log.WithValues("node", types.NamespacedName{Name: node.Name, Namespace: node.Namespace}),
node: node,
Expand Down
83 changes: 83 additions & 0 deletions controllers/node_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/sapcc/maintenance-controller/api"
Expand Down Expand Up @@ -1118,3 +1119,85 @@ var _ = Describe("The AnyLabel plugin", func() {
})

})

var _ = Describe("The eviction plugin", func() {

var node *corev1.Node
var pod *corev1.Pod

BeforeEach(func() {
node = &corev1.Node{}
node.Name = "evict-node"
Expect(k8sClient.Create(context.Background(), node)).To(Succeed())

pod = &corev1.Pod{}
pod.Name = "evict-pod"
pod.Namespace = metav1.NamespaceDefault
pod.Spec.NodeName = node.Name
pod.Spec.Containers = []corev1.Container{
{
Name: "nginx",
Image: "nginx",
},
}
Expect(k8sClient.Create(context.Background(), pod)).To(Succeed())
})

AfterEach(func() {
Expect(k8sClient.Delete(
context.Background(),
pod,
&client.DeleteOptions{GracePeriodSeconds: ptr.To(int64(0))},
)).To(Succeed())
Eventually(func(g Gomega) []corev1.Pod {
pods, err := k8sClientset.CoreV1().Pods(metav1.NamespaceDefault).List(context.Background(), metav1.ListOptions{})
g.Expect(err).To(Succeed())
return pods.Items
}).Should(BeEmpty())
Expect(k8sClient.Delete(context.Background(), node)).To(Succeed())
})

It("should mark a node as unschedulable with cordon action", func(ctx SpecContext) {
eviction := impl.Eviction{Action: impl.Cordon}
err := eviction.Trigger(plugin.Parameters{Ctx: ctx, Client: k8sClient, Node: node})
Expect(err).To(Succeed())
Eventually(func(g Gomega) bool {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(node), node)
g.Expect(err).To(Succeed())
return node.Spec.Unschedulable
}).Should(BeTrue())
})

It("should mark a node as schedulable with uncordon action", func(ctx SpecContext) {
originalNode := node.DeepCopy()
node.Spec.Unschedulable = true
Expect(k8sClient.Patch(ctx, node, client.MergeFrom(originalNode))).To(Succeed())

eviction := impl.Eviction{Action: impl.Uncordon}
err := eviction.Trigger(plugin.Parameters{Ctx: ctx, Client: k8sClient, Node: node})
Expect(err).To(Succeed())
Eventually(func(g Gomega) bool {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(node), node)
g.Expect(err).To(Succeed())
return node.Spec.Unschedulable
}).Should(BeFalse())
})

It("should evict pods with the drain action", func(ctx SpecContext) {
eviction := impl.Eviction{Action: impl.Drain, DeletionTimeout: time.Second, EvictionTimeout: time.Minute}
params := plugin.Parameters{Ctx: ctx, Client: k8sClient, Clientset: k8sClientset, Node: node, Log: GinkgoLogr}
err := eviction.Trigger(params)
Expect(err).To(HaveOccurred()) // awaiting the pod deletions fails because there is no kubelet running
Eventually(func(g Gomega) bool {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(node), node)
g.Expect(err).To(Succeed())
return node.Spec.Unschedulable
}).Should(BeTrue())
Eventually(func(g Gomega) *metav1.Time {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(pod), pod)
g.Expect(err).To(Succeed())
return pod.DeletionTimestamp
}).ShouldNot(BeNil())
})

})
4 changes: 2 additions & 2 deletions controllers/node_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ func ApplyProfiles(ctx context.Context, params reconcileParameters, data *state.
logDetails = true
}
// build plugin arguments
pluginParams := plugin.Parameters{Client: params.client, Ctx: ctx, Log: params.log,
Profile: ps.Profile.Name, Node: params.node, InMaintenance: anyInMaintenance(profileStates),
pluginParams := plugin.Parameters{Client: params.client, Clientset: params.clientset, Ctx: ctx,
Log: params.log, Profile: ps.Profile.Name, Node: params.node, InMaintenance: anyInMaintenance(profileStates),
State: string(ps.State), LastTransition: data.Profiles[ps.Profile.Name].Transition,
Recorder: params.recorder, LogDetails: logDetails}

Expand Down
22 changes: 15 additions & 7 deletions controllers/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
ctrl "sigs.k8s.io/controller-runtime"
Expand Down Expand Up @@ -118,12 +119,15 @@ profiles:
next: in-maintenance
`

var cfg *rest.Config
var k8sClient client.Client
var k8sManager ctrl.Manager
var testEnv *envtest.Environment
var stopController context.CancelFunc
var nodeInfoCache cache.NodeInfoCache
var (
cfg *rest.Config
k8sClient client.Client
k8sClientset kubernetes.Interface
k8sManager ctrl.Manager
testEnv *envtest.Environment
stopController context.CancelFunc
nodeInfoCache cache.NodeInfoCache
)

func TestAPIs(t *testing.T) {
RegisterFailHandler(Fail)
Expand Down Expand Up @@ -191,8 +195,12 @@ var _ = BeforeSuite(func() {
Expect(err).ToNot(HaveOccurred())
}()

k8sClientset, err = kubernetes.NewForConfig(k8sManager.GetConfig())
Expect(err).To(Succeed())
Expect(k8sClientset).ToNot(BeNil())

k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
Expect(err).ToNot(HaveOccurred())
Expect(err).To(Succeed())
Expect(k8sClient).ToNot(BeNil())

err = os.MkdirAll("./config", 0700)
Expand Down
5 changes: 3 additions & 2 deletions e2e/cluster.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
"name": "maintenance-con-e2e",
"spec": {
"openstack": {
"routerID": "c7a8528b-0c86-4cb7-b6e2-c258d60bed47"
"routerID": "c7a8528b-0c86-4cb7-b6e2-c258d60bed47",
"lbSubnetID": "024209a3-9134-4cd0-841d-9429339f1ae7"
},
"backup": "off",
"version": "1.28.8",
"version": "1.28.10",
"serviceCIDR": "198.18.192.0/18",
"clusterCIDR": "100.100.0.0/17",
"nodePools": [
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ require (
github.com/go-logr/logr v1.4.2
github.com/gophercloud/gophercloud v1.11.0
github.com/gophercloud/utils v0.0.0-20231010081019-80377eca5d56
github.com/onsi/ginkgo/v2 v2.18.0
github.com/onsi/ginkgo/v2 v2.19.0
github.com/onsi/gomega v1.33.1
github.com/prometheus/client_golang v1.19.1
github.com/prometheus/common v0.53.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8m
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
github.com/onsi/ginkgo/v2 v2.18.0 h1:W9Y7IWXxPUpAit9ieMOLI7PJZGaW22DTKgiVAuhDTLc=
github.com/onsi/ginkgo/v2 v2.18.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To=
github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA=
github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To=
github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk=
github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
Expand Down
2 changes: 2 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
// to ensure that exec-entrypoint and run can make use of them.
"go.uber.org/zap/zapcore"
v1 "k8s.io/api/core/v1"
"k8s.io/client-go/kubernetes"
_ "k8s.io/client-go/plugin/pkg/client/auth"
"k8s.io/client-go/rest"

Expand Down Expand Up @@ -163,6 +164,7 @@ func setupReconcilers(mgr manager.Manager, cfg *reconcilerConfig) error {
nodeInfoCache := cache.NewNodeInfoCache()
if err := (&controllers.NodeReconciler{
Client: mgr.GetClient(),
Clientset: kubernetes.NewForConfigOrDie(mgr.GetConfig()),
Log: ctrl.Log.WithName("controllers").WithName("maintenance"),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("maintenance"),
Expand Down
Loading

0 comments on commit b9db81b

Please sign in to comment.