From ac46f9f20c9ae5f35d41677414c41d2cf52d4bf7 Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Fri, 6 Sep 2024 16:49:36 +0200 Subject: [PATCH] Fix drain log for unreachable Nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stefan Büringer buringerst@vmware.com --- internal/controllers/machine/machine_controller.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go index 80d054471a6d..c4daa59d1ae2 100644 --- a/internal/controllers/machine/machine_controller.go +++ b/internal/controllers/machine/machine_controller.go @@ -686,7 +686,15 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster, // Override the grace period of pods to reduce the time needed to skip them. drainer.GracePeriodSeconds = 1 - log.V(3).Info("Node is unreachable, draining will use 1s GracePeriodSeconds and will ignore all Pods that have a deletionTimestamp > 1s old. PDBs are still honored.") + // Our drain code still respects PDBs when evicting Pods, but that does not mean they are respected + // in general by the entire system. + // When a Node becomes unreachable the following happens: + // * node.kubernetes.io/unreachable:NoExecute taint is set on the Node + // * taint manager will evict Pods immediately because of the NoExecute taint (without respecting PDBs) + // * https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/#concepts + // "NoExecute": "Pods that do not tolerate the taint are evicted immediately" + // * our drain code will now ignore the Pods (as they quickly have a deletionTimestamp older than 2 seconds) + log.V(3).Info("Node is unreachable, draining will use 1s GracePeriodSeconds and will ignore all Pods that have a deletionTimestamp > 1s old") } if err := drainer.CordonNode(ctx, node); err != nil {