🐛 Machine Controller should try to retrieve node on delete
Consider this scenario:

* Machine is created
* InfraMachine is created
* MachineHealthCheck is set to 10 minutes
* [10 minutes pass]
* Machine is marked as needing remediation
* Machine and InfraMachine go into the deletion flow
* Node finally joins the cluster, say 10 minutes + a few seconds later
* InfraMachine is still waiting for VM to be deleted
* Machine keeps retrying to delete, but `nodeRef` was never set
* Machine and InfraMachine go away (finalizers removed)
* Node object sticks around in the cluster

This changeset allows the Machine controller to look into the cluster
during the deletion flow, in case the Node was eventually created before
the infrastructure provider was able to fully delete the machine.

Signed-off-by: Vince Prignano <vince@prigna.com>
vincepri committed Aug 12, 2024
1 parent 8f31a86 commit 9517bba
Showing 2 changed files with 150 additions and 1 deletion.
27 changes: 26 additions & 1 deletion internal/controllers/machine/machine_controller.go
@@ -562,8 +562,33 @@ func (r *Reconciler) isDeleteNodeAllowed(ctx context.Context, cluster *clusterv1
		return errClusterIsBeingDeleted
	}

	if machine.Status.NodeRef == nil && machine.Spec.ProviderID != nil {
		// If we don't have a node reference, but a provider id has been set,
		// try to retrieve the node one more time.
		//
		// NOTE: The following is a best-effort attempt to retrieve the node,
		// errors are logged but not returned to ensure machines are deleted
		// even if the node cannot be retrieved.
		remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
		if err != nil {
			log.Error(err, "Failed to get cluster client while deleting Machine and checking for nodes")
		} else {
			node, err := r.getNode(ctx, remoteClient, *machine.Spec.ProviderID)
			if err != nil && err != ErrNodeNotFound {
				log.Error(err, "Failed to get node while deleting Machine")
			} else if err == nil {
				machine.Status.NodeRef = &corev1.ObjectReference{
					APIVersion: corev1.SchemeGroupVersion.String(),
					Kind:       "Node",
					Name:       node.Name,
					UID:        node.UID,
				}
			}
		}
	}

	if machine.Status.NodeRef == nil {
		// Cannot delete something that doesn't exist.
		return errNilNodeRef
	}
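
For context, the `getNode` helper used above is not part of this diff. It resolves a Node by provider ID and returns `ErrNodeNotFound` when no match exists, which is why the code above treats that error as non-fatal. A minimal sketch of such a lookup, assuming a cache index registered under `index.NodeProviderIDField` (names and error wrapping here are illustrative, not the repository's exact implementation):

// getNodeByProviderID sketches a provider-ID based Node lookup against an
// indexed client; a hypothetical stand-in for the controller's getNode helper.
// Assumes imports: context, fmt, corev1, sigs.k8s.io/controller-runtime/pkg/client, and the index package.
func getNodeByProviderID(ctx context.Context, c client.Reader, providerID string) (*corev1.Node, error) {
	nodeList := corev1.NodeList{}
	if err := c.List(ctx, &nodeList, client.MatchingFields{index.NodeProviderIDField: providerID}); err != nil {
		return nil, err
	}
	if len(nodeList.Items) == 0 {
		// Reported as ErrNodeNotFound so callers can tell "no node yet"
		// apart from a real API error.
		return nil, ErrNodeNotFound
	}
	if len(nodeList.Items) != 1 {
		return nil, fmt.Errorf("found %d Nodes with provider id %q, expected at most 1", len(nodeList.Items), providerID)
	}
	return &nodeList.Items[0], nil
}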

124 changes: 124 additions & 0 deletions internal/controllers/machine/machine_controller_test.go
@@ -2522,6 +2522,130 @@ func TestNodeDeletion(t *testing.T) {
	}
}

func TestNodeDeletionWithoutNodeRefFallback(t *testing.T) {
	g := NewWithT(t)

	deletionTime := metav1.Now().Add(-1 * time.Second)

	testCluster := clusterv1.Cluster{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-cluster",
			Namespace: metav1.NamespaceDefault,
		},
	}

	node := &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name: "test",
		},
		Spec: corev1.NodeSpec{ProviderID: "test://id-1"},
	}

	testMachine := clusterv1.Machine{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test",
			Namespace: metav1.NamespaceDefault,
			Labels: map[string]string{
				clusterv1.MachineControlPlaneLabel: "",
			},
			Annotations: map[string]string{
				"machine.cluster.x-k8s.io/exclude-node-draining": "",
			},
			Finalizers:        []string{clusterv1.MachineFinalizer},
			DeletionTimestamp: &metav1.Time{Time: deletionTime},
		},
		Spec: clusterv1.MachineSpec{
			ClusterName: "test-cluster",
			InfrastructureRef: corev1.ObjectReference{
				APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1",
				Kind:       "GenericInfrastructureMachine",
				Name:       "infra-config1",
			},
			Bootstrap:  clusterv1.Bootstrap{DataSecretName: ptr.To("data")},
			ProviderID: ptr.To("test://id-1"),
		},
	}

	cpmachine1 := &clusterv1.Machine{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "cp1",
			Namespace: metav1.NamespaceDefault,
			Labels: map[string]string{
				clusterv1.ClusterNameLabel:         "test-cluster",
				clusterv1.MachineControlPlaneLabel: "",
			},
			Finalizers: []string{clusterv1.MachineFinalizer},
		},
		Spec: clusterv1.MachineSpec{
			ClusterName:       "test-cluster",
			InfrastructureRef: corev1.ObjectReference{},
			Bootstrap:         clusterv1.Bootstrap{DataSecretName: ptr.To("data")},
		},
		Status: clusterv1.MachineStatus{
			NodeRef: &corev1.ObjectReference{
				Name: "cp1",
			},
		},
	}

	testCases := []struct {
		name               string
		deletionTimeout    *metav1.Duration
		resultErr          bool
		clusterDeleted     bool
		expectNodeDeletion bool
		createFakeClient   func(...client.Object) client.Client
	}{
		{
			name:               "should return no error when the node exists and matches the provider id",
			deletionTimeout:    &metav1.Duration{Duration: time.Second},
			resultErr:          false,
			expectNodeDeletion: true,
			createFakeClient: func(initObjs ...client.Object) client.Client {
				return fake.NewClientBuilder().
					WithObjects(initObjs...).
					WithIndex(&corev1.Node{}, index.NodeProviderIDField, index.NodeByProviderID).
					WithStatusSubresource(&clusterv1.Machine{}).
					Build()
			},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(*testing.T) {
			m := testMachine.DeepCopy()
			m.Spec.NodeDeletionTimeout = tc.deletionTimeout

			fakeClient := tc.createFakeClient(node, m, cpmachine1)
			tracker := remote.NewTestClusterCacheTracker(ctrl.Log, fakeClient, fakeClient, fakeScheme, client.ObjectKeyFromObject(&testCluster))

			r := &Reconciler{
				Client:                   fakeClient,
				Tracker:                  tracker,
				recorder:                 record.NewFakeRecorder(10),
				nodeDeletionRetryTimeout: 10 * time.Millisecond,
			}

			cluster := testCluster.DeepCopy()
			if tc.clusterDeleted {
				cluster.DeletionTimestamp = &metav1.Time{Time: deletionTime.Add(time.Hour)}
			}

			_, err := r.reconcileDelete(context.Background(), cluster, m)

			if tc.resultErr {
				g.Expect(err).To(HaveOccurred())
			} else {
				g.Expect(err).ToNot(HaveOccurred())
				if tc.expectNodeDeletion {
					n := &corev1.Node{}
					g.Expect(fakeClient.Get(context.Background(), client.ObjectKeyFromObject(node), n)).NotTo(Succeed())
				}
			}
		})
	}
}

// adds a condition list to an external object.
func addConditionsToExternal(u *unstructured.Unstructured, newConditions clusterv1.Conditions) {
	existingConditions := clusterv1.Conditions{}
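The fake client in the test above registers the same provider-ID index the live manager cache would have, via `WithIndex(&corev1.Node{}, index.NodeProviderIDField, index.NodeByProviderID)`; without it, a `client.MatchingFields` lookup on that field would fail. As a rough sketch, an indexer function of that shape (a hypothetical stand-in for `index.NodeByProviderID`, matching controller-runtime's `client.IndexerFunc` signature) could look like:

// nodeByProviderID extracts spec.providerID as an index key for a Node.
// Hypothetical stand-in for index.NodeByProviderID.
func nodeByProviderID(o client.Object) []string {
	node, ok := o.(*corev1.Node)
	if !ok {
		return nil
	}
	if node.Spec.ProviderID == "" {
		return nil
	}
	return []string{node.Spec.ProviderID}
}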
