From 707a7570133f8c34efb914b02aba2bf1a03d74cc Mon Sep 17 00:00:00 2001 From: Marcelo Guerrero Date: Thu, 26 Oct 2023 10:07:24 +0200 Subject: [PATCH 1/4] Remove LastState parameter of GenericPlugin The generic plugin was applying config changes only if the desired spec of interfaces was different from the last applied spec. This logic is different from the one in OnNodeStateChange where the real status of the interfaces is used to detect changes. By removing the LastState parameter (and related code), the generic plugin will also use the real status of interfaces to decide whether to apply changes or not. The SyncNodeState function has this logic. --- pkg/plugins/generic/generic_plugin.go | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/pkg/plugins/generic/generic_plugin.go b/pkg/plugins/generic/generic_plugin.go index a7a71ca15..274f0dc01 100644 --- a/pkg/plugins/generic/generic_plugin.go +++ b/pkg/plugins/generic/generic_plugin.go @@ -4,7 +4,6 @@ import ( "bytes" "errors" "os/exec" - "reflect" "strconv" "strings" "syscall" @@ -52,7 +51,6 @@ type GenericPlugin struct { PluginName string SpecVersion string DesireState *sriovnetworkv1.SriovNetworkNodeState - LastState *sriovnetworkv1.SriovNetworkNodeState DriverStateMap DriverStateMapType DesiredKernelArgs map[string]bool helpers helper.HostHelpersInterface @@ -159,14 +157,6 @@ func (p *GenericPlugin) syncDriverState() error { func (p *GenericPlugin) Apply() error { log.Log.Info("generic plugin Apply()", "desiredState", p.DesireState.Spec) - if p.LastState != nil { - log.Log.Info("generic plugin Apply()", "lastState", p.LastState.Spec) - if reflect.DeepEqual(p.LastState.Spec.Interfaces, p.DesireState.Spec.Interfaces) { - log.Log.Info("generic plugin Apply(): desired and latest state are the same, nothing to apply") - return nil - } - } - if err := p.syncDriverState(); err != nil { return err } @@ -188,8 +178,7 @@ func (p *GenericPlugin) Apply() error { } return err } - p.LastState = &sriovnetworkv1.SriovNetworkNodeState{} - *p.LastState = *p.DesireState + return nil } From 82af069fb5d9609eae66fdbd07e2602412f1adbc Mon Sep 17 00:00:00 2001 From: Marcelo Guerrero Date: Mon, 30 Oct 2023 16:58:16 +0100 Subject: [PATCH 2/4] Verify status changes on managed interfaces Users could modify the settings of VFs which have been configured by the sriov operator. This PR starts the reconciliation loop when these changes are detected in the generic plugin. Signed-off-by: Marcelo Guerrero --- pkg/daemon/daemon.go | 164 ++++++++++++--------- pkg/plugins/fake/fake_plugin.go | 4 + pkg/plugins/generic/generic_plugin.go | 25 ++++ pkg/plugins/generic/generic_plugin_test.go | 100 +++++++++++++ pkg/plugins/intel/intel_plugin.go | 9 +- pkg/plugins/k8s/k8s_plugin.go | 6 + pkg/plugins/mellanox/mellanox_plugin.go | 6 + pkg/plugins/mock/mock_plugin.go | 22 ++- pkg/plugins/plugin.go | 10 +- pkg/plugins/virtual/virtual_plugin.go | 5 + 10 files changed, 272 insertions(+), 79 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 07eefb4f8..65964d7bc 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -313,77 +313,60 @@ func (dn *Daemon) nodeStateSyncHandler() error { latest := dn.desiredNodeState.GetGeneration() log.Log.V(0).Info("nodeStateSyncHandler(): new generation", "generation", latest) - if dn.currentNodeState.GetGeneration() == latest && !dn.isDrainCompleted() { - if vars.UsingSystemdMode { - serviceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovServicePath) - if err != nil { - log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config service exist on host") - return err - } - postNetworkServiceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovPostNetworkServicePath) - if err != nil { - log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config-post-network service exist on host") - return err - } - - // if the service doesn't exist we should continue to let the k8s plugin to create the service files - // this is only for k8s base environments, for openshift the sriov-operator creates a machine config to will apply - // the system service and reboot the node the config-daemon doesn't need to do anything. - if !(serviceEnabled && postNetworkServiceEnabled) { - sriovResult = &systemd.SriovResult{SyncStatus: consts.SyncStatusFailed, - LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+ - "sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)} - } else { - sriovResult, err = systemd.ReadSriovResult() - if err != nil { - log.Log.Error(err, "nodeStateSyncHandler(): failed to load sriov result file from host") - return err - } - } - if sriovResult.LastSyncError != "" || sriovResult.SyncStatus == consts.SyncStatusFailed { - log.Log.Info("nodeStateSyncHandler(): sync failed systemd service error", "last-sync-error", sriovResult.LastSyncError) - - // add the error but don't requeue - dn.refreshCh <- Message{ - syncStatus: consts.SyncStatusFailed, - lastSyncError: sriovResult.LastSyncError, - } - <-dn.syncCh - return nil - } - } - log.Log.V(0).Info("nodeStateSyncHandler(): Interface not changed") - if dn.desiredNodeState.Status.LastSyncError != "" || - dn.desiredNodeState.Status.SyncStatus != consts.SyncStatusSucceeded { - dn.refreshCh <- Message{ - syncStatus: consts.SyncStatusSucceeded, - lastSyncError: "", - } - // wait for writer to refresh the status - <-dn.syncCh + // load plugins if it has not loaded + if len(dn.loadedPlugins) == 0 { + dn.loadedPlugins, err = loadPlugins(dn.desiredNodeState, dn.HostHelpers, dn.disabledPlugins) + if err != nil { + log.Log.Error(err, "nodeStateSyncHandler(): failed to enable vendor plugins") + return err } - - return nil } - if dn.desiredNodeState.GetGeneration() == 1 && len(dn.desiredNodeState.Spec.Interfaces) == 0 { - err = dn.HostHelpers.ClearPCIAddressFolder() + if vars.UsingSystemdMode && dn.currentNodeState.GetGeneration() == latest && !dn.isDrainCompleted() { + serviceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovServicePath) if err != nil { - log.Log.Error(err, "failed to clear the PCI address configuration") + log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config service exist on host") + return err + } + postNetworkServiceEnabled, err := dn.HostHelpers.IsServiceEnabled(systemd.SriovPostNetworkServicePath) + if err != nil { + log.Log.Error(err, "nodeStateSyncHandler(): failed to check if sriov-config-post-network service exist on host") return err } - log.Log.V(0).Info( - "nodeStateSyncHandler(): interface policy spec not yet set by controller for sriovNetworkNodeState", - "name", dn.desiredNodeState.Name) - if dn.desiredNodeState.Status.SyncStatus != "Succeeded" { + // if the service doesn't exist we should continue to let the k8s plugin to create the service files + // this is only for k8s base environments, for openshift the sriov-operator creates a machine config to will apply + // the system service and reboot the node the config-daemon doesn't need to do anything. + if !(serviceEnabled && postNetworkServiceEnabled) { + sriovResult = &systemd.SriovResult{SyncStatus: consts.SyncStatusFailed, + LastSyncError: fmt.Sprintf("some sriov systemd services are not available on node: "+ + "sriov-config available:%t, sriov-config-post-network available:%t", serviceEnabled, postNetworkServiceEnabled)} + } else { + sriovResult, err = systemd.ReadSriovResult() + if err != nil { + log.Log.Error(err, "nodeStateSyncHandler(): failed to load sriov result file from host") + return err + } + } + if sriovResult.LastSyncError != "" || sriovResult.SyncStatus == consts.SyncStatusFailed { + log.Log.Info("nodeStateSyncHandler(): sync failed systemd service error", "last-sync-error", sriovResult.LastSyncError) + + // add the error but don't requeue dn.refreshCh <- Message{ - syncStatus: "Succeeded", - lastSyncError: "", + syncStatus: consts.SyncStatusFailed, + lastSyncError: sriovResult.LastSyncError, } - // wait for writer to refresh status <-dn.syncCh + return nil } + } + + skip, err := dn.shouldSkipReconciliation(dn.desiredNodeState) + if err != nil { + return err + } + // Reconcile only when there are changes in the spec or status of managed VFs. + if skip { return nil } @@ -404,15 +387,6 @@ func (dn *Daemon) nodeStateSyncHandler() error { } dn.desiredNodeState.Status = updatedState.Status - // load plugins if it has not loaded - if len(dn.loadedPlugins) == 0 { - dn.loadedPlugins, err = loadPlugins(dn.desiredNodeState, dn.HostHelpers, dn.disabledPlugins) - if err != nil { - log.Log.Error(err, "nodeStateSyncHandler(): failed to enable vendor plugins") - return err - } - } - reqReboot := false reqDrain := false @@ -564,6 +538,58 @@ func (dn *Daemon) nodeStateSyncHandler() error { return nil } +func (dn *Daemon) shouldSkipReconciliation(latestState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) { + var err error + + // Skip when SriovNetworkNodeState object has just been created. + if latestState.GetGeneration() == 1 && len(latestState.Spec.Interfaces) == 0 { + err = dn.HostHelpers.ClearPCIAddressFolder() + if err != nil { + log.Log.Error(err, "failed to clear the PCI address configuration") + return false, err + } + + log.Log.V(0).Info( + "shouldSkipReconciliation(): interface policy spec not yet set by controller for sriovNetworkNodeState", + "name", latestState.Name) + if latestState.Status.SyncStatus != consts.SyncStatusSucceeded { + dn.refreshCh <- Message{ + syncStatus: consts.SyncStatusSucceeded, + lastSyncError: "", + } + // wait for writer to refresh status + <-dn.syncCh + } + return true, nil + } + + // Verify changes in the spec of the SriovNetworkNodeState CR. + if dn.currentNodeState.GetGeneration() == latestState.GetGeneration() && !dn.isDrainCompleted() { + genericPlugin, ok := dn.loadedPlugins[GenericPluginName] + if ok { + // Verify changes in the status of the SriovNetworkNodeState CR. + if genericPlugin.CheckStatusChanges(latestState) { + return false, nil + } + } + + log.Log.V(0).Info("shouldSkipReconciliation(): Interface not changed") + if latestState.Status.LastSyncError != "" || + latestState.Status.SyncStatus != consts.SyncStatusSucceeded { + dn.refreshCh <- Message{ + syncStatus: consts.SyncStatusSucceeded, + lastSyncError: "", + } + // wait for writer to refresh the status + <-dn.syncCh + } + + return true, nil + } + + return false, nil +} + // handleDrain: adds the right annotation to the node and nodeState object // returns true if we need to finish the reconcile loop and wait for a new object func (dn *Daemon) handleDrain(reqReboot bool) (bool, error) { diff --git a/pkg/plugins/fake/fake_plugin.go b/pkg/plugins/fake/fake_plugin.go index f7fe6e56f..11c30d6ea 100644 --- a/pkg/plugins/fake/fake_plugin.go +++ b/pkg/plugins/fake/fake_plugin.go @@ -21,6 +21,10 @@ func (f *FakePlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState return false, false, nil } +func (f *FakePlugin) CheckStatusChanges(new *sriovnetworkv1.SriovNetworkNodeState) bool { + return false +} + func (f *FakePlugin) Apply() error { return nil } diff --git a/pkg/plugins/generic/generic_plugin.go b/pkg/plugins/generic/generic_plugin.go index 274f0dc01..20ef4a49c 100644 --- a/pkg/plugins/generic/generic_plugin.go +++ b/pkg/plugins/generic/generic_plugin.go @@ -139,6 +139,31 @@ func (p *GenericPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt return } +// CheckStatusChanges verify whether SriovNetworkNodeState CR status present changes on configured VFs. +func (p *GenericPlugin) CheckStatusChanges(new *sriovnetworkv1.SriovNetworkNodeState) bool { + log.Log.Info("generic-plugin CheckStatusChanges()") + + changed := false + for _, iface := range new.Spec.Interfaces { + found := false + for _, ifaceStatus := range new.Status.Interfaces { + if iface.PciAddress == ifaceStatus.PciAddress && !iface.ExternallyManaged { + found = true + if sriovnetworkv1.NeedToUpdateSriov(&iface, &ifaceStatus) { + changed = true + log.Log.Info("CheckStatusChanges(): status changed for interface", "address", iface.PciAddress) + } + break + } + } + + if !found { + log.Log.Info("CheckStatusChanges(): no status found for interface", "address", iface.PciAddress) + } + } + return changed +} + func (p *GenericPlugin) syncDriverState() error { for _, driverState := range p.DriverStateMap { if !driverState.DriverLoaded && driverState.NeedDriverFunc(p.DesireState, driverState) { diff --git a/pkg/plugins/generic/generic_plugin_test.go b/pkg/plugins/generic/generic_plugin_test.go index 9f9c1be83..6b75535bb 100644 --- a/pkg/plugins/generic/generic_plugin_test.go +++ b/pkg/plugins/generic/generic_plugin_test.go @@ -137,6 +137,106 @@ var _ = Describe("Generic plugin", func() { Expect(needDrain).To(BeTrue()) }) + It("should not detect changes on status", func() { + networkNodeState := &sriovnetworkv1.SriovNetworkNodeState{ + Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{ + Interfaces: sriovnetworkv1.Interfaces{{ + PciAddress: "0000:00:00.0", + NumVfs: 1, + VfGroups: []sriovnetworkv1.VfGroup{{ + DeviceType: "netdevice", + PolicyName: "policy-1", + ResourceName: "resource-1", + VfRange: "0-0", + }}}}, + }, + Status: sriovnetworkv1.SriovNetworkNodeStateStatus{ + Interfaces: sriovnetworkv1.InterfaceExts{{ + PciAddress: "0000:00:00.0", + NumVfs: 1, + TotalVfs: 1, + DeviceID: "1015", + Vendor: "15b3", + Name: "sriovif1", + Mtu: 1500, + Mac: "0c:42:a1:55:ee:46", + Driver: "mlx5_core", + EswitchMode: "legacy", + LinkSpeed: "25000 Mb/s", + LinkType: "ETH", + VFs: []sriovnetworkv1.VirtualFunction{{ + PciAddress: "0000:00:00.1", + DeviceID: "1016", + Vendor: "15b3", + VfID: 0, + Name: "sriovif1v0", + Mtu: 1500, + Mac: "8e:d6:2c:62:87:1b", + Driver: "mlx5_core", + }}, + }}, + }, + } + + changed := genericPlugin.CheckStatusChanges(networkNodeState) + Expect(err).ToNot(HaveOccurred()) + Expect(changed).To(BeFalse()) + }) + + It("should detect changes on status", func() { + networkNodeState := &sriovnetworkv1.SriovNetworkNodeState{ + Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{ + Interfaces: sriovnetworkv1.Interfaces{{ + PciAddress: "0000:00:00.0", + NumVfs: 2, + Mtu: 1500, + VfGroups: []sriovnetworkv1.VfGroup{{ + DeviceType: "netdevice", + PolicyName: "policy-1", + ResourceName: "resource-1", + VfRange: "0-1", + Mtu: 1500, + }}}}, + }, + Status: sriovnetworkv1.SriovNetworkNodeStateStatus{ + Interfaces: sriovnetworkv1.InterfaceExts{{ + PciAddress: "0000:00:00.0", + NumVfs: 2, + TotalVfs: 2, + DeviceID: "1015", + Vendor: "15b3", + Name: "sriovif1", + Mtu: 1500, + Mac: "0c:42:a1:55:ee:46", + Driver: "mlx5_core", + EswitchMode: "legacy", + LinkSpeed: "25000 Mb/s", + LinkType: "ETH", + VFs: []sriovnetworkv1.VirtualFunction{{ + PciAddress: "0000:00:00.1", + DeviceID: "1016", + Vendor: "15b3", + VfID: 0, + Driver: "mlx5_core", + Name: "sriovif1v0", + Mtu: 999, // Bad MTU value, changed by the user application + Mac: "8e:d6:2c:62:87:1b", + }, { + PciAddress: "0000:00:00.2", + DeviceID: "1016", + Vendor: "15b3", + VfID: 0, + Driver: "mlx5_core", + }}, + }}, + }, + } + + changed := genericPlugin.CheckStatusChanges(networkNodeState) + Expect(err).ToNot(HaveOccurred()) + Expect(changed).To(BeTrue()) + }) + It("should load vfio_pci driver", func() { networkNodeState := &sriovnetworkv1.SriovNetworkNodeState{ Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{ diff --git a/pkg/plugins/intel/intel_plugin.go b/pkg/plugins/intel/intel_plugin.go index 467116ddb..a9174bdba 100644 --- a/pkg/plugins/intel/intel_plugin.go +++ b/pkg/plugins/intel/intel_plugin.go @@ -35,11 +35,16 @@ func (p *IntelPlugin) Spec() string { } // OnNodeStateChange Invoked when SriovNetworkNodeState CR is created or updated, return if need dain and/or reboot node -func (p *IntelPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState) (needDrain bool, needReboot bool, err error) { - log.Log.Info("intel plugin OnNodeStateChange()") +func (p *IntelPlugin) OnNodeStateChange(*sriovnetworkv1.SriovNetworkNodeState) (bool, bool, error) { + log.Log.Info("intel-plugin OnNodeStateChange()") return false, false, nil } +// OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs. +func (p *IntelPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool { + return false +} + // Apply config change func (p *IntelPlugin) Apply() error { log.Log.Info("intel plugin Apply()") diff --git a/pkg/plugins/k8s/k8s_plugin.go b/pkg/plugins/k8s/k8s_plugin.go index 118073ee5..aa28c0aa6 100644 --- a/pkg/plugins/k8s/k8s_plugin.go +++ b/pkg/plugins/k8s/k8s_plugin.go @@ -154,6 +154,12 @@ func (p *K8sPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState) return } +// TODO: implement - https://github.com/k8snetworkplumbingwg/sriov-network-operator/issues/630 +// OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs. +func (p *K8sPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool { + return false +} + // Apply config change func (p *K8sPlugin) Apply() error { log.Log.Info("k8s plugin Apply()") diff --git a/pkg/plugins/mellanox/mellanox_plugin.go b/pkg/plugins/mellanox/mellanox_plugin.go index 6b1b943de..1aa1f6434 100644 --- a/pkg/plugins/mellanox/mellanox_plugin.go +++ b/pkg/plugins/mellanox/mellanox_plugin.go @@ -171,6 +171,12 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS return } +// TODO: implement - https://github.com/k8snetworkplumbingwg/sriov-network-operator/issues/631 +// OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs. +func (p *MellanoxPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool { + return false +} + // Apply config change func (p *MellanoxPlugin) Apply() error { if p.helpers.IsKernelLockdownMode() { diff --git a/pkg/plugins/mock/mock_plugin.go b/pkg/plugins/mock/mock_plugin.go index b821139c5..044d30f0e 100644 --- a/pkg/plugins/mock/mock_plugin.go +++ b/pkg/plugins/mock/mock_plugin.go @@ -48,6 +48,20 @@ func (mr *MockVendorPluginMockRecorder) Apply() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Apply", reflect.TypeOf((*MockVendorPlugin)(nil).Apply)) } +// CheckStatusChanges mocks base method. +func (m *MockVendorPlugin) CheckStatusChanges(arg0 *v1.SriovNetworkNodeState) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CheckStatusChanges", arg0) + ret0, _ := ret[0].(bool) + return ret0 +} + +// CheckStatusChanges indicates an expected call of CheckStatusChanges. +func (mr *MockVendorPluginMockRecorder) CheckStatusChanges(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CheckStatusChanges", reflect.TypeOf((*MockVendorPlugin)(nil).CheckStatusChanges), arg0) +} + // Name mocks base method. func (m *MockVendorPlugin) Name() string { m.ctrl.T.Helper() @@ -63,9 +77,9 @@ func (mr *MockVendorPluginMockRecorder) Name() *gomock.Call { } // OnNodeStateChange mocks base method. -func (m *MockVendorPlugin) OnNodeStateChange(new *v1.SriovNetworkNodeState) (bool, bool, error) { +func (m *MockVendorPlugin) OnNodeStateChange(arg0 *v1.SriovNetworkNodeState) (bool, bool, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "OnNodeStateChange", new) + ret := m.ctrl.Call(m, "OnNodeStateChange", arg0) ret0, _ := ret[0].(bool) ret1, _ := ret[1].(bool) ret2, _ := ret[2].(error) @@ -73,9 +87,9 @@ func (m *MockVendorPlugin) OnNodeStateChange(new *v1.SriovNetworkNodeState) (boo } // OnNodeStateChange indicates an expected call of OnNodeStateChange. -func (mr *MockVendorPluginMockRecorder) OnNodeStateChange(new interface{}) *gomock.Call { +func (mr *MockVendorPluginMockRecorder) OnNodeStateChange(arg0 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OnNodeStateChange", reflect.TypeOf((*MockVendorPlugin)(nil).OnNodeStateChange), new) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OnNodeStateChange", reflect.TypeOf((*MockVendorPlugin)(nil).OnNodeStateChange), arg0) } // Spec mocks base method. diff --git a/pkg/plugins/plugin.go b/pkg/plugins/plugin.go index 276e37cf2..3136dfd46 100644 --- a/pkg/plugins/plugin.go +++ b/pkg/plugins/plugin.go @@ -6,12 +6,14 @@ import ( //go:generate ../../bin/mockgen -destination mock/mock_plugin.go -source plugin.go type VendorPlugin interface { - // Return the name of plugin + // Name returns the name of plugin Name() string - // Return the SpecVersion followed by plugin + // Spec returns the SpecVersion followed by plugin Spec() string - // Invoked when SriovNetworkNodeState CR is created or updated, return if need dain and/or reboot node - OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState) (bool, bool, error) + // OnNodeStateChange is invoked when SriovNetworkNodeState CR is created or updated, return if need dain and/or reboot node + OnNodeStateChange(*sriovnetworkv1.SriovNetworkNodeState) (bool, bool, error) // Apply config change Apply() error + // CheckStatusChanges checks status changes on the SriovNetworkNodeState CR for configured VFs. + CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool } diff --git a/pkg/plugins/virtual/virtual_plugin.go b/pkg/plugins/virtual/virtual_plugin.go index a942aaa86..cd43b120b 100644 --- a/pkg/plugins/virtual/virtual_plugin.go +++ b/pkg/plugins/virtual/virtual_plugin.go @@ -67,6 +67,11 @@ func (p *VirtualPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt return } +// OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs. +func (p *VirtualPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool { + return false +} + // Apply config change func (p *VirtualPlugin) Apply() error { log.Log.Info("virtual plugin Apply()", "desired-state", p.DesireState.Spec) From 21d919c487828759a3236e36534703c7e476d2ef Mon Sep 17 00:00:00 2001 From: Marcelo Guerrero Date: Mon, 22 Jan 2024 18:55:58 +0100 Subject: [PATCH 3/4] Abstract logic to check and load missing KArgs Logic to check missing kernel arguments is placed in a method to be used by both OnNodeStateChange and CheckStatusChanges. --- pkg/daemon/daemon.go | 9 +- pkg/plugins/fake/fake_plugin.go | 4 +- pkg/plugins/generic/generic_plugin.go | 104 +++++++++++++-------- pkg/plugins/generic/generic_plugin_test.go | 65 ++++++++++++- pkg/plugins/intel/intel_plugin.go | 4 +- pkg/plugins/k8s/k8s_plugin.go | 4 +- pkg/plugins/mellanox/mellanox_plugin.go | 4 +- pkg/plugins/mock/mock_plugin.go | 5 +- pkg/plugins/plugin.go | 2 +- pkg/plugins/virtual/virtual_plugin.go | 4 +- 10 files changed, 147 insertions(+), 58 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 65964d7bc..950287a16 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -565,10 +565,13 @@ func (dn *Daemon) shouldSkipReconciliation(latestState *sriovnetworkv1.SriovNetw // Verify changes in the spec of the SriovNetworkNodeState CR. if dn.currentNodeState.GetGeneration() == latestState.GetGeneration() && !dn.isDrainCompleted() { - genericPlugin, ok := dn.loadedPlugins[GenericPluginName] - if ok { + for _, p := range dn.loadedPlugins { // Verify changes in the status of the SriovNetworkNodeState CR. - if genericPlugin.CheckStatusChanges(latestState) { + changed, err := p.CheckStatusChanges(latestState) + if err != nil { + return false, err + } + if changed { return false, nil } } diff --git a/pkg/plugins/fake/fake_plugin.go b/pkg/plugins/fake/fake_plugin.go index 11c30d6ea..0aa218f28 100644 --- a/pkg/plugins/fake/fake_plugin.go +++ b/pkg/plugins/fake/fake_plugin.go @@ -21,8 +21,8 @@ func (f *FakePlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState return false, false, nil } -func (f *FakePlugin) CheckStatusChanges(new *sriovnetworkv1.SriovNetworkNodeState) bool { - return false +func (f *FakePlugin) CheckStatusChanges(new *sriovnetworkv1.SriovNetworkNodeState) (bool, error) { + return false, nil } func (f *FakePlugin) Apply() error { diff --git a/pkg/plugins/generic/generic_plugin.go b/pkg/plugins/generic/generic_plugin.go index 20ef4a49c..5e0827c69 100644 --- a/pkg/plugins/generic/generic_plugin.go +++ b/pkg/plugins/generic/generic_plugin.go @@ -140,28 +140,34 @@ func (p *GenericPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt } // CheckStatusChanges verify whether SriovNetworkNodeState CR status present changes on configured VFs. -func (p *GenericPlugin) CheckStatusChanges(new *sriovnetworkv1.SriovNetworkNodeState) bool { +func (p *GenericPlugin) CheckStatusChanges(current *sriovnetworkv1.SriovNetworkNodeState) (bool, error) { log.Log.Info("generic-plugin CheckStatusChanges()") - changed := false - for _, iface := range new.Spec.Interfaces { + for _, iface := range current.Spec.Interfaces { found := false - for _, ifaceStatus := range new.Status.Interfaces { + for _, ifaceStatus := range current.Status.Interfaces { + // TODO: remove the check for ExternallyManaged - https://github.com/k8snetworkplumbingwg/sriov-network-operator/issues/632 if iface.PciAddress == ifaceStatus.PciAddress && !iface.ExternallyManaged { found = true if sriovnetworkv1.NeedToUpdateSriov(&iface, &ifaceStatus) { - changed = true log.Log.Info("CheckStatusChanges(): status changed for interface", "address", iface.PciAddress) + return true, nil } break } } - if !found { log.Log.Info("CheckStatusChanges(): no status found for interface", "address", iface.PciAddress) } } - return changed + + missingKernelArgs, err := p.getMissingKernelArgs() + if err != nil { + log.Log.Error(err, "generic-plugin CheckStatusChanges(): failed to verify missing kernel arguments") + return false, err + } + + return len(missingKernelArgs) != 0, nil } func (p *GenericPlugin) syncDriverState() error { @@ -266,37 +272,48 @@ func (p *GenericPlugin) addToDesiredKernelArgs(karg string) { } } -// syncDesiredKernelArgs Should be called to set all the kernel arguments. Returns bool if node update is needed. -func (p *GenericPlugin) syncDesiredKernelArgs() (bool, error) { - needReboot := false +// getMissingKernelArgs gets Kernel arguments that have not been set. +func (p *GenericPlugin) getMissingKernelArgs() ([]string, error) { + missingArgs := make([]string, 0, len(p.DesiredKernelArgs)) if len(p.DesiredKernelArgs) == 0 { - return false, nil + return nil, nil } + kargs, err := p.helpers.GetCurrentKernelArgs() if err != nil { - return false, err + return nil, err } - for desiredKarg, attempted := range p.DesiredKernelArgs { - set := p.helpers.IsKernelArgsSet(kargs, desiredKarg) - if !set { - if attempted { - log.Log.V(2).Info("generic plugin syncDesiredKernelArgs(): previously attempted to set kernel arg", - "karg", desiredKarg) - } - // There is a case when we try to set the kernel argument here, the daemon could decide to not reboot because - // the daemon encountered a potentially one-time error. However we always want to make sure that the kernel - // argument is set once the daemon goes through node state sync again. - update, err := setKernelArg(desiredKarg) - if err != nil { - log.Log.Error(err, "generic plugin syncDesiredKernelArgs(): fail to set kernel arg", "karg", desiredKarg) - return false, err - } - if update { - needReboot = true - log.Log.V(2).Info("generic plugin syncDesiredKernelArgs(): need reboot for setting kernel arg", "karg", desiredKarg) - } - p.DesiredKernelArgs[desiredKarg] = true + + for desiredKarg := range p.DesiredKernelArgs { + if !p.helpers.IsKernelArgsSet(kargs, desiredKarg) { + missingArgs = append(missingArgs, desiredKarg) + } + } + return missingArgs, nil +} + +// syncDesiredKernelArgs should be called to set all the kernel arguments. Returns bool if node update is needed. +func (p *GenericPlugin) syncDesiredKernelArgs(kargs []string) (bool, error) { + needReboot := false + + for _, karg := range kargs { + if p.DesiredKernelArgs[karg] { + log.Log.V(2).Info("generic-plugin syncDesiredKernelArgs(): previously attempted to set kernel arg", + "karg", karg) + } + // There is a case when we try to set the kernel argument here, the daemon could decide to not reboot because + // the daemon encountered a potentially one-time error. However we always want to make sure that the kernel + // argument is set once the daemon goes through node state sync again. + update, err := setKernelArg(karg) + if err != nil { + log.Log.Error(err, "generic-plugin syncDesiredKernelArgs(): fail to set kernel arg", "karg", karg) + return false, err + } + if update { + needReboot = true + log.Log.V(2).Info("generic-plugin syncDesiredKernelArgs(): need reboot for setting kernel arg", "karg", karg) } + p.DesiredKernelArgs[karg] = true } return needReboot, nil } @@ -365,19 +382,28 @@ func (p *GenericPlugin) addVfioDesiredKernelArg(state *sriovnetworkv1.SriovNetwo } } -func (p *GenericPlugin) needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState) (needReboot bool, err error) { - needReboot = false +func (p *GenericPlugin) needRebootNode(state *sriovnetworkv1.SriovNetworkNodeState) (bool, error) { + needReboot := false + p.addVfioDesiredKernelArg(state) - updateNode, err := p.syncDesiredKernelArgs() + missingKernelArgs, err := p.getMissingKernelArgs() if err != nil { - log.Log.Error(err, "generic plugin needRebootNode(): failed to set the desired kernel arguments") + log.Log.Error(err, "generic-plugin needRebootNode(): failed to verify missing kernel arguments") return false, err } - if updateNode { - log.Log.V(2).Info("generic plugin needRebootNode(): need reboot for updating kernel arguments") - needReboot = true + + if len(missingKernelArgs) != 0 { + needReboot, err = p.syncDesiredKernelArgs(missingKernelArgs) + if err != nil { + log.Log.Error(err, "generic-plugin needRebootNode(): failed to set the desired kernel arguments") + return false, err + } + if needReboot { + log.Log.V(2).Info("generic-plugin needRebootNode(): need reboot for updating kernel arguments") + } } + return needReboot, nil } diff --git a/pkg/plugins/generic/generic_plugin_test.go b/pkg/plugins/generic/generic_plugin_test.go index 6b75535bb..d2e9c1bbe 100644 --- a/pkg/plugins/generic/generic_plugin_test.go +++ b/pkg/plugins/generic/generic_plugin_test.go @@ -8,6 +8,7 @@ import ( . "github.com/onsi/gomega" sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" + "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" mock_helper "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper/mock" plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins" ) @@ -178,12 +179,12 @@ var _ = Describe("Generic plugin", func() { }, } - changed := genericPlugin.CheckStatusChanges(networkNodeState) + changed, err := genericPlugin.CheckStatusChanges(networkNodeState) Expect(err).ToNot(HaveOccurred()) Expect(changed).To(BeFalse()) }) - It("should detect changes on status", func() { + It("should detect changes on status due to spec mismatch", func() { networkNodeState := &sriovnetworkv1.SriovNetworkNodeState{ Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{ Interfaces: sriovnetworkv1.Interfaces{{ @@ -232,7 +233,65 @@ var _ = Describe("Generic plugin", func() { }, } - changed := genericPlugin.CheckStatusChanges(networkNodeState) + changed, err := genericPlugin.CheckStatusChanges(networkNodeState) + Expect(err).ToNot(HaveOccurred()) + Expect(changed).To(BeTrue()) + }) + + It("should detect changes on status due to missing kernel args", func() { + networkNodeState := &sriovnetworkv1.SriovNetworkNodeState{ + Spec: sriovnetworkv1.SriovNetworkNodeStateSpec{ + Interfaces: sriovnetworkv1.Interfaces{{ + PciAddress: "0000:00:00.0", + NumVfs: 2, + Mtu: 1500, + VfGroups: []sriovnetworkv1.VfGroup{{ + DeviceType: "vfio-pci", + PolicyName: "policy-1", + ResourceName: "resource-1", + VfRange: "0-1", + Mtu: 1500, + }}}}, + }, + Status: sriovnetworkv1.SriovNetworkNodeStateStatus{ + Interfaces: sriovnetworkv1.InterfaceExts{{ + PciAddress: "0000:00:00.0", + NumVfs: 2, + TotalVfs: 2, + DeviceID: "159b", + Vendor: "8086", + Name: "sriovif1", + Mtu: 1500, + Mac: "0c:42:a1:55:ee:46", + Driver: "ice", + EswitchMode: "legacy", + LinkSpeed: "25000 Mb/s", + LinkType: "ETH", + VFs: []sriovnetworkv1.VirtualFunction{{ + PciAddress: "0000:00:00.1", + DeviceID: "1889", + Vendor: "8086", + VfID: 0, + Driver: "vfio-pci", + }, { + PciAddress: "0000:00:00.2", + DeviceID: "1889", + Vendor: "8086", + VfID: 1, + Driver: "vfio-pci", + }}, + }}, + }, + } + + // Load required kernel args. + genericPlugin.(*GenericPlugin).addVfioDesiredKernelArg(networkNodeState) + + hostHelper.EXPECT().GetCurrentKernelArgs().Return("", nil) + hostHelper.EXPECT().IsKernelArgsSet("", consts.KernelArgIntelIommu).Return(false) + hostHelper.EXPECT().IsKernelArgsSet("", consts.KernelArgIommuPt).Return(false) + + changed, err := genericPlugin.CheckStatusChanges(networkNodeState) Expect(err).ToNot(HaveOccurred()) Expect(changed).To(BeTrue()) }) diff --git a/pkg/plugins/intel/intel_plugin.go b/pkg/plugins/intel/intel_plugin.go index a9174bdba..e88a186c9 100644 --- a/pkg/plugins/intel/intel_plugin.go +++ b/pkg/plugins/intel/intel_plugin.go @@ -41,8 +41,8 @@ func (p *IntelPlugin) OnNodeStateChange(*sriovnetworkv1.SriovNetworkNodeState) ( } // OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs. -func (p *IntelPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool { - return false +func (p *IntelPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) (bool, error) { + return false, nil } // Apply config change diff --git a/pkg/plugins/k8s/k8s_plugin.go b/pkg/plugins/k8s/k8s_plugin.go index aa28c0aa6..1a0336049 100644 --- a/pkg/plugins/k8s/k8s_plugin.go +++ b/pkg/plugins/k8s/k8s_plugin.go @@ -156,8 +156,8 @@ func (p *K8sPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeState) // TODO: implement - https://github.com/k8snetworkplumbingwg/sriov-network-operator/issues/630 // OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs. -func (p *K8sPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool { - return false +func (p *K8sPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) (bool, error) { + return false, nil } // Apply config change diff --git a/pkg/plugins/mellanox/mellanox_plugin.go b/pkg/plugins/mellanox/mellanox_plugin.go index 1aa1f6434..87363464b 100644 --- a/pkg/plugins/mellanox/mellanox_plugin.go +++ b/pkg/plugins/mellanox/mellanox_plugin.go @@ -173,8 +173,8 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS // TODO: implement - https://github.com/k8snetworkplumbingwg/sriov-network-operator/issues/631 // OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs. -func (p *MellanoxPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool { - return false +func (p *MellanoxPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) (bool, error) { + return false, nil } // Apply config change diff --git a/pkg/plugins/mock/mock_plugin.go b/pkg/plugins/mock/mock_plugin.go index 044d30f0e..a6ae38202 100644 --- a/pkg/plugins/mock/mock_plugin.go +++ b/pkg/plugins/mock/mock_plugin.go @@ -49,11 +49,12 @@ func (mr *MockVendorPluginMockRecorder) Apply() *gomock.Call { } // CheckStatusChanges mocks base method. -func (m *MockVendorPlugin) CheckStatusChanges(arg0 *v1.SriovNetworkNodeState) bool { +func (m *MockVendorPlugin) CheckStatusChanges(arg0 *v1.SriovNetworkNodeState) (bool, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "CheckStatusChanges", arg0) ret0, _ := ret[0].(bool) - return ret0 + ret1, _ := ret[1].(error) + return ret0, ret1 } // CheckStatusChanges indicates an expected call of CheckStatusChanges. diff --git a/pkg/plugins/plugin.go b/pkg/plugins/plugin.go index 3136dfd46..830f7dd68 100644 --- a/pkg/plugins/plugin.go +++ b/pkg/plugins/plugin.go @@ -15,5 +15,5 @@ type VendorPlugin interface { // Apply config change Apply() error // CheckStatusChanges checks status changes on the SriovNetworkNodeState CR for configured VFs. - CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool + CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) (bool, error) } diff --git a/pkg/plugins/virtual/virtual_plugin.go b/pkg/plugins/virtual/virtual_plugin.go index cd43b120b..211cbee2c 100644 --- a/pkg/plugins/virtual/virtual_plugin.go +++ b/pkg/plugins/virtual/virtual_plugin.go @@ -68,8 +68,8 @@ func (p *VirtualPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeSt } // OnNodeStatusChange verify whether SriovNetworkNodeState CR status present changes on configured VFs. -func (p *VirtualPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) bool { - return false +func (p *VirtualPlugin) CheckStatusChanges(*sriovnetworkv1.SriovNetworkNodeState) (bool, error) { + return false, nil } // Apply config change From 0d1a11ae9c1053d7e65af102cdef4490faf867dc Mon Sep 17 00:00:00 2001 From: Marcelo Guerrero Date: Wed, 31 Jan 2024 13:43:27 +0100 Subject: [PATCH 4/4] Add e2e test for reconciliation when status changes --- test/conformance/tests/test_sriov_operator.go | 54 +++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/test/conformance/tests/test_sriov_operator.go b/test/conformance/tests/test_sriov_operator.go index 55711b78f..094a95cf0 100644 --- a/test/conformance/tests/test_sriov_operator.go +++ b/test/conformance/tests/test_sriov_operator.go @@ -307,8 +307,12 @@ var _ = Describe("[sriov] operator", func() { node = sriovInfos.Nodes[0] createVanillaNetworkPolicy(node, sriovInfos, numVfs, resourceName) WaitForSRIOVStable() - sriovDevice, err = sriovInfos.FindOneSriovDevice(node) + + // Update info + sriovInfos, err = cluster.DiscoverSriov(clients, operatorNamespace) Expect(err).ToNot(HaveOccurred()) + sriovDevice = findInterface(sriovInfos, node) + By("Using device " + sriovDevice.Name + " on node " + node) Eventually(func() int64 { @@ -1000,6 +1004,44 @@ var _ = Describe("[sriov] operator", func() { return runningPodB.Status.Phase }, 3*time.Minute, time.Second).Should(Equal(corev1.PodRunning)) }) + + It("should reconcile managed VF if status changes", func() { + originalMtu := sriovDevice.Mtu + newMtu := 1000 + + By("manually changing the MTU") + _, errOutput, err := runCommandOnConfigDaemon(node, "/bin/bash", "-c", fmt.Sprintf("echo %d > /sys/bus/pci/devices/%s/net/%s/mtu", newMtu, sriovDevice.PciAddress, sriovDevice.Name)) + Expect(err).ToNot(HaveOccurred()) + Expect(errOutput).To(Equal("")) + + By("waiting for the mtu to be updated in the status") + Eventually(func() sriovv1.InterfaceExts { + nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + return nodeState.Status.Interfaces + }, 3*time.Minute, 1*time.Second).Should(ContainElement(MatchFields( + IgnoreExtras, + Fields{ + "Name": Equal(sriovDevice.Name), + "Mtu": Equal(newMtu), + "PciAddress": Equal(sriovDevice.PciAddress), + "NumVfs": Equal(sriovDevice.NumVfs), + }))) + + By("waiting for the mtu to be restored") + Eventually(func() sriovv1.InterfaceExts { + nodeState, err := clients.SriovNetworkNodeStates(operatorNamespace).Get(context.Background(), node, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + return nodeState.Status.Interfaces + }, 3*time.Minute, 1*time.Second).Should(ContainElement(MatchFields( + IgnoreExtras, + Fields{ + "Name": Equal(sriovDevice.Name), + "Mtu": Equal(originalMtu), + "PciAddress": Equal(sriovDevice.PciAddress), + "NumVfs": Equal(sriovDevice.NumVfs), + }))) + }) }) Context("CNI Logging level", func() { @@ -2486,8 +2528,7 @@ func WaitForSRIOVStable() { }, waitingTime, 1*time.Second).Should(BeTrue()) } -func createVanillaNetworkPolicy(node string, sriovInfos *cluster.EnabledNodes, numVfs int, resourceName string) { - // For the context of tests is better to use a Mellanox card +func findInterface(sriovInfos *cluster.EnabledNodes, node string) *sriovv1.InterfaceExt { // For the context of tests is better to use a Mellanox card // as they support all the virtual function flags // if we don't find a Mellanox card we fall back to any sriov // capability interface and skip the rate limit test. @@ -2497,6 +2538,12 @@ func createVanillaNetworkPolicy(node string, sriovInfos *cluster.EnabledNodes, n Expect(err).ToNot(HaveOccurred()) } + return intf +} + +func createVanillaNetworkPolicy(node string, sriovInfos *cluster.EnabledNodes, numVfs int, resourceName string) { + intf := findInterface(sriovInfos, node) + config := &sriovv1.SriovNetworkNodePolicy{ ObjectMeta: metav1.ObjectMeta{ GenerateName: "test-policy", @@ -2509,6 +2556,7 @@ func createVanillaNetworkPolicy(node string, sriovInfos *cluster.EnabledNodes, n }, NumVfs: numVfs, ResourceName: resourceName, + Mtu: 1500, Priority: 99, NicSelector: sriovv1.SriovNetworkNicSelector{ PfNames: []string{intf.Name},