From 7134d2f10beca2636df6e1651f6b453736ce79a5 Mon Sep 17 00:00:00 2001 From: Ivan Kolodiazhnyi Date: Sat, 23 Mar 2024 18:34:28 +0200 Subject: [PATCH] Implement RDMA subsystem mode change Now it's possible to configure RDMA subsystem mode using SR-IOV Network Operator in systemd mode We can't configure RDMA subsystem in a daemon mode because it should be done on host before any network namespace is created. --- api/v1/sriovnetworknodestate_types.go | 6 + api/v1/sriovnetworkpoolconfig_types.go | 4 + api/v1/zz_generated.deepcopy.go | 17 +++ cmd/sriov-network-config-daemon/service.go | 16 +++ ...k.openshift.io_sriovnetworknodestates.yaml | 10 ++ ....openshift.io_sriovnetworkpoolconfigs.yaml | 6 + controllers/drain_controller.go | 99 +---------------- .../sriovnetworknodepolicy_controller.go | 8 ++ .../sriovnetworkpoolconfig_controller.go | 13 +++ ...k.openshift.io_sriovnetworknodestates.yaml | 10 ++ ....openshift.io_sriovnetworkpoolconfigs.yaml | 6 + .../templates/clusterrole.yaml | 3 + .../templates/operator.yaml | 2 +- pkg/consts/constants.go | 3 + pkg/daemon/daemon.go | 8 ++ pkg/daemon/writer.go | 7 ++ pkg/helper/mock/mock_helper.go | 29 +++++ pkg/host/internal/kernel/kernel.go | 26 +++++ pkg/host/mock/mock_host.go | 29 +++++ pkg/host/types/interfaces.go | 4 + pkg/utils/cluster.go | 105 +++++++++++++++++- 21 files changed, 310 insertions(+), 101 deletions(-) diff --git a/api/v1/sriovnetworknodestate_types.go b/api/v1/sriovnetworknodestate_types.go index 74dd1b58b..e86e8752c 100644 --- a/api/v1/sriovnetworknodestate_types.go +++ b/api/v1/sriovnetworknodestate_types.go @@ -23,9 +23,14 @@ import ( // EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. +type System struct { + RdmaMode string `json:"rdmaMode,omitempty"` +} + // SriovNetworkNodeStateSpec defines the desired state of SriovNetworkNodeState type SriovNetworkNodeStateSpec struct { Interfaces Interfaces `json:"interfaces,omitempty"` + System System `json:"system,omitempty"` } type Interfaces []Interface @@ -87,6 +92,7 @@ type VirtualFunction struct { // SriovNetworkNodeStateStatus defines the observed state of SriovNetworkNodeState type SriovNetworkNodeStateStatus struct { Interfaces InterfaceExts `json:"interfaces,omitempty"` + System System `json:"system,omitempty"` SyncStatus string `json:"syncStatus,omitempty"` LastSyncError string `json:"lastSyncError,omitempty"` } diff --git a/api/v1/sriovnetworkpoolconfig_types.go b/api/v1/sriovnetworkpoolconfig_types.go index c6e710a99..011ffc7d9 100644 --- a/api/v1/sriovnetworkpoolconfig_types.go +++ b/api/v1/sriovnetworkpoolconfig_types.go @@ -21,6 +21,10 @@ type SriovNetworkPoolConfigSpec struct { // Drain will respect Pod Disruption Budgets (PDBs) such as etcd quorum guards, // even if maxUnavailable is greater than one. MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"` + + // +kubebuilder:validation:Enum=shared;exclusive + // RDMA subsystem. Allowed value "shared", "exclusive". + RdmaMode string `json:"rdmaMode,omitempty"` } type OvsHardwareOffloadConfig struct { diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 5e13174ee..a956025e3 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -503,6 +503,7 @@ func (in *SriovNetworkNodeStateSpec) DeepCopyInto(out *SriovNetworkNodeStateSpec (*in)[i].DeepCopyInto(&(*out)[i]) } } + out.System = in.System } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SriovNetworkNodeStateSpec. @@ -525,6 +526,7 @@ func (in *SriovNetworkNodeStateStatus) DeepCopyInto(out *SriovNetworkNodeStateSt (*in)[i].DeepCopyInto(&(*out)[i]) } } + out.System = in.System } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SriovNetworkNodeStateStatus. @@ -785,6 +787,21 @@ func (in *SriovOperatorConfigStatus) DeepCopy() *SriovOperatorConfigStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *System) DeepCopyInto(out *System) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new System. +func (in *System) DeepCopy() *System { + if in == nil { + return nil + } + out := new(System) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *VfGroup) DeepCopyInto(out *VfGroup) { *out = *in diff --git a/cmd/sriov-network-config-daemon/service.go b/cmd/sriov-network-config-daemon/service.go index 88b60036e..c9aa14678 100644 --- a/cmd/sriov-network-config-daemon/service.go +++ b/cmd/sriov-network-config-daemon/service.go @@ -22,6 +22,7 @@ import ( "github.com/go-logr/logr" "github.com/spf13/cobra" + "github.com/vishvananda/netlink" "sigs.k8s.io/controller-runtime/pkg/log" sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" @@ -152,6 +153,21 @@ func phasePre(setupLog logr.Logger, conf *systemd.SriovConfig, hostHelpers helpe hostHelpers.TryEnableTun() hostHelpers.TryEnableVhostNet() + if conf.Spec.System.RdmaMode != "" { + rdmaSubsystem, err := netlink.RdmaSystemGetNetnsMode() + if err != nil { + setupLog.Error(err, "failed to get RDMA subsystem mode") + return fmt.Errorf("failed to get RDMA subsystem mode: %v", err) + } + if rdmaSubsystem != conf.Spec.System.RdmaMode { + err = netlink.RdmaSystemSetNetnsMode(conf.Spec.System.RdmaMode) + if err != nil { + setupLog.Error(err, "failed to set RDMA subsystem mode") + return fmt.Errorf("failed to set RDMA subsystem mode: %v", err) + } + } + } + return callPlugin(setupLog, PhasePre, conf, hostHelpers) } diff --git a/config/crd/bases/sriovnetwork.openshift.io_sriovnetworknodestates.yaml b/config/crd/bases/sriovnetwork.openshift.io_sriovnetworknodestates.yaml index a68c16f4e..250d1ba58 100644 --- a/config/crd/bases/sriovnetwork.openshift.io_sriovnetworknodestates.yaml +++ b/config/crd/bases/sriovnetwork.openshift.io_sriovnetworknodestates.yaml @@ -89,6 +89,11 @@ spec: - pciAddress type: object type: array + system: + properties: + rdmaMode: + type: string + type: object type: object status: description: SriovNetworkNodeStateStatus defines the observed state of @@ -163,6 +168,11 @@ spec: type: string syncStatus: type: string + system: + properties: + rdmaMode: + type: string + type: object type: object type: object served: true diff --git a/config/crd/bases/sriovnetwork.openshift.io_sriovnetworkpoolconfigs.yaml b/config/crd/bases/sriovnetwork.openshift.io_sriovnetworkpoolconfigs.yaml index b81999976..9b1b2ee9b 100644 --- a/config/crd/bases/sriovnetwork.openshift.io_sriovnetworkpoolconfigs.yaml +++ b/config/crd/bases/sriovnetwork.openshift.io_sriovnetworkpoolconfigs.yaml @@ -103,6 +103,12 @@ spec: offload' type: string type: object + rdmaMode: + description: RDMA subsystem. Allowed value "shared", "exclusive". + enum: + - shared + - exclusive + type: string type: object status: description: SriovNetworkPoolConfigStatus defines the observed state of diff --git a/controllers/drain_controller.go b/controllers/drain_controller.go index 2869e9a51..bc89ea8c2 100644 --- a/controllers/drain_controller.go +++ b/controllers/drain_controller.go @@ -24,11 +24,8 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/workqueue" ctrl "sigs.k8s.io/controller-runtime" @@ -48,13 +45,6 @@ import ( "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars" ) -var ( - oneNode = intstr.FromInt32(1) - defaultNpcl = &sriovnetworkv1.SriovNetworkPoolConfig{Spec: sriovnetworkv1.SriovNetworkPoolConfigSpec{ - MaxUnavailable: &oneNode, - NodeSelector: &metav1.LabelSelector{}}} -) - type DrainReconcile struct { client.Client Scheme *runtime.Scheme @@ -345,94 +335,7 @@ func (dr *DrainReconcile) tryDrainNode(ctx context.Context, node *corev1.Node) ( } func (dr *DrainReconcile) findNodePoolConfig(ctx context.Context, node *corev1.Node) (*sriovnetworkv1.SriovNetworkPoolConfig, []corev1.Node, error) { - logger := log.FromContext(ctx) - logger.Info("findNodePoolConfig():") - // get all the sriov network pool configs - npcl := &sriovnetworkv1.SriovNetworkPoolConfigList{} - err := dr.List(ctx, npcl) - if err != nil { - logger.Error(err, "failed to list sriovNetworkPoolConfig") - return nil, nil, err - } - - selectedNpcl := []*sriovnetworkv1.SriovNetworkPoolConfig{} - nodesInPools := map[string]interface{}{} - - for _, npc := range npcl.Items { - // we skip hw offload objects - if npc.Spec.OvsHardwareOffloadConfig.Name != "" { - continue - } - - if npc.Spec.NodeSelector == nil { - npc.Spec.NodeSelector = &metav1.LabelSelector{} - } - - selector, err := metav1.LabelSelectorAsSelector(npc.Spec.NodeSelector) - if err != nil { - logger.Error(err, "failed to create label selector from nodeSelector", "nodeSelector", npc.Spec.NodeSelector) - return nil, nil, err - } - - if selector.Matches(labels.Set(node.Labels)) { - selectedNpcl = append(selectedNpcl, npc.DeepCopy()) - } - - nodeList := &corev1.NodeList{} - err = dr.List(ctx, nodeList, &client.ListOptions{LabelSelector: selector}) - if err != nil { - logger.Error(err, "failed to list all the nodes matching the pool with label selector from nodeSelector", - "machineConfigPoolName", npc, - "nodeSelector", npc.Spec.NodeSelector) - return nil, nil, err - } - - for _, nodeName := range nodeList.Items { - nodesInPools[nodeName.Name] = nil - } - } - - if len(selectedNpcl) > 1 { - // don't allow the node to be part of multiple pools - err = fmt.Errorf("node is part of more then one pool") - logger.Error(err, "multiple pools founded for a specific node", "numberOfPools", len(selectedNpcl), "pools", selectedNpcl) - return nil, nil, err - } else if len(selectedNpcl) == 1 { - // found one pool for our node - logger.V(2).Info("found sriovNetworkPool", "pool", *selectedNpcl[0]) - selector, err := metav1.LabelSelectorAsSelector(selectedNpcl[0].Spec.NodeSelector) - if err != nil { - logger.Error(err, "failed to create label selector from nodeSelector", "nodeSelector", selectedNpcl[0].Spec.NodeSelector) - return nil, nil, err - } - - // list all the nodes that are also part of this pool and return them - nodeList := &corev1.NodeList{} - err = dr.List(ctx, nodeList, &client.ListOptions{LabelSelector: selector}) - if err != nil { - logger.Error(err, "failed to list nodes using with label selector", "labelSelector", selector) - return nil, nil, err - } - - return selectedNpcl[0], nodeList.Items, nil - } else { - // in this case we get all the nodes and remove the ones that already part of any pool - logger.V(1).Info("node doesn't belong to any pool, using default drain configuration with MaxUnavailable of one", "pool", *defaultNpcl) - nodeList := &corev1.NodeList{} - err = dr.List(ctx, nodeList) - if err != nil { - logger.Error(err, "failed to list all the nodes") - return nil, nil, err - } - - defaultNodeLists := []corev1.Node{} - for _, nodeObj := range nodeList.Items { - if _, exist := nodesInPools[nodeObj.Name]; !exist { - defaultNodeLists = append(defaultNodeLists, nodeObj) - } - } - return defaultNpcl, defaultNodeLists, nil - } + return utils.FindNodePoolConfig(ctx, node, dr.Client) } // SetupWithManager sets up the controller with the Manager. diff --git a/controllers/sriovnetworknodepolicy_controller.go b/controllers/sriovnetworknodepolicy_controller.go index c4c1fc42d..7325d746b 100644 --- a/controllers/sriovnetworknodepolicy_controller.go +++ b/controllers/sriovnetworknodepolicy_controller.go @@ -50,6 +50,7 @@ import ( constants "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/render" + "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars" ) @@ -258,6 +259,13 @@ func (r *SriovNetworkNodePolicyReconciler) syncAllSriovNetworkNodeStates(ctx con ns.Name = node.Name ns.Namespace = vars.Namespace j, _ := json.Marshal(ns) + netPoolConfig, _, err := utils.FindNodePoolConfig(context.Background(), &node, r.Client) + if err != nil { + log.Log.Error(err, "nodeStateSyncHandler(): failed to get SriovNetworkPoolConfig for the current node") + } + if netPoolConfig != nil { + ns.Spec.System.RdmaMode = netPoolConfig.Spec.RdmaMode + } logger.V(2).Info("SriovNetworkNodeState CR", "content", j) if err := r.syncSriovNetworkNodeState(ctx, dc, npl, ns, &node); err != nil { logger.Error(err, "Fail to sync", "SriovNetworkNodeState", ns.Name) diff --git a/controllers/sriovnetworkpoolconfig_controller.go b/controllers/sriovnetworkpoolconfig_controller.go index 43fd513c9..10d8085ab 100644 --- a/controllers/sriovnetworkpoolconfig_controller.go +++ b/controllers/sriovnetworkpoolconfig_controller.go @@ -73,6 +73,19 @@ func (r *SriovNetworkPoolConfigReconciler) Reconcile(ctx context.Context, req ct return reconcile.Result{}, err } + // RdmaMode could be set in systemd mode only + if instance.Spec.RdmaMode != "" { + operatorConfig := &sriovnetworkv1.SriovOperatorConfig{} + err := r.Get(ctx, types.NamespacedName{Namespace: vars.Namespace, Name: constants.DefaultConfigName}, operatorConfig) + if err != nil { + logger.Error(err, "failed to list SriovOperatorConfig") + return reconcile.Result{}, err + } + if operatorConfig.Spec.ConfigurationMode == sriovnetworkv1.DaemonConfigurationMode { + logger.Info("rdmaSpec is ignored in 'daemon' configuration mode") + } + } + // we don't need a finalizer for pools that doesn't use the ovs hardware offload feature if instance.Spec.OvsHardwareOffloadConfig.Name == "" { return ctrl.Result{}, nil diff --git a/deployment/sriov-network-operator/crds/sriovnetwork.openshift.io_sriovnetworknodestates.yaml b/deployment/sriov-network-operator/crds/sriovnetwork.openshift.io_sriovnetworknodestates.yaml index a68c16f4e..250d1ba58 100644 --- a/deployment/sriov-network-operator/crds/sriovnetwork.openshift.io_sriovnetworknodestates.yaml +++ b/deployment/sriov-network-operator/crds/sriovnetwork.openshift.io_sriovnetworknodestates.yaml @@ -89,6 +89,11 @@ spec: - pciAddress type: object type: array + system: + properties: + rdmaMode: + type: string + type: object type: object status: description: SriovNetworkNodeStateStatus defines the observed state of @@ -163,6 +168,11 @@ spec: type: string syncStatus: type: string + system: + properties: + rdmaMode: + type: string + type: object type: object type: object served: true diff --git a/deployment/sriov-network-operator/crds/sriovnetwork.openshift.io_sriovnetworkpoolconfigs.yaml b/deployment/sriov-network-operator/crds/sriovnetwork.openshift.io_sriovnetworkpoolconfigs.yaml index b81999976..9b1b2ee9b 100644 --- a/deployment/sriov-network-operator/crds/sriovnetwork.openshift.io_sriovnetworkpoolconfigs.yaml +++ b/deployment/sriov-network-operator/crds/sriovnetwork.openshift.io_sriovnetworkpoolconfigs.yaml @@ -103,6 +103,12 @@ spec: offload' type: string type: object + rdmaMode: + description: RDMA subsystem. Allowed value "shared", "exclusive". + enum: + - shared + - exclusive + type: string type: object status: description: SriovNetworkPoolConfigStatus defines the observed state of diff --git a/deployment/sriov-network-operator/templates/clusterrole.yaml b/deployment/sriov-network-operator/templates/clusterrole.yaml index 7cd8fd014..3d5afcf15 100644 --- a/deployment/sriov-network-operator/templates/clusterrole.yaml +++ b/deployment/sriov-network-operator/templates/clusterrole.yaml @@ -58,3 +58,6 @@ rules: - apiGroups: [ "config.openshift.io" ] resources: [ "infrastructures" ] verbs: [ "get", "list", "watch" ] + - apiGroups: [ "sriovnetwork.openshift.io" ] + resources: [ "sriovnetworkpoolconfigs" ] + verbs: [ "get", "list", "watch" ] diff --git a/deployment/sriov-network-operator/templates/operator.yaml b/deployment/sriov-network-operator/templates/operator.yaml index 9f1b6fa56..aa2801bc4 100644 --- a/deployment/sriov-network-operator/templates/operator.yaml +++ b/deployment/sriov-network-operator/templates/operator.yaml @@ -59,7 +59,7 @@ spec: - name: OVS_CNI_IMAGE value: {{ .Values.images.ovsCni }} - name: RDMA_CNI_IMAGE - value: { { .Values.images.rdmaCni } } + value: {{ .Values.images.rdmaCni }} - name: SRIOV_DEVICE_PLUGIN_IMAGE value: {{ .Values.images.sriovDevicePlugin }} - name: NETWORK_RESOURCES_INJECTOR_IMAGE diff --git a/pkg/consts/constants.go b/pkg/consts/constants.go index efbefe198..e05a6fcc0 100644 --- a/pkg/consts/constants.go +++ b/pkg/consts/constants.go @@ -47,6 +47,9 @@ const ( VdpaTypeVirtio = "virtio" VdpaTypeVhost = "vhost" + RdmaSubsystemModeShared = "shared" + RdmaSubsystemModeExclusive = "exclusive" + ClusterTypeOpenshift = "openshift" ClusterTypeKubernetes = "kubernetes" diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 950287a16..46efa83b7 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -9,6 +9,7 @@ import ( "time" "golang.org/x/time/rate" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -413,6 +414,13 @@ func (dn *Daemon) nodeStateSyncHandler() error { // When using systemd configuration we write the file if vars.UsingSystemdMode { log.Log.V(0).Info("nodeStateSyncHandler(): writing systemd config file to host") + // get node object + node := &corev1.Node{} + err := dn.client.Get(context.TODO(), client.ObjectKey{Name: vars.NodeName}, node) + if err != nil { + log.Log.Error(err, "nodeStateSyncHandler(): failed to get node object") + return err + } systemdConfModified, err := systemd.WriteConfFile(dn.desiredNodeState) if err != nil { log.Log.Error(err, "nodeStateSyncHandler(): failed to write configuration file for systemd mode") diff --git a/pkg/daemon/writer.go b/pkg/daemon/writer.go index c796546ef..59fdf91cf 100644 --- a/pkg/daemon/writer.go +++ b/pkg/daemon/writer.go @@ -117,6 +117,7 @@ func (w *NodeStateStatusWriter) Run(stop <-chan struct{}, refresh <-chan Message func (w *NodeStateStatusWriter) pollNicStatus() error { log.Log.V(2).Info("pollNicStatus()") var iface []sriovnetworkv1.InterfaceExt + var rdmaMode string var err error if vars.PlatformType == consts.VirtualOpenStack { @@ -127,7 +128,13 @@ func (w *NodeStateStatusWriter) pollNicStatus() error { if err != nil { return err } + rdmaMode, err = w.hostHelper.GetRDMASubsystem() + if err != nil { + return err + } + w.status.Interfaces = iface + w.status.System.RdmaMode = rdmaMode return nil } diff --git a/pkg/helper/mock/mock_helper.go b/pkg/helper/mock/mock_helper.go index 4d8db7bf6..b60db2e38 100644 --- a/pkg/helper/mock/mock_helper.go +++ b/pkg/helper/mock/mock_helper.go @@ -516,6 +516,21 @@ func (mr *MockHostHelpersInterfaceMockRecorder) GetPhysSwitchID(name interface{} return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetPhysSwitchID", reflect.TypeOf((*MockHostHelpersInterface)(nil).GetPhysSwitchID), name) } +// GetRDMASubsystem mocks base method. +func (m *MockHostHelpersInterface) GetRDMASubsystem() (string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetRDMASubsystem") + ret0, _ := ret[0].(string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetRDMASubsystem indicates an expected call of GetRDMASubsystem. +func (mr *MockHostHelpersInterfaceMockRecorder) GetRDMASubsystem() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetRDMASubsystem", reflect.TypeOf((*MockHostHelpersInterface)(nil).GetRDMASubsystem)) +} + // GetVfInfo mocks base method. func (m *MockHostHelpersInterface) GetVfInfo(pciAddr string, devices []*ghw.PCIDevice) v1.VirtualFunction { m.ctrl.T.Helper() @@ -1019,6 +1034,20 @@ func (mr *MockHostHelpersInterfaceMockRecorder) SetNicSriovMode(pciAddr, mode in return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetNicSriovMode", reflect.TypeOf((*MockHostHelpersInterface)(nil).SetNicSriovMode), pciAddr, mode) } +// SetRDMASubsystem mocks base method. +func (m *MockHostHelpersInterface) SetRDMASubsystem(mode string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SetRDMASubsystem", mode) + ret0, _ := ret[0].(error) + return ret0 +} + +// SetRDMASubsystem indicates an expected call of SetRDMASubsystem. +func (mr *MockHostHelpersInterfaceMockRecorder) SetRDMASubsystem(mode interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetRDMASubsystem", reflect.TypeOf((*MockHostHelpersInterface)(nil).SetRDMASubsystem), mode) +} + // SetSriovNumVfs mocks base method. func (m *MockHostHelpersInterface) SetSriovNumVfs(pciAddr string, numVfs int) error { m.ctrl.T.Helper() diff --git a/pkg/host/internal/kernel/kernel.go b/pkg/host/internal/kernel/kernel.go index 958f2590a..b5797931a 100644 --- a/pkg/host/internal/kernel/kernel.go +++ b/pkg/host/internal/kernel/kernel.go @@ -7,6 +7,7 @@ import ( "path/filepath" "strings" + "github.com/vishvananda/netlink" "sigs.k8s.io/controller-runtime/pkg/log" sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" @@ -522,6 +523,31 @@ func (k *kernel) InstallRDMA(packageManager string) error { return nil } +func (k *kernel) GetRDMASubsystem() (string, error) { + log.Log.Info("GetRDMASubsystem(): retrieving RDMA subsystem mode") + subsystem, err := netlink.RdmaSystemGetNetnsMode() + + if err != nil { + log.Log.Error(err, "GetRDMASubsystem(): failed to get RDMA subsystem mode") + return "", err + } + + return subsystem, nil +} + +func (k *kernel) SetRDMASubsystem(mode string) error { + log.Log.Info("SetRDMASubsystem(): Updating RDMA subsystem mode") + chrootDefinition := utils.GetChrootExtension() + + stdout, stderr, err := k.utilsHelper.RunCommand("/bin/sh", "-c", fmt.Sprintf("%s /usr/bin/rdma system set net %s", chrootDefinition, mode)) + if err != nil && len(stderr) != 0 { + log.Log.Error(err, "SetRDMASubsystem(): failed to update RDMA subsystem mode", "stdout", stdout, "stderr", stderr) + return err + } + + return nil +} + func (k *kernel) TriggerUdevEvent() error { log.Log.Info("TriggerUdevEvent(): installing RDMA") diff --git a/pkg/host/mock/mock_host.go b/pkg/host/mock/mock_host.go index 4ea2f0e8b..c8e676a5e 100644 --- a/pkg/host/mock/mock_host.go +++ b/pkg/host/mock/mock_host.go @@ -440,6 +440,21 @@ func (mr *MockHostManagerInterfaceMockRecorder) GetPhysSwitchID(name interface{} return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetPhysSwitchID", reflect.TypeOf((*MockHostManagerInterface)(nil).GetPhysSwitchID), name) } +// GetRDMASubsystem mocks base method. +func (m *MockHostManagerInterface) GetRDMASubsystem() (string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetRDMASubsystem") + ret0, _ := ret[0].(string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetRDMASubsystem indicates an expected call of GetRDMASubsystem. +func (mr *MockHostManagerInterfaceMockRecorder) GetRDMASubsystem() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetRDMASubsystem", reflect.TypeOf((*MockHostManagerInterface)(nil).GetRDMASubsystem)) +} + // GetVfInfo mocks base method. func (m *MockHostManagerInterface) GetVfInfo(pciAddr string, devices []*ghw.PCIDevice) v1.VirtualFunction { m.ctrl.T.Helper() @@ -862,6 +877,20 @@ func (mr *MockHostManagerInterfaceMockRecorder) SetNicSriovMode(pciAddr, mode in return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetNicSriovMode", reflect.TypeOf((*MockHostManagerInterface)(nil).SetNicSriovMode), pciAddr, mode) } +// SetRDMASubsystem mocks base method. +func (m *MockHostManagerInterface) SetRDMASubsystem(mode string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SetRDMASubsystem", mode) + ret0, _ := ret[0].(error) + return ret0 +} + +// SetRDMASubsystem indicates an expected call of SetRDMASubsystem. +func (mr *MockHostManagerInterfaceMockRecorder) SetRDMASubsystem(mode interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetRDMASubsystem", reflect.TypeOf((*MockHostManagerInterface)(nil).SetRDMASubsystem), mode) +} + // SetSriovNumVfs mocks base method. func (m *MockHostManagerInterface) SetSriovNumVfs(pciAddr string, numVfs int) error { m.ctrl.T.Helper() diff --git a/pkg/host/types/interfaces.go b/pkg/host/types/interfaces.go index 48d47b424..fde51304f 100644 --- a/pkg/host/types/interfaces.go +++ b/pkg/host/types/interfaces.go @@ -69,6 +69,10 @@ type KernelInterface interface { EnableRDMA(conditionFilePath, serviceName, packageManager string) (bool, error) // InstallRDMA install RDMA packages on the system InstallRDMA(packageManager string) error + // GetRDMASubsystem returns RDMA subsystem mode + GetRDMASubsystem() (string, error) + // SetRDMASubsystem changes RDMA subsystem mode + SetRDMASubsystem(mode string) error // EnableRDMAOnRHELMachine enable RDMA on a RHEL base system EnableRDMAOnRHELMachine() (bool, error) // GetOSPrettyName returns OS name diff --git a/pkg/utils/cluster.go b/pkg/utils/cluster.go index 6f8d72e07..da250b2a9 100644 --- a/pkg/utils/cluster.go +++ b/pkg/utils/cluster.go @@ -5,14 +5,16 @@ import ( "fmt" "os" - "sigs.k8s.io/controller-runtime/pkg/log" - configv1 "github.com/openshift/api/config/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" ) @@ -26,6 +28,14 @@ const ( controlPlaneNodeLabelKey = "node-role.kubernetes.io/control-plane" ) +var ( + oneNode = intstr.FromInt32(1) + defaultNpcl = &sriovnetworkv1.SriovNetworkPoolConfig{Spec: sriovnetworkv1.SriovNetworkPoolConfigSpec{ + MaxUnavailable: &oneNode, + NodeSelector: &metav1.LabelSelector{}, + RdmaMode: ""}} +) + func getNodeRole(node corev1.Node) string { for k := range node.Labels { if k == workerNodeLabelKey { @@ -161,3 +171,94 @@ func AnnotateNode(ctx context.Context, nodeName string, key, value string, c cli return AnnotateObject(ctx, node, key, value, c) } + +func FindNodePoolConfig(ctx context.Context, node *corev1.Node, c client.Client) (*sriovnetworkv1.SriovNetworkPoolConfig, []corev1.Node, error) { + logger := log.FromContext(ctx) + logger.Info("FindNodePoolConfig():") + // get all the sriov network pool configs + npcl := &sriovnetworkv1.SriovNetworkPoolConfigList{} + err := c.List(ctx, npcl) + if err != nil { + logger.Error(err, "failed to list sriovNetworkPoolConfig") + return nil, nil, err + } + + selectedNpcl := []*sriovnetworkv1.SriovNetworkPoolConfig{} + nodesInPools := map[string]interface{}{} + + for _, npc := range npcl.Items { + // we skip hw offload objects + if npc.Spec.OvsHardwareOffloadConfig.Name != "" { + continue + } + + if npc.Spec.NodeSelector == nil { + npc.Spec.NodeSelector = &metav1.LabelSelector{} + } + + selector, err := metav1.LabelSelectorAsSelector(npc.Spec.NodeSelector) + if err != nil { + logger.Error(err, "failed to create label selector from nodeSelector", "nodeSelector", npc.Spec.NodeSelector) + return nil, nil, err + } + + if selector.Matches(labels.Set(node.Labels)) { + selectedNpcl = append(selectedNpcl, npc.DeepCopy()) + } + + nodeList := &corev1.NodeList{} + err = c.List(ctx, nodeList, &client.ListOptions{LabelSelector: selector}) + if err != nil { + logger.Error(err, "failed to list all the nodes matching the pool with label selector from nodeSelector", + "machineConfigPoolName", npc, + "nodeSelector", npc.Spec.NodeSelector) + return nil, nil, err + } + + for _, nodeName := range nodeList.Items { + nodesInPools[nodeName.Name] = nil + } + } + + if len(selectedNpcl) > 1 { + // don't allow the node to be part of multiple pools + err = fmt.Errorf("node is part of more then one pool") + logger.Error(err, "multiple pools founded for a specific node", "numberOfPools", len(selectedNpcl), "pools", selectedNpcl) + return nil, nil, err + } else if len(selectedNpcl) == 1 { + // found one pool for our node + logger.V(2).Info("found sriovNetworkPool", "pool", *selectedNpcl[0]) + selector, err := metav1.LabelSelectorAsSelector(selectedNpcl[0].Spec.NodeSelector) + if err != nil { + logger.Error(err, "failed to create label selector from nodeSelector", "nodeSelector", selectedNpcl[0].Spec.NodeSelector) + return nil, nil, err + } + + // list all the nodes that are also part of this pool and return them + nodeList := &corev1.NodeList{} + err = c.List(ctx, nodeList, &client.ListOptions{LabelSelector: selector}) + if err != nil { + logger.Error(err, "failed to list nodes using with label selector", "labelSelector", selector) + return nil, nil, err + } + + return selectedNpcl[0], nodeList.Items, nil + } else { + // in this case we get all the nodes and remove the ones that already part of any pool + logger.V(1).Info("node doesn't belong to any pool, using default drain configuration with MaxUnavailable of one", "pool", *defaultNpcl) + nodeList := &corev1.NodeList{} + err = c.List(ctx, nodeList) + if err != nil { + logger.Error(err, "failed to list all the nodes") + return nil, nil, err + } + + defaultNodeLists := []corev1.Node{} + for _, nodeObj := range nodeList.Items { + if _, exist := nodesInPools[nodeObj.Name]; !exist { + defaultNodeLists = append(defaultNodeLists, nodeObj) + } + } + return defaultNpcl, defaultNodeLists, nil + } +}