Skip to content

Commit

Permalink
Merge pull request #733 from tobiasgiese/impl-mlx-fw-reset
Browse files Browse the repository at this point in the history
feat: implement MlxResetFW to reset the FW on VF changes
  • Loading branch information
SchSeba committed Aug 12, 2024
2 parents 2dec53f + 08b524e commit 88017db
Show file tree
Hide file tree
Showing 10 changed files with 104 additions and 5 deletions.
4 changes: 3 additions & 1 deletion Dockerfile.sriov-network-config-daemon
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ RUN make _build-sriov-network-config-daemon BIN_PATH=build/_output/cmd

FROM quay.io/centos/centos:stream9
ARG MSTFLINT=mstflint
RUN ARCH_DEP_PKGS=$(if [ "$(uname -m)" != "s390x" ]; then echo -n ${MSTFLINT} ; fi) && yum -y install hwdata $ARCH_DEP_PKGS && yum clean all
# We have to ensure that pciutils is installed. This package is needed for mstfwreset to succeed.
# xref pkg/vendors/mellanox/mellanox.go#L150
RUN ARCH_DEP_PKGS=$(if [ "$(uname -m)" != "s390x" ]; then echo -n ${MSTFLINT} ; fi) && yum -y install hwdata pciutils $ARCH_DEP_PKGS && yum clean all
LABEL io.k8s.display-name="sriov-network-config-daemon" \
io.k8s.description="This is a daemon that manage and config sriov network devices in Kubernetes cluster"
COPY --from=builder /go/src/github.com/k8snetworkplumbingwg/sriov-network-operator/build/_output/cmd/sriov-network-config-daemon /usr/bin/
Expand Down
15 changes: 15 additions & 0 deletions cmd/sriov-network-config-daemon/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"github.com/spf13/cobra"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
Expand All @@ -41,6 +42,7 @@ import (
snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/daemon"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms"
Expand Down Expand Up @@ -276,6 +278,18 @@ func runStartCmd(cmd *cobra.Command, args []string) error {
}
go nodeWriter.Run(stopCh, refreshCh, syncCh)

// Init feature gates once to prevent race conditions.
defaultConfig := &sriovnetworkv1.SriovOperatorConfig{}
err = kClient.Get(context.Background(), types.NamespacedName{Namespace: vars.Namespace, Name: consts.DefaultConfigName}, defaultConfig)
if err != nil {
log.Log.Error(err, "Failed to get default SriovOperatorConfig object")
return err
}
featureGates := featuregate.New()
featureGates.Init(defaultConfig.Spec.FeatureGates)
vars.MlxPluginFwReset = featureGates.IsEnabled(consts.MellanoxFirmwareResetFeatureGate)
log.Log.Info("Enabled featureGates", "featureGates", featureGates.String())

setupLog.V(0).Info("Starting SriovNetworkConfigDaemon")
err = daemon.New(
kClient,
Expand All @@ -288,6 +302,7 @@ func runStartCmd(cmd *cobra.Command, args []string) error {
syncCh,
refreshCh,
eventRecorder,
featureGates,
startOpts.disabledPlugins,
).Run(stopCh, exitCh)
if err != nil {
Expand Down
3 changes: 3 additions & 0 deletions pkg/consts/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ const (
// ManageSoftwareBridgesFeatureGate: enables management of software bridges by the operator
ManageSoftwareBridgesFeatureGate = "manageSoftwareBridges"

// MellanoxFirmwareResetFeatureGate: enables the firmware reset via mstfwreset before a reboot
MellanoxFirmwareResetFeatureGate = "mellanoxFirmwareReset"

// The path to the file on the host filesystem that contains the IB GUID distribution for IB VFs
InfinibandGUIDConfigFilePath = SriovConfBasePath + "/infiniband/guids"
)
Expand Down
14 changes: 14 additions & 0 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"math/rand"
"os/exec"
"reflect"
"sync"
"time"

Expand All @@ -23,6 +24,7 @@ import (
snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned"
sninformer "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/informers/externalversions"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms"
Expand Down Expand Up @@ -82,6 +84,8 @@ type Daemon struct {
workqueue workqueue.RateLimitingInterface

eventRecorder *EventRecorder

featureGate featuregate.FeatureGate
}

func New(
Expand All @@ -95,6 +99,7 @@ func New(
syncCh <-chan struct{},
refreshCh chan<- Message,
er *EventRecorder,
featureGates featuregate.FeatureGate,
disabledPlugins []string,
) *Daemon {
return &Daemon{
Expand All @@ -113,6 +118,7 @@ func New(
&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(updateDelay), 1)},
workqueue.NewItemExponentialFailureRateLimiter(1*time.Second, maxUpdateBackoff)), "SriovNetworkNodeState"),
eventRecorder: er,
featureGate: featureGates,
disabledPlugins: disabledPlugins,
}
}
Expand Down Expand Up @@ -286,6 +292,7 @@ func (dn *Daemon) operatorConfigAddHandler(obj interface{}) {
}

func (dn *Daemon) operatorConfigChangeHandler(old, new interface{}) {
oldCfg := old.(*sriovnetworkv1.SriovOperatorConfig)
newCfg := new.(*sriovnetworkv1.SriovOperatorConfig)
if newCfg.Namespace != vars.Namespace || newCfg.Name != consts.DefaultConfigName {
log.Log.V(2).Info("unsupported SriovOperatorConfig", "namespace", newCfg.Namespace, "name", newCfg.Name)
Expand All @@ -299,6 +306,13 @@ func (dn *Daemon) operatorConfigChangeHandler(old, new interface{}) {
dn.disableDrain = newDisableDrain
log.Log.Info("Set Disable Drain", "value", dn.disableDrain)
}

if !reflect.DeepEqual(oldCfg.Spec.FeatureGates, newCfg.Spec.FeatureGates) {
dn.featureGate.Init(newCfg.Spec.FeatureGates)
log.Log.Info("Updated featureGates", "featureGates", dn.featureGate.String())
}

vars.MlxPluginFwReset = dn.featureGate.IsEnabled(consts.MellanoxFirmwareResetFeatureGate)
}

func (dn *Daemon) nodeStateSyncHandler() error {
Expand Down
9 changes: 6 additions & 3 deletions pkg/daemon/daemon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,18 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log/zap"

sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
mock_platforms "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/mock"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem"

snclient "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned"
snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned/fake"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
mock_helper "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper/mock"
mock_platforms "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/mock"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/openshift"
plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins/fake"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins/generic"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem"
)

func TestConfigDaemon(t *testing.T) {
Expand Down Expand Up @@ -151,6 +151,8 @@ var _ = Describe("Config Daemon", func() {
vendorHelper.EXPECT().PrepareNMUdevRule([]string{"0x1014", "0x154c"}).Return(nil).AnyTimes()
vendorHelper.EXPECT().PrepareVFRepUdevRule().Return(nil).AnyTimes()

featureGates := featuregate.New()

sut = New(
kClient,
snclient,
Expand All @@ -162,6 +164,7 @@ var _ = Describe("Config Daemon", func() {
syncCh,
refreshCh,
er,
featureGates,
nil,
)

Expand Down
14 changes: 14 additions & 0 deletions pkg/helper/mock/mock_helper.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 14 additions & 1 deletion pkg/plugins/mellanox/mellanox_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
mlx "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vendors/mellanox"
)

Expand All @@ -20,6 +21,7 @@ type MellanoxPlugin struct {
helpers helper.HostHelpersInterface
}

var pciAddressesToReset []string
var attributesToChange map[string]mlx.MlxNic
var mellanoxNicsStatus map[string]map[string]sriovnetworkv1.InterfaceExt
var mellanoxNicsSpec map[string]sriovnetworkv1.Interface
Expand Down Expand Up @@ -52,6 +54,7 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS
needDrain = false
needReboot = false
err = nil
pciAddressesToReset = []string{}
attributesToChange = map[string]mlx.MlxNic{}
mellanoxNicsStatus = map[string]map[string]sriovnetworkv1.InterfaceExt{}
mellanoxNicsSpec = map[string]sriovnetworkv1.Interface{}
Expand Down Expand Up @@ -132,6 +135,10 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS
if needReboot || changeWithoutReboot {
attributesToChange[ifaceSpec.PciAddress] = *attrs
}

if needReboot {
pciAddressesToReset = append(pciAddressesToReset, ifaceSpec.PciAddress)
}
}

// Set total VFs to 0 for mellanox interfaces with no spec
Expand Down Expand Up @@ -202,7 +209,13 @@ func (p *MellanoxPlugin) Apply() error {
return nil
}
log.Log.Info("mellanox plugin Apply()")
return p.helpers.MlxConfigFW(attributesToChange)
if err := p.helpers.MlxConfigFW(attributesToChange); err != nil {
return err
}
if vars.MlxPluginFwReset {
return p.helpers.MlxResetFW(pciAddressesToReset)
}
return nil
}

// nicHasExternallyManagedPFs returns true if one of the ports(interface) of the NIC is marked as externally managed
Expand Down
3 changes: 3 additions & 0 deletions pkg/vars/vars.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ var (
// ManageSoftwareBridges global variable which reflects state of manageSoftwareBridges feature
ManageSoftwareBridges = false

// MlxPluginFwReset global variable enables mstfwreset before rebooting a node on VF changes
MlxPluginFwReset = false

// FilesystemRoot used by test to mock interactions with filesystem
FilesystemRoot = ""

Expand Down
18 changes: 18 additions & 0 deletions pkg/vendors/mellanox/mellanox.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strconv"
"strings"

kerrors "k8s.io/apimachinery/pkg/util/errors"
"sigs.k8s.io/controller-runtime/pkg/log"

sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
Expand Down Expand Up @@ -60,6 +61,7 @@ type MellanoxInterface interface {
GetMlxNicFwData(pciAddress string) (current, next *MlxNic, err error)

MlxConfigFW(attributesToChange map[string]MlxNic) error
MlxResetFW(pciAddresses []string) error
}

type mellanoxHelper struct {
Expand Down Expand Up @@ -141,6 +143,22 @@ func (m *mellanoxHelper) GetMellanoxBlueFieldMode(PciAddress string) (BlueFieldM
return -1, fmt.Errorf("MellanoxBlueFieldMode(): unknown device status for %s", PciAddress)
}

func (m *mellanoxHelper) MlxResetFW(pciAddresses []string) error {
log.Log.Info("mellanox-plugin resetFW()")
var errs []error
for _, pciAddress := range pciAddresses {
cmdArgs := []string{"-d", pciAddress, "--skip_driver", "-l", "3", "-y", "reset"}
log.Log.Info("mellanox-plugin: resetFW()", "cmd-args", cmdArgs)
// We have to ensure that pciutils is installed into the container image Dockerfile.sriov-network-config-daemon
_, stderr, err := m.utils.RunCommand("mstfwreset", cmdArgs...)
if err != nil {
log.Log.Error(err, "mellanox-plugin resetFW(): failed", "stderr", stderr)
errs = append(errs, err)
}
}
return kerrors.NewAggregate(errs)
}

func (m *mellanoxHelper) MlxConfigFW(attributesToChange map[string]MlxNic) error {
log.Log.Info("mellanox-plugin configFW()")
for pciAddr, fwArgs := range attributesToChange {
Expand Down
14 changes: 14 additions & 0 deletions pkg/vendors/mellanox/mock/mock_mellanox.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 88017db

Please sign in to comment.