Use TSCs in place of Pod affinity/anti-affinity & always enforce them even if custom placement is specified #2720

Status: Open · wants to merge 1 commit into main
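At a glance, this commit replaces the default pod affinity/anti-affinity terms for the Ceph daemons with topology spread constraints (TSCs) and keeps enforcing those TSCs even when a custom placement is supplied. As a rough illustration only — this is not code from the PR — the constraint shape built by the new getTopologySpreadConstraintWithExpressions helper for the mon daemon is approximately the following; the empty TopologyKey is a deliberate placeholder that callers later replace with the failure domain key:

package placements_sketch // hypothetical package name, sketch only

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// exampleMonSpreadConstraint mirrors the defaults visible in the diff below:
// maxSkew 1, ScheduleAnyway, and an "app in (rook-ceph-mon)" label selector.
// The empty TopologyKey is filled in by the caller with the cluster's
// failure domain key (host, rack, or zone).
func exampleMonSpreadConstraint() corev1.TopologySpreadConstraint {
	return corev1.TopologySpreadConstraint{
		MaxSkew:           1,
		TopologyKey:       "",
		WhenUnsatisfiable: corev1.ScheduleAnyway,
		LabelSelector: &metav1.LabelSelector{
			MatchExpressions: []metav1.LabelSelectorRequirement{{
				Key:      "app",
				Operator: metav1.LabelSelectorOpIn,
				Values:   []string{"rook-ceph-mon"},
			}},
		},
	}
}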
199 changes: 51 additions & 148 deletions controllers/defaults/placements.go
@@ -12,204 +12,107 @@ var (
CsiPluginKey = "csi-plugin"
CsiProvisionerKey = "csi-provisioner"

// osdLabelSelector is the key in OSD pod. Used
// as a label selector for topology spread constraints.
osdLabelSelector = "rook-ceph-osd"
// osdPrepareLabelSelector is the key in OSD prepare pod. Used
// as a label selector for topology spread constraints.
osdPrepareLabelSelector = "rook-ceph-osd-prepare"
// appLabelSelectorKey is common value for 'Key' field in 'LabelSelectorRequirement'
appLabelSelectorKey = "app"
// mgrLabelSelector is the key in mgr pod, used for topology spread constraints.
mgrLabelSelector = "rook-ceph-mgr"
// monLabelSelector is the key in mon pod, used for topology spread constraints.
monLabelSelector = "rook-ceph-mon"
// osdLabelSelector is the key in OSD pod, used for topology spread constraints.
osdLabelSelector = "rook-ceph-osd"
// osdPrepareLabelSelector is the key in OSD prepare pod, used for topology spread constraints.
osdPrepareLabelSelector = "rook-ceph-osd-prepare"
// mdsLabelSelector is the key in mds pod, used for topology spread constraints.
mdsLabelSelector = "rook-ceph-mds"
// rgwLabelSelector is the key in rgw pod, used for topology spread constraints.
rgwLabelSelector = "rook-ceph-rgw"
// nfsLabelSelector is the key in nfs pod, used for topology spread constraints.
nfsLabelSelector = "rook-ceph-nfs"

// DefaultNodeAffinity is the NodeAffinity to be used when labelSelector is nil
DefaultNodeAffinity = &corev1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: getOcsNodeSelector(),
}
// DaemonPlacements map contains the default placement configs for the
// various OCS daemons
DaemonPlacements = map[string]rookCephv1.Placement{
"all": {
Tolerations: []corev1.Toleration{
getOcsToleration(),
// The empty topology key in TSCs must be replaced with the failure domain key by the caller.
// This enforces strict even distribution of pods across failure domains.

"mgr": {
TopologySpreadConstraints: []corev1.TopologySpreadConstraint{
getTopologySpreadConstraintWithExpressions(1, "", "ScheduleAnyway",
appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{mgrLabelSelector}),
},
},

"mon": {
PodAntiAffinity: &corev1.PodAntiAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
getPodAffinityTerm("rook-ceph-mon"),
},
TopologySpreadConstraints: []corev1.TopologySpreadConstraint{
getTopologySpreadConstraintWithExpressions(1, "", "ScheduleAnyway",
appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{monLabelSelector}),
},
},

"osd": {
Tolerations: []corev1.Toleration{
getOcsToleration(),
},
TopologySpreadConstraints: []corev1.TopologySpreadConstraint{
getTopologySpreadConstraintsSpec(1, []string{osdLabelSelector}),
getTopologySpreadConstraintWithExpressions(1, corev1.LabelHostname, "ScheduleAnyway",
appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{osdLabelSelector}),
},
},

"osd-prepare": {
Tolerations: []corev1.Toleration{
getOcsToleration(),
},
TopologySpreadConstraints: []corev1.TopologySpreadConstraint{
getTopologySpreadConstraintsSpec(1, []string{osdLabelSelector, osdPrepareLabelSelector}),
getTopologySpreadConstraintWithExpressions(1, corev1.LabelHostname, "ScheduleAnyway",
appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{osdLabelSelector, osdPrepareLabelSelector}),
},
},

"rgw": {
Tolerations: []corev1.Toleration{
getOcsToleration(),
},
PodAntiAffinity: &corev1.PodAntiAffinity{
PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{
getWeightedPodAffinityTerm(100, "rook-ceph-rgw"),
},
TopologySpreadConstraints: []corev1.TopologySpreadConstraint{
getTopologySpreadConstraintWithExpressions(1, "", "ScheduleAnyway",
appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{rgwLabelSelector}),
},
},

"mds": {
Tolerations: []corev1.Toleration{
getOcsToleration(),
},
PodAntiAffinity: &corev1.PodAntiAffinity{
PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{
// left the selector value empty as it will be updated later in the getPlacement()
},
TopologySpreadConstraints: []corev1.TopologySpreadConstraint{
getTopologySpreadConstraintWithExpressions(1, "", "ScheduleAnyway",
appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{mdsLabelSelector}),
},
},

"nfs": {
Tolerations: []corev1.Toleration{
getOcsToleration(),
},
PodAntiAffinity: &corev1.PodAntiAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{
getPodAffinityTerm("rook-ceph-nfs"),
},
},
},

"noobaa-core": {
Tolerations: []corev1.Toleration{
getOcsToleration(),
},
},

"noobaa-standalone": {
Tolerations: []corev1.Toleration{
getOcsToleration(),
},
},

"rbd-mirror": {
Tolerations: []corev1.Toleration{
getOcsToleration(),
},
},

APIServerKey: {
Tolerations: []corev1.Toleration{
getOcsToleration(),
},
},

MetricsExporterKey: {
Tolerations: []corev1.Toleration{
getOcsToleration(),
},
},

CsiPluginKey: {
Tolerations: []corev1.Toleration{
getOcsToleration(),
},
},

CsiProvisionerKey: {
Tolerations: []corev1.Toleration{
getOcsToleration(),
TopologySpreadConstraints: []corev1.TopologySpreadConstraint{
getTopologySpreadConstraintWithExpressions(1, "", "DoNotSchedule",
appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{nfsLabelSelector}),
},
},
}
)

// getTopologySpreadConstraintsSpec populates values required for topology spread constraints.
// TopologyKey gets updated in newStorageClassDeviceSets after determining it from determineFailureDomain.
func getTopologySpreadConstraintsSpec(maxSkew int32, valueLabels []string) corev1.TopologySpreadConstraint {
topologySpreadConstraints := corev1.TopologySpreadConstraint{
// getTopologySpreadConstraintWithExpressions constructs a TopologySpreadConstraint
// with the specified parameters for label-based topology spreading.
func getTopologySpreadConstraintWithExpressions(
maxSkew int32, topologyKey string, whenUnsatisfiable corev1.UnsatisfiableConstraintAction,
labelKey string, labelOperator metav1.LabelSelectorOperator, labelValues []string,
) corev1.TopologySpreadConstraint {
return corev1.TopologySpreadConstraint{
MaxSkew: maxSkew,
TopologyKey: corev1.LabelHostname,
WhenUnsatisfiable: "ScheduleAnyway",
LabelSelector: &metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{
{
Key: appLabelSelectorKey,
Operator: metav1.LabelSelectorOpIn,
Values: valueLabels,
},
},
},
}

return topologySpreadConstraints
}

func getWeightedPodAffinityTerm(weight int32, selectorValue ...string) corev1.WeightedPodAffinityTerm {
return corev1.WeightedPodAffinityTerm{
Weight: weight,
PodAffinityTerm: corev1.PodAffinityTerm{
LabelSelector: &metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{
{
Key: appLabelSelectorKey,
Operator: metav1.LabelSelectorOpIn,
Values: selectorValue,
},
},
},
TopologyKey: corev1.LabelHostname,
},
}
}

func GetMdsWeightedPodAffinityTerm(weight int32, selectorValue ...string) corev1.WeightedPodAffinityTerm {
return corev1.WeightedPodAffinityTerm{
Weight: weight,
PodAffinityTerm: corev1.PodAffinityTerm{
LabelSelector: &metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{
{
Key: "rook_file_system",
Operator: metav1.LabelSelectorOpIn,
Values: selectorValue,
},
},
},
TopologyKey: corev1.LabelHostname,
},
}
}

func getPodAffinityTerm(selectorValue ...string) corev1.PodAffinityTerm {
podAffinityTerm := corev1.PodAffinityTerm{
TopologyKey: topologyKey,
WhenUnsatisfiable: whenUnsatisfiable,
LabelSelector: &metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{
{
Key: appLabelSelectorKey,
Operator: metav1.LabelSelectorOpIn,
Values: selectorValue,
Key: labelKey,
Operator: labelOperator,
Values: labelValues,
},
},
},
TopologyKey: corev1.LabelHostname,
}
return podAffinityTerm
}

func getOcsToleration() corev1.Toleration {
func GetOcsToleration() corev1.Toleration {
toleration := corev1.Toleration{
Key: NodeTolerationKey,
Operator: corev1.TolerationOpEqual,
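The comment in the defaults above states that the empty topology key in the TSCs must be replaced with the failure domain key by the caller, but that substitution is not shown in this diff. The following is only a sketch of what such a caller-side helper could look like; the function name and loop are assumptions, not code from this repository:

import (
	rookCephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
)

// setEmptyTopologyKeys is a sketch: it fills in the failure domain key for
// any default TSC whose TopologyKey was intentionally left empty.
func setEmptyTopologyKeys(placement *rookCephv1.Placement, failureDomainKey string) {
	for i := range placement.TopologySpreadConstraints {
		if placement.TopologySpreadConstraints[i].TopologyKey == "" {
			placement.TopologySpreadConstraints[i].TopologyKey = failureDomainKey
		}
	}
}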
63 changes: 21 additions & 42 deletions controllers/storagecluster/cephcluster.go
@@ -461,6 +461,7 @@ func newCephCluster(sc *ocsv1.StorageCluster, cephImage string, kmsConfigMap *co
},
Placement: rookCephv1.PlacementSpec{
"all": getPlacement(sc, "all"),
"mgr": getPlacement(sc, "mgr"),
"mon": getPlacement(sc, "mon"),
"arbiter": getPlacement(sc, "arbiter"),
},
@@ -768,7 +769,6 @@ func getMonCount(sc *ocsv1.StorageCluster) int {
// newStorageClassDeviceSets converts a list of StorageDeviceSets into a list of Rook StorageClassDeviceSets
func newStorageClassDeviceSets(sc *ocsv1.StorageCluster) []rookCephv1.StorageClassDeviceSet {
storageDeviceSets := sc.Spec.StorageDeviceSets
topologyMap := sc.Status.NodeTopologies

var storageClassDeviceSets []rookCephv1.StorageClassDeviceSet

@@ -781,25 +781,13 @@ func newStorageClassDeviceSets(sc *ocsv1.StorageCluster) []rookCephv1.StorageCla
portable := ds.Portable

topologyKey := ds.TopologyKey
topologyKeyValues := []string{}
if topologyKey == "" {
topologyKey = sc.Status.FailureDomainKey
}

noPlacement := ds.Placement.NodeAffinity == nil && ds.Placement.PodAffinity == nil && ds.Placement.PodAntiAffinity == nil && ds.Placement.TopologySpreadConstraints == nil
noPreparePlacement := ds.PreparePlacement.NodeAffinity == nil && ds.PreparePlacement.PodAffinity == nil && ds.PreparePlacement.PodAntiAffinity == nil && ds.PreparePlacement.TopologySpreadConstraints == nil

if noPlacement {
if topologyKey == "" {
topologyKey = getFailureDomain(sc)
}

if topologyKey == "host" {
portable = false
}

if topologyMap != nil {
topologyKey, topologyKeyValues = topologyMap.GetKeyValues(topologyKey)
}
}

count, replica := countAndReplicaOf(&ds)
for i := 0; i < replica; i++ {
placement := rookCephv1.Placement{}
@@ -812,37 +800,28 @@ func newStorageClassDeviceSets(sc *ocsv1.StorageCluster) []rookCephv1.StorageCla
if noPreparePlacement {
in := getPlacement(sc, "osd-prepare")
(&in).DeepCopyInto(&preparePlacement)
}

if len(topologyKeyValues) >= getMinDeviceSetReplica(sc) {
// Hard constraints are set in OSD placement for portable volumes with rack failure domain
// domain as there is no node affinity in PVs. This restricts the movement of OSDs
// between failure domain.
if portable && !strings.Contains(topologyKey, "zone") {
addStrictFailureDomainTSC(&placement, topologyKey)
}
// If topologyKey is not host, append additional topology spread constraint to the
// default preparePlacement. This serves even distribution at the host level
// within a failure domain (zone/rack).
if noPreparePlacement {
if topologyKey != corev1.LabelHostname {
addStrictFailureDomainTSC(&preparePlacement, topologyKey)
} else {
preparePlacement.TopologySpreadConstraints[0].TopologyKey = topologyKey
}
}
}

if !noPreparePlacement {
} else {
preparePlacement = ds.PreparePlacement
}
} else if !noPlacement && noPreparePlacement {
preparePlacement = ds.Placement
placement = ds.Placement
} else {
preparePlacement = ds.PreparePlacement
placement = ds.Placement
if noPreparePlacement {
preparePlacement = ds.Placement
} else {
preparePlacement = ds.PreparePlacement
}
}

// for osd & osd-prepare we always need to add TSCs if not present, to ensure their even distribution
if len(placement.TopologySpreadConstraints) == 0 {
placement.TopologySpreadConstraints = defaults.DaemonPlacements["osd"].TopologySpreadConstraints
}
if len(preparePlacement.TopologySpreadConstraints) == 0 {
preparePlacement.TopologySpreadConstraints = defaults.DaemonPlacements["osd-prepare"].TopologySpreadConstraints
}
// Add another TSC which is a hard constraint which restricts the movement of OSDs between failure domains
addStrictFailureDomainTSC(&placement, topologyKey)
addStrictFailureDomainTSC(&preparePlacement, topologyKey)

// Annotation crushDeviceClass ensures osd with different CRUSH device class than the one detected by Ceph
crushDeviceClass := ds.DeviceType
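addStrictFailureDomainTSC is called above for both the OSD and OSD-prepare placements, but its body is not part of this diff. A plausible sketch, consistent with the comment that it appends a hard constraint restricting OSD movement between failure domains (the implementation details here are assumptions, not the PR's actual code):

// addStrictFailureDomainTSCSketch appends a DoNotSchedule spread constraint
// keyed on the failure domain, reusing the label selector of the existing
// default constraint. Note the shallow copy shares the LabelSelector pointer,
// which is acceptable here because the selector is not mutated.
func addStrictFailureDomainTSCSketch(placement *rookCephv1.Placement, topologyKey string) {
	if len(placement.TopologySpreadConstraints) == 0 {
		return
	}
	tsc := placement.TopologySpreadConstraints[0]
	tsc.TopologyKey = topologyKey
	tsc.WhenUnsatisfiable = corev1.DoNotSchedule
	placement.TopologySpreadConstraints = append(placement.TopologySpreadConstraints, tsc)
}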
11 changes: 1 addition & 10 deletions controllers/storagecluster/cephtoolbox.go
@@ -9,10 +9,8 @@ import (

nadclientset "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/clientset/versioned"
ocsv1 "github.com/red-hat-storage/ocs-operator/api/v4/v1"
"github.com/red-hat-storage/ocs-operator/v4/controllers/defaults"
"github.com/red-hat-storage/ocs-operator/v4/controllers/util"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
@@ -35,14 +33,7 @@ func (r *StorageClusterReconciler) ensureToolsDeployment(sc *ocsv1.StorageCluste
var isFound bool
namespace := sc.Namespace

tolerations := []corev1.Toleration{{
Key: defaults.NodeTolerationKey,
Operator: corev1.TolerationOpEqual,
Value: "true",
Effect: corev1.TaintEffectNoSchedule,
}}

tolerations = append(tolerations, getPlacement(sc, "toolbox").Tolerations...)
tolerations := getPlacement(sc, "toolbox").Tolerations

nodeAffinity := getPlacement(sc, "toolbox").NodeAffinity

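For the toolbox, the hard-coded toleration is dropped and the tolerations now come from getPlacement(sc, "toolbox"). Based on the block this commit removes and the GetOcsToleration function that is truncated in the placements.go diff, the default toleration is expected to look roughly like the sketch below; the literal key value is stated as an assumption:

// exampleOcsToleration sketches the default OCS toleration that
// getPlacement(sc, "toolbox") is expected to supply. Value and Effect are
// inferred from the toleration removed from cephtoolbox.go; the key is
// assumed to be defaults.NodeTolerationKey ("node.ocs.openshift.io/storage").
func exampleOcsToleration() corev1.Toleration {
	return corev1.Toleration{
		Key:      "node.ocs.openshift.io/storage",
		Operator: corev1.TolerationOpEqual,
		Value:    "true",
		Effect:   corev1.TaintEffectNoSchedule,
	}
}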