diff --git a/controllers/defaults/placements.go b/controllers/defaults/placements.go index 25d45a0e18..4d15c9a153 100644 --- a/controllers/defaults/placements.go +++ b/controllers/defaults/placements.go @@ -12,14 +12,23 @@ var ( CsiPluginKey = "csi-plugin" CsiProvisionerKey = "csi-provisioner" - // osdLabelSelector is the key in OSD pod. Used - // as a label selector for topology spread constraints. - osdLabelSelector = "rook-ceph-osd" - // osdPrepareLabelSelector is the key in OSD prepare pod. Used - // as a label selector for topology spread constraints. - osdPrepareLabelSelector = "rook-ceph-osd-prepare" // appLabelSelectorKey is common value for 'Key' field in 'LabelSelectorRequirement' appLabelSelectorKey = "app" + // mgrLabelSelector is the key in mgr pod, used for topology spread constraints. + mgrLabelSelector = "rook-ceph-mgr" + // monLabelSelector is the key in mon pod, used for topology spread constraints. + monLabelSelector = "rook-ceph-mon" + // osdLabelSelector is the key in OSD pod, used for topology spread constraints. + osdLabelSelector = "rook-ceph-osd" + // osdPrepareLabelSelector is the key in OSD prepare pod, used for topology spread constraints. + osdPrepareLabelSelector = "rook-ceph-osd-prepare" + // mdsLabelSelector is the key in mds pod, used for topology spread constraints. + mdsLabelSelector = "rook-ceph-mds" + // rgwLabelSelector is the key in rgw pod, used for topology spread constraints. + rgwLabelSelector = "rook-ceph-rgw" + // nfsLabelSelector is the key in nfs pod, used for topology spread constraints. + nfsLabelSelector = "rook-ceph-nfs" + // DefaultNodeAffinity is the NodeAffinity to be used when labelSelector is nil DefaultNodeAffinity = &corev1.NodeAffinity{ RequiredDuringSchedulingIgnoredDuringExecution: getOcsNodeSelector(), @@ -27,189 +36,83 @@ var ( // DaemonPlacements map contains the default placement configs for the // various OCS daemons DaemonPlacements = map[string]rookCephv1.Placement{ - "all": { - Tolerations: []corev1.Toleration{ - getOcsToleration(), + // The empty topology key in TSCs must be replaced with the failure domain key by the caller. + // This enforces strict even distribution of pods across failure domains. 
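
For illustration, a minimal self-contained sketch of the caller-side substitution this comment describes (the same loop is added to controllers/storagecluster/placement.go later in this patch); the failure-domain value shown here is only an example:

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

func main() {
	// Default constraints from defaults.DaemonPlacements are created with TopologyKey == "".
	tscs := []corev1.TopologySpreadConstraint{
		{MaxSkew: 1, TopologyKey: "", WhenUnsatisfiable: corev1.ScheduleAnyway},
	}

	// Example value only: in the operator this comes from sc.Status.FailureDomainKey.
	failureDomainKey := "topology.kubernetes.io/zone"

	for i := range tscs {
		if tscs[i].TopologyKey == "" {
			tscs[i].TopologyKey = failureDomainKey
		}
	}
	fmt.Println(tscs[0].TopologyKey) // topology.kubernetes.io/zone
}
```
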
+ + "mgr": { + TopologySpreadConstraints: []corev1.TopologySpreadConstraint{ + getTopologySpreadConstraintWithExpressions(1, "", "ScheduleAnyway", + appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{mgrLabelSelector}), }, }, "mon": { - PodAntiAffinity: &corev1.PodAntiAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{ - getPodAffinityTerm("rook-ceph-mon"), - }, + TopologySpreadConstraints: []corev1.TopologySpreadConstraint{ + getTopologySpreadConstraintWithExpressions(1, "", "ScheduleAnyway", + appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{monLabelSelector}), }, }, "osd": { - Tolerations: []corev1.Toleration{ - getOcsToleration(), - }, TopologySpreadConstraints: []corev1.TopologySpreadConstraint{ - getTopologySpreadConstraintsSpec(1, []string{osdLabelSelector}), + getTopologySpreadConstraintWithExpressions(1, corev1.LabelHostname, "ScheduleAnyway", + appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{osdLabelSelector}), }, }, "osd-prepare": { - Tolerations: []corev1.Toleration{ - getOcsToleration(), - }, TopologySpreadConstraints: []corev1.TopologySpreadConstraint{ - getTopologySpreadConstraintsSpec(1, []string{osdLabelSelector, osdPrepareLabelSelector}), + getTopologySpreadConstraintWithExpressions(1, corev1.LabelHostname, "ScheduleAnyway", + appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{osdLabelSelector, osdPrepareLabelSelector}), }, }, "rgw": { - Tolerations: []corev1.Toleration{ - getOcsToleration(), - }, - PodAntiAffinity: &corev1.PodAntiAffinity{ - PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{ - getWeightedPodAffinityTerm(100, "rook-ceph-rgw"), - }, + TopologySpreadConstraints: []corev1.TopologySpreadConstraint{ + getTopologySpreadConstraintWithExpressions(1, "", "ScheduleAnyway", + appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{rgwLabelSelector}), }, }, "mds": { - Tolerations: []corev1.Toleration{ - getOcsToleration(), - }, - PodAntiAffinity: &corev1.PodAntiAffinity{ - PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{ - // left the selector value empty as it will be updated later in the getPlacement() - }, + TopologySpreadConstraints: []corev1.TopologySpreadConstraint{ + getTopologySpreadConstraintWithExpressions(1, "", "ScheduleAnyway", + appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{mdsLabelSelector}), }, }, "nfs": { - Tolerations: []corev1.Toleration{ - getOcsToleration(), - }, - PodAntiAffinity: &corev1.PodAntiAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{ - getPodAffinityTerm("rook-ceph-nfs"), - }, - }, - }, - - "noobaa-core": { - Tolerations: []corev1.Toleration{ - getOcsToleration(), - }, - }, - - "noobaa-standalone": { - Tolerations: []corev1.Toleration{ - getOcsToleration(), - }, - }, - - "rbd-mirror": { - Tolerations: []corev1.Toleration{ - getOcsToleration(), - }, - }, - - APIServerKey: { - Tolerations: []corev1.Toleration{ - getOcsToleration(), - }, - }, - - MetricsExporterKey: { - Tolerations: []corev1.Toleration{ - getOcsToleration(), - }, - }, - - CsiPluginKey: { - Tolerations: []corev1.Toleration{ - getOcsToleration(), - }, - }, - - CsiProvisionerKey: { - Tolerations: []corev1.Toleration{ - getOcsToleration(), + TopologySpreadConstraints: []corev1.TopologySpreadConstraint{ + getTopologySpreadConstraintWithExpressions(1, "", "DoNotSchedule", + appLabelSelectorKey, metav1.LabelSelectorOpIn, []string{nfsLabelSelector}), }, }, } ) -// getTopologySpreadConstraintsSpec populates 
values required for topology spread constraints. -// TopologyKey gets updated in newStorageClassDeviceSets after determining it from determineFailureDomain. -func getTopologySpreadConstraintsSpec(maxSkew int32, valueLabels []string) corev1.TopologySpreadConstraint { - topologySpreadConstraints := corev1.TopologySpreadConstraint{ +// getTopologySpreadConstraintWithExpressions constructs a TopologySpreadConstraint +// with the specified parameters for label-based topology spreading. +func getTopologySpreadConstraintWithExpressions( + maxSkew int32, topologyKey string, whenUnsatisfiable corev1.UnsatisfiableConstraintAction, + labelKey string, labelOperator metav1.LabelSelectorOperator, labelValues []string, +) corev1.TopologySpreadConstraint { + return corev1.TopologySpreadConstraint{ MaxSkew: maxSkew, - TopologyKey: corev1.LabelHostname, - WhenUnsatisfiable: "ScheduleAnyway", - LabelSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: appLabelSelectorKey, - Operator: metav1.LabelSelectorOpIn, - Values: valueLabels, - }, - }, - }, - } - - return topologySpreadConstraints -} - -func getWeightedPodAffinityTerm(weight int32, selectorValue ...string) corev1.WeightedPodAffinityTerm { - return corev1.WeightedPodAffinityTerm{ - Weight: weight, - PodAffinityTerm: corev1.PodAffinityTerm{ - LabelSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: appLabelSelectorKey, - Operator: metav1.LabelSelectorOpIn, - Values: selectorValue, - }, - }, - }, - TopologyKey: corev1.LabelHostname, - }, - } -} - -func GetMdsWeightedPodAffinityTerm(weight int32, selectorValue ...string) corev1.WeightedPodAffinityTerm { - return corev1.WeightedPodAffinityTerm{ - Weight: weight, - PodAffinityTerm: corev1.PodAffinityTerm{ - LabelSelector: &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: "rook_file_system", - Operator: metav1.LabelSelectorOpIn, - Values: selectorValue, - }, - }, - }, - TopologyKey: corev1.LabelHostname, - }, - } -} - -func getPodAffinityTerm(selectorValue ...string) corev1.PodAffinityTerm { - podAffinityTerm := corev1.PodAffinityTerm{ + TopologyKey: topologyKey, + WhenUnsatisfiable: whenUnsatisfiable, LabelSelector: &metav1.LabelSelector{ MatchExpressions: []metav1.LabelSelectorRequirement{ { - Key: appLabelSelectorKey, - Operator: metav1.LabelSelectorOpIn, - Values: selectorValue, + Key: labelKey, + Operator: labelOperator, + Values: labelValues, }, }, }, - TopologyKey: corev1.LabelHostname, } - return podAffinityTerm } -func getOcsToleration() corev1.Toleration { +func GetOcsToleration() corev1.Toleration { toleration := corev1.Toleration{ Key: NodeTolerationKey, Operator: corev1.TolerationOpEqual, diff --git a/controllers/storagecluster/cephcluster.go b/controllers/storagecluster/cephcluster.go index 96304a05e0..374a3a84f5 100644 --- a/controllers/storagecluster/cephcluster.go +++ b/controllers/storagecluster/cephcluster.go @@ -461,6 +461,7 @@ func newCephCluster(sc *ocsv1.StorageCluster, cephImage string, kmsConfigMap *co }, Placement: rookCephv1.PlacementSpec{ "all": getPlacement(sc, "all"), + "mgr": getPlacement(sc, "mgr"), "mon": getPlacement(sc, "mon"), "arbiter": getPlacement(sc, "arbiter"), }, @@ -768,7 +769,6 @@ func getMonCount(sc *ocsv1.StorageCluster) int { // newStorageClassDeviceSets converts a list of StorageDeviceSets into a list of Rook StorageClassDeviceSets func newStorageClassDeviceSets(sc *ocsv1.StorageCluster) []rookCephv1.StorageClassDeviceSet { 
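
For reference, a standalone sketch of how the new helper is intended to be used. getTopologySpreadConstraintWithExpressions is unexported, so this example mirrors its shape locally instead of importing it; the argument values are the ones the defaults above pass for the mon and nfs daemons:

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Local mirror of the helper added in controllers/defaults/placements.go.
func topologySpreadConstraintWithExpressions(
	maxSkew int32, topologyKey string, whenUnsatisfiable corev1.UnsatisfiableConstraintAction,
	labelKey string, labelOperator metav1.LabelSelectorOperator, labelValues []string,
) corev1.TopologySpreadConstraint {
	return corev1.TopologySpreadConstraint{
		MaxSkew:           maxSkew,
		TopologyKey:       topologyKey,
		WhenUnsatisfiable: whenUnsatisfiable,
		LabelSelector: &metav1.LabelSelector{
			MatchExpressions: []metav1.LabelSelectorRequirement{{
				Key:      labelKey,
				Operator: labelOperator,
				Values:   labelValues,
			}},
		},
	}
}

func main() {
	// Soft spread for mons: topology key left empty, filled in later with the failure domain key.
	mon := topologySpreadConstraintWithExpressions(1, "", corev1.ScheduleAnyway,
		"app", metav1.LabelSelectorOpIn, []string{"rook-ceph-mon"})
	// Hard spread for nfs pods: stay unscheduled rather than unbalanced.
	nfs := topologySpreadConstraintWithExpressions(1, "", corev1.DoNotSchedule,
		"app", metav1.LabelSelectorOpIn, []string{"rook-ceph-nfs"})
	fmt.Println(mon.WhenUnsatisfiable, nfs.WhenUnsatisfiable)
}
```
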
storageDeviceSets := sc.Spec.StorageDeviceSets - topologyMap := sc.Status.NodeTopologies var storageClassDeviceSets []rookCephv1.StorageClassDeviceSet @@ -781,25 +781,13 @@ func newStorageClassDeviceSets(sc *ocsv1.StorageCluster) []rookCephv1.StorageCla portable := ds.Portable topologyKey := ds.TopologyKey - topologyKeyValues := []string{} + if topologyKey == "" { + topologyKey = sc.Status.FailureDomainKey + } noPlacement := ds.Placement.NodeAffinity == nil && ds.Placement.PodAffinity == nil && ds.Placement.PodAntiAffinity == nil && ds.Placement.TopologySpreadConstraints == nil noPreparePlacement := ds.PreparePlacement.NodeAffinity == nil && ds.PreparePlacement.PodAffinity == nil && ds.PreparePlacement.PodAntiAffinity == nil && ds.PreparePlacement.TopologySpreadConstraints == nil - if noPlacement { - if topologyKey == "" { - topologyKey = getFailureDomain(sc) - } - - if topologyKey == "host" { - portable = false - } - - if topologyMap != nil { - topologyKey, topologyKeyValues = topologyMap.GetKeyValues(topologyKey) - } - } - count, replica := countAndReplicaOf(&ds) for i := 0; i < replica; i++ { placement := rookCephv1.Placement{} @@ -812,37 +800,28 @@ func newStorageClassDeviceSets(sc *ocsv1.StorageCluster) []rookCephv1.StorageCla if noPreparePlacement { in := getPlacement(sc, "osd-prepare") (&in).DeepCopyInto(&preparePlacement) - } - - if len(topologyKeyValues) >= getMinDeviceSetReplica(sc) { - // Hard constraints are set in OSD placement for portable volumes with rack failure domain - // domain as there is no node affinity in PVs. This restricts the movement of OSDs - // between failure domain. - if portable && !strings.Contains(topologyKey, "zone") { - addStrictFailureDomainTSC(&placement, topologyKey) - } - // If topologyKey is not host, append additional topology spread constraint to the - // default preparePlacement. This serves even distribution at the host level - // within a failure domain (zone/rack). 
- if noPreparePlacement { - if topologyKey != corev1.LabelHostname { - addStrictFailureDomainTSC(&preparePlacement, topologyKey) - } else { - preparePlacement.TopologySpreadConstraints[0].TopologyKey = topologyKey - } - } - } - - if !noPreparePlacement { + } else { preparePlacement = ds.PreparePlacement } - } else if !noPlacement && noPreparePlacement { - preparePlacement = ds.Placement - placement = ds.Placement } else { - preparePlacement = ds.PreparePlacement placement = ds.Placement + if noPreparePlacement { + preparePlacement = ds.Placement + } else { + preparePlacement = ds.PreparePlacement + } + } + + // for osd & osd-prepare we always need to add TSCs if not present, to ensure their even distribution + if len(placement.TopologySpreadConstraints) == 0 { + placement.TopologySpreadConstraints = defaults.DaemonPlacements["osd"].TopologySpreadConstraints + } + if len(preparePlacement.TopologySpreadConstraints) == 0 { + preparePlacement.TopologySpreadConstraints = defaults.DaemonPlacements["osd-prepare"].TopologySpreadConstraints } + // Add another TSC which is a hard constraint which restricts the movement of OSDs between failure domains + addStrictFailureDomainTSC(&placement, topologyKey) + addStrictFailureDomainTSC(&preparePlacement, topologyKey) // Annotation crushDeviceClass ensures osd with different CRUSH device class than the one detected by Ceph crushDeviceClass := ds.DeviceType diff --git a/controllers/storagecluster/cephtoolbox.go b/controllers/storagecluster/cephtoolbox.go index cff86c1775..9aabb9778d 100644 --- a/controllers/storagecluster/cephtoolbox.go +++ b/controllers/storagecluster/cephtoolbox.go @@ -9,10 +9,8 @@ import ( nadclientset "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/clientset/versioned" ocsv1 "github.com/red-hat-storage/ocs-operator/api/v4/v1" - "github.com/red-hat-storage/ocs-operator/v4/controllers/defaults" "github.com/red-hat-storage/ocs-operator/v4/controllers/util" appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -35,14 +33,7 @@ func (r *StorageClusterReconciler) ensureToolsDeployment(sc *ocsv1.StorageCluste var isFound bool namespace := sc.Namespace - tolerations := []corev1.Toleration{{ - Key: defaults.NodeTolerationKey, - Operator: corev1.TolerationOpEqual, - Value: "true", - Effect: corev1.TaintEffectNoSchedule, - }} - - tolerations = append(tolerations, getPlacement(sc, "toolbox").Tolerations...) 
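
addStrictFailureDomainTSC itself is not part of this diff. As a rough, assumed sketch only (not the repository's actual implementation), the hard constraint it appends to the OSD placements in the cephcluster.go hunk above might look like this:

```go
package main

import (
	"fmt"

	rookCephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Assumed shape of the hard failure-domain constraint: a DoNotSchedule spread over the
// failure domain key, selecting the OSD pods. Illustration only, not the real helper.
func addStrictFailureDomainTSCSketch(placement *rookCephv1.Placement, topologyKey string) {
	placement.TopologySpreadConstraints = append(placement.TopologySpreadConstraints,
		corev1.TopologySpreadConstraint{
			MaxSkew:           1,
			TopologyKey:       topologyKey,
			WhenUnsatisfiable: corev1.DoNotSchedule,
			LabelSelector: &metav1.LabelSelector{
				MatchExpressions: []metav1.LabelSelectorRequirement{{
					Key:      "app",
					Operator: metav1.LabelSelectorOpIn,
					Values:   []string{"rook-ceph-osd"},
				}},
			},
		})
}

func main() {
	p := rookCephv1.Placement{}
	addStrictFailureDomainTSCSketch(&p, "topology.kubernetes.io/zone")
	fmt.Println(len(p.TopologySpreadConstraints), p.TopologySpreadConstraints[0].WhenUnsatisfiable)
}
```
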
+ tolerations := getPlacement(sc, "toolbox").Tolerations nodeAffinity := getPlacement(sc, "toolbox").NodeAffinity diff --git a/controllers/storagecluster/placement.go b/controllers/storagecluster/placement.go index e251deb716..4af9b90148 100644 --- a/controllers/storagecluster/placement.go +++ b/controllers/storagecluster/placement.go @@ -11,56 +11,13 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -// getPlacement returns placement configuration for ceph components with appropriate topology +// getPlacement returns placement configuration for specified component func getPlacement(sc *ocsv1.StorageCluster, component string) rookCephv1.Placement { - placement := rookCephv1.Placement{} - in, ok := sc.Spec.Placement[rookCephv1.KeyType(component)] - if ok { - (&in).DeepCopyInto(&placement) - } else { - in := defaults.DaemonPlacements[component] - (&in).DeepCopyInto(&placement) - // label rook_file_system is added to the mds pod using rook operator - if component == "mds" { - placement.PodAntiAffinity = &corev1.PodAntiAffinity{ - PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{ - defaults.GetMdsWeightedPodAffinityTerm(100, generateNameForCephFilesystem(sc)), - }, - } - } - } - - // ignore default PodAntiAffinity mon placement when arbiter is enabled - if component == "mon" && arbiterEnabled(sc) { - placement.PodAntiAffinity = &corev1.PodAntiAffinity{} - } - - if component == "arbiter" { - if !sc.Spec.Arbiter.DisableMasterNodeToleration { - placement.Tolerations = append(placement.Tolerations, corev1.Toleration{ - Key: "node-role.kubernetes.io/master", - Operator: corev1.TolerationOpExists, - Effect: corev1.TaintEffectNoSchedule, - }) - } - return placement - } + // Fetch placement spec specified for the component from the StorageCluster CR + placement := sc.Spec.Placement[rookCephv1.KeyType(component)] - // if provider-server placements are found in the storagecluster spec append the default ocs tolerations to it - if ok && component == defaults.APIServerKey { - placement.Tolerations = append(placement.Tolerations, defaults.DaemonPlacements[component].Tolerations...) - return placement - } - - // if metrics-exporter placements are found in the storagecluster spec append the default ocs tolerations to it - if ok && component == defaults.MetricsExporterKey { - placement.Tolerations = append(placement.Tolerations, defaults.DaemonPlacements[component].Tolerations...) - return placement - } - - // If no placement is specified for the given component and the - // StorageCluster has no label selector, set the default node - // affinity. + // If no node affinity is specified for the given component and the StorageCluster has no label selector + // Set the default node affinity. 
if placement.NodeAffinity == nil && sc.Spec.LabelSelector == nil { placement.NodeAffinity = defaults.DefaultNodeAffinity } @@ -74,24 +31,29 @@ func getPlacement(sc *ocsv1.StorageCluster, component string) rookCephv1.Placeme } } - topologyMap := sc.Status.NodeTopologies - if topologyMap == nil { - return placement + // Add the ocs toleration to the placement of any component + placement.Tolerations = append(placement.Tolerations, defaults.GetOcsToleration()) + + // Add the master node toleration to the placement of the arbiter component if it is not disabled + if component == "arbiter" { + if !sc.Spec.Arbiter.DisableMasterNodeToleration { + placement.Tolerations = append(placement.Tolerations, corev1.Toleration{ + Key: "node-role.kubernetes.io/master", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }) + } } - topologyKey := getFailureDomain(sc) - topologyKey, _ = topologyMap.GetKeyValues(topologyKey) - if component == "mon" || component == "mds" || component == "rgw" { - if placement.PodAntiAffinity != nil { - if placement.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution != nil { - for i := range placement.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution { - placement.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution[i].PodAffinityTerm.TopologyKey = topologyKey - } - } - if placement.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil { - for i := range placement.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution { - placement.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution[i].TopologyKey = topologyKey - } + // for these ceph-daemons we always need to add topology spread constraints if not present, to ensure their even distribution + if component == "mgr" || component == "mon" || component == "osd" || component == "osd-prepare" || component == "mds" || component == "rgw" || component == "nfs" { + if len(placement.TopologySpreadConstraints) == 0 { + placement.TopologySpreadConstraints = defaults.DaemonPlacements[component].TopologySpreadConstraints + } + // if the topology key is empty, set it to the failure domain key of the cluster + for i := range placement.TopologySpreadConstraints { + if placement.TopologySpreadConstraints[i].TopologyKey == "" { + placement.TopologySpreadConstraints[i].TopologyKey = sc.Status.FailureDomainKey } } }
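
Putting the placement.go changes together, a condensed standalone sketch of the new flow for a single component; the toleration key is assumed to match defaults.NodeTolerationKey and the failure-domain key is only an example:

```go
package main

import (
	"fmt"

	rookCephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// placementFor mirrors the new getPlacement flow in a simplified form: start from the CR
// override (if any), always append the OCS toleration, fall back to the default spread
// constraints, and fill empty topology keys with the cluster's failure domain key.
// Sketch only, not the operator's actual function.
func placementFor(override *rookCephv1.Placement, defaultTSCs []corev1.TopologySpreadConstraint,
	failureDomainKey string, ocsToleration corev1.Toleration) rookCephv1.Placement {

	placement := rookCephv1.Placement{}
	if override != nil {
		override.DeepCopyInto(&placement)
	}

	// The OCS toleration is now appended unconditionally for every component.
	placement.Tolerations = append(placement.Tolerations, ocsToleration)

	if len(placement.TopologySpreadConstraints) == 0 {
		placement.TopologySpreadConstraints = defaultTSCs
	}
	for i := range placement.TopologySpreadConstraints {
		if placement.TopologySpreadConstraints[i].TopologyKey == "" {
			placement.TopologySpreadConstraints[i].TopologyKey = failureDomainKey
		}
	}
	return placement
}

func main() {
	monDefault := []corev1.TopologySpreadConstraint{{
		MaxSkew:           1,
		WhenUnsatisfiable: corev1.ScheduleAnyway,
		LabelSelector:     &metav1.LabelSelector{MatchLabels: map[string]string{"app": "rook-ceph-mon"}},
	}}
	ocsToleration := corev1.Toleration{
		Key:      "node.ocs.openshift.io/storage", // assumed value of defaults.NodeTolerationKey
		Operator: corev1.TolerationOpEqual,
		Value:    "true",
		Effect:   corev1.TaintEffectNoSchedule,
	}
	p := placementFor(nil, monDefault, "topology.kubernetes.io/zone", ocsToleration)
	fmt.Println(p.TopologySpreadConstraints[0].TopologyKey, len(p.Tolerations))
}
```
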