diff --git a/internal/scheduler/configuration/configuration.go b/internal/scheduler/configuration/configuration.go
index 9469009e3d7..6d6780bfd02 100644
--- a/internal/scheduler/configuration/configuration.go
+++ b/internal/scheduler/configuration/configuration.go
@@ -85,9 +85,6 @@ type MetricsConfig struct {
 	// Allowed characters in resource names are [a-zA-Z_:][a-zA-Z0-9_:]*
 	// It can also be used to track multiple resources within the same metric, e.g., "nvidia.com/gpu" and "amd.com/gpu".
 	ResourceRenaming map[v1.ResourceName]string
-	// Controls the cycle time metrics.
-	// TODO(albin): Not used yet.
-	CycleTimeConfig PrometheusSummaryConfig
 	// The first matching regex of each error message is cached in an LRU cache.
 	// This setting controls the cache size.
 	MatchedRegexIndexByErrorMessageCacheSize uint64
@@ -95,22 +92,6 @@ type MetricsConfig struct {
 	ResetInterval time.Duration
 }
 
-// PrometheusSummaryConfig contains the relevant config for a prometheus.Summary.
-type PrometheusSummaryConfig struct {
-	// Objectives defines the quantile rank estimates with their respective
-	// absolute error. If Objectives[q] = e, then the value reported for q
-	// will be the φ-quantile value for some φ between q-e and q+e. The
-	// default value is an empty map, resulting in a summary without
-	// quantiles.
-	Objectives map[float64]float64
-
-	// MaxAge defines the duration for which an observation stays relevant
-	// for the summary. Only applies to pre-calculated quantiles, does not
-	// apply to _sum and _count. Must be positive. The default value is
-	// DefMaxAge.
-	MaxAge time.Duration
-}
-
 type LeaderConfig struct {
 	// Valid modes are "standalone" or "kubernetes"
 	Mode string `validate:"required"`
@@ -233,13 +214,6 @@ type SchedulingConfig struct {
 	MaxRetries uint
 	// List of resource names, e.g., []string{"cpu", "memory"}, to consider when computing DominantResourceFairness.
 	DominantResourceFairnessResourcesToConsider []string
-	// Once a node has been found on which a pod can be scheduled,
-	// the scheduler will consider up to the next maxExtraNodesToConsider nodes.
-	// The scheduler selects the node with the best score out of the considered nodes.
-	// In particular, the score expresses whether preemption is necessary to schedule a pod.
-	// Hence, a larger MaxExtraNodesToConsider would reduce the expected number of preemptions.
-	// TODO(albin): Remove. It's unused.
-	MaxExtraNodesToConsider uint
 	// Resource types (e.g. memory or nvidia.com/gpu) that the scheduler keeps track of.
 	// Resource types not on this list will be ignored if seen on a node, and any jobs requesting them will fail.
 	SupportedResourceTypes []ResourceType
diff --git a/internal/scheduler/gang_scheduler_test.go b/internal/scheduler/gang_scheduler_test.go
index 1f47641a2f8..67c2086baae 100644
--- a/internal/scheduler/gang_scheduler_test.go
+++ b/internal/scheduler/gang_scheduler_test.go
@@ -509,7 +509,6 @@ func TestGangScheduler(t *testing.T) {
 			}
 			nodeDb, err := nodedb.NewNodeDb(
 				tc.SchedulingConfig.PriorityClasses,
-				tc.SchedulingConfig.MaxExtraNodesToConsider,
 				tc.SchedulingConfig.IndexedResources,
 				tc.SchedulingConfig.IndexedTaints,
 				tc.SchedulingConfig.IndexedNodeLabels,
diff --git a/internal/scheduler/nodedb/nodedb.go b/internal/scheduler/nodedb/nodedb.go
index 034371ba2f8..1259c930933 100644
--- a/internal/scheduler/nodedb/nodedb.go
+++ b/internal/scheduler/nodedb/nodedb.go
@@ -157,14 +157,6 @@ type EvictedJobSchedulingContext struct {
 type NodeDb struct {
 	// In-memory database storing *Node.
 	db *memdb.MemDB
-	// Once a node has been found on which a pod can be scheduled,
-	// the NodeDb will consider up to the next maxExtraNodesToConsider nodes.
-	// The NodeDb selects the node with the best score out of the considered nodes.
-	// In particular, the score expresses whether preemption is necessary to schedule a pod.
-	// Hence, a larger maxExtraNodesToConsider would reduce the expected number of preemptions.
-	//
-	// TODO: Currently gives no benefit. Since all nodes are given the same score.
-	maxExtraNodesToConsider uint
 	// Allowed priority classes.
 	// Because the number of database indices scales linearly with the number of distinct priorities,
 	// the efficiency of the NodeDb relies on the number of distinct priorities being small.
@@ -245,7 +237,6 @@ type NodeDb struct {
 
 func NewNodeDb(
 	priorityClasses map[string]types.PriorityClass,
-	maxExtraNodesToConsider uint,
 	indexedResources []configuration.ResourceType,
 	indexedTaints []string,
 	indexedNodeLabels []string,
@@ -297,7 +288,6 @@ func NewNodeDb(
 	nodeDb := NodeDb{
 		priorityClasses:           priorityClasses,
 		nodeDbPriorities:          nodeDbPriorities,
-		maxExtraNodesToConsider:   maxExtraNodesToConsider,
 		indexedResources:          indexedResourceNames,
 		indexedResourcesSet:       mapFromSlice(indexedResourceNames),
 		indexedResourceResolution: indexedResourceResolution,
@@ -778,42 +768,27 @@ func (nodeDb *NodeDb) selectNodeForPodWithItAtPriority(
 	onlyCheckDynamicRequirements bool,
 ) (*internaltypes.Node, error) {
 	var selectedNode *internaltypes.Node
-	var selectedNodeScore int
-	var numExtraNodes uint
 	for obj := it.Next(); obj != nil; obj = it.Next() {
-		if selectedNode != nil {
-			numExtraNodes++
-			if numExtraNodes > nodeDb.maxExtraNodesToConsider {
-				break
-			}
-		}
-
 		node := obj.(*internaltypes.Node)
 		if node == nil {
 			return nil, nil
 		}
 
 		var matches bool
-		var score int
 		var reason PodRequirementsNotMetReason
 		var err error
 		if onlyCheckDynamicRequirements {
-			matches, score, reason = DynamicJobRequirementsMet(node.AllocatableByPriority[priority], jctx)
+			matches, reason = DynamicJobRequirementsMet(node.AllocatableByPriority[priority], jctx)
 		} else {
-			matches, score, reason, err = JobRequirementsMet(node, priority, jctx)
+			matches, reason, err = JobRequirementsMet(node, priority, jctx)
 		}
 		if err != nil {
 			return nil, err
 		}
 
 		if matches {
-			if selectedNode == nil || score > selectedNodeScore {
-				selectedNode = node
-				selectedNodeScore = score
-				if selectedNodeScore == SchedulableBestScore {
-					break
-				}
-			}
+			selectedNode = node
+			break
 		} else {
 			s := nodeDb.stringFromPodRequirementsNotMetReason(reason)
 			jctx.PodSchedulingContext.NumExcludedNodesByReason[s] += 1
@@ -873,7 +848,7 @@ func (nodeDb *NodeDb) selectNodeForJobWithFairPreemption(txn *memdb.Txn, jctx *s
 			if priority > maxPriority {
 				maxPriority = priority
 			}
-			matches, _, reason, err := JobRequirementsMet(
+			matches, reason, err := JobRequirementsMet(
 				node,
 				// At this point, we've unbound the jobs running on the node.
 				// Hence, we should check if the job is schedulable at evictedPriority,
diff --git a/internal/scheduler/nodedb/nodedb_test.go b/internal/scheduler/nodedb/nodedb_test.go
index 12a578c7b12..b996cde278e 100644
--- a/internal/scheduler/nodedb/nodedb_test.go
+++ b/internal/scheduler/nodedb/nodedb_test.go
@@ -554,7 +554,6 @@ func TestScheduleMany(t *testing.T) {
 func TestAwayNodeTypes(t *testing.T) {
 	nodeDb, err := NewNodeDb(
 		testfixtures.TestPriorityClasses,
-		testfixtures.TestMaxExtraNodesToConsider,
 		testfixtures.TestResources,
 		testfixtures.TestIndexedTaints,
 		testfixtures.TestIndexedNodeLabels,
@@ -722,7 +721,6 @@ func TestMakeIndexedResourceResolution_ErrorsOnInvalidResolution(t *testing.T) {
 func benchmarkUpsert(nodes []*schedulerobjects.Node, b *testing.B) {
 	nodeDb, err := NewNodeDb(
 		testfixtures.TestPriorityClasses,
-		testfixtures.TestMaxExtraNodesToConsider,
 		testfixtures.TestResources,
 		testfixtures.TestIndexedTaints,
 		testfixtures.TestIndexedNodeLabels,
@@ -763,7 +761,6 @@ func BenchmarkUpsert100000(b *testing.B) {
 func benchmarkScheduleMany(b *testing.B, nodes []*schedulerobjects.Node, jobs []*jobdb.Job) {
 	nodeDb, err := NewNodeDb(
 		testfixtures.TestPriorityClasses,
-		testfixtures.TestMaxExtraNodesToConsider,
 		testfixtures.TestResources,
 		testfixtures.TestIndexedTaints,
 		testfixtures.TestIndexedNodeLabels,
@@ -890,7 +887,6 @@ func BenchmarkScheduleManyResourceConstrained(b *testing.B) {
 func newNodeDbWithNodes(nodes []*schedulerobjects.Node) (*NodeDb, error) {
 	nodeDb, err := NewNodeDb(
 		testfixtures.TestPriorityClasses,
-		testfixtures.TestMaxExtraNodesToConsider,
 		testfixtures.TestResources,
 		testfixtures.TestIndexedTaints,
 		testfixtures.TestIndexedNodeLabels,
diff --git a/internal/scheduler/nodedb/nodematching.go b/internal/scheduler/nodedb/nodematching.go
index 46864db151e..a068d9e1fbd 100644
--- a/internal/scheduler/nodedb/nodematching.go
+++ b/internal/scheduler/nodedb/nodematching.go
@@ -14,10 +14,6 @@ import (
 )
 
 const (
-	// When checking if a pod fits on a node, this score indicates how well the pods fits.
-	// However, all nodes are currently given the same score.
-	SchedulableScore     = 0
-	SchedulableBestScore = SchedulableScore
 	PodRequirementsNotMetReasonUnknown               = "unknown"
 	PodRequirementsNotMetReasonInsufficientResources = "insufficient resources available"
 )
@@ -156,16 +152,16 @@ func NodeTypeJobRequirementsMet(nodeType *schedulerobjects.NodeType, jctx *sched
 // - 1: Pod can be scheduled without preempting any running pods.
 // If the requirements are not met, it returns the reason why.
 // If the requirements can't be parsed, an error is returned.
-func JobRequirementsMet(node *internaltypes.Node, priority int32, jctx *schedulercontext.JobSchedulingContext) (bool, int, PodRequirementsNotMetReason, error) {
+func JobRequirementsMet(node *internaltypes.Node, priority int32, jctx *schedulercontext.JobSchedulingContext) (bool, PodRequirementsNotMetReason, error) {
 	matches, reason, err := StaticJobRequirementsMet(node, jctx)
 	if !matches || err != nil {
-		return matches, 0, reason, err
+		return matches, reason, err
 	}
-	matches, score, reason := DynamicJobRequirementsMet(node.AllocatableByPriority[priority], jctx)
+	matches, reason = DynamicJobRequirementsMet(node.AllocatableByPriority[priority], jctx)
 	if !matches {
-		return matches, 0, reason, nil
+		return matches, reason, nil
 	}
-	return true, score, nil, nil
+	return true, nil, nil
 }
 
 // StaticJobRequirementsMet checks if a job can be scheduled onto this node,
@@ -201,9 +197,9 @@ func StaticJobRequirementsMet(node *internaltypes.Node, jctx *schedulercontext.J
 
 // DynamicJobRequirementsMet checks if a pod can be scheduled onto this node,
 // accounting for resources allocated to pods already assigned to this node.
-func DynamicJobRequirementsMet(allocatableResources internaltypes.ResourceList, jctx *schedulercontext.JobSchedulingContext) (bool, int, PodRequirementsNotMetReason) {
+func DynamicJobRequirementsMet(allocatableResources internaltypes.ResourceList, jctx *schedulercontext.JobSchedulingContext) (bool, PodRequirementsNotMetReason) {
 	matches, reason := resourceRequirementsMet(allocatableResources, jctx.ResourceRequirements)
-	return matches, SchedulableScore, reason
+	return matches, reason
 }
 
 func TolerationRequirementsMet(taints []v1.Taint, tolerations ...[]v1.Toleration) (bool, PodRequirementsNotMetReason) {
diff --git a/internal/scheduler/nodedb/nodematching_test.go b/internal/scheduler/nodedb/nodematching_test.go
index bfa17420184..5ae54c65f8a 100644
--- a/internal/scheduler/nodedb/nodematching_test.go
+++ b/internal/scheduler/nodedb/nodematching_test.go
@@ -380,7 +380,7 @@ func TestNodeSchedulingRequirementsMet(t *testing.T) {
 	}
 	for name, tc := range tests {
 		t.Run(name, func(t *testing.T) {
-			matches, _, reason, err := JobRequirementsMet(
+			matches, reason, err := JobRequirementsMet(
 				tc.node,
 				tc.req.Priority,
 				// TODO(albin): Define a jctx in the test case instead.
diff --git a/internal/scheduler/queue_scheduler_test.go b/internal/scheduler/queue_scheduler_test.go
index 4d69b4a8865..5f9493e041b 100644
--- a/internal/scheduler/queue_scheduler_test.go
+++ b/internal/scheduler/queue_scheduler_test.go
@@ -725,7 +725,6 @@ func TestQueueScheduler(t *testing.T) {
 func NewNodeDb(config configuration.SchedulingConfig, stringInterner *stringinterner.StringInterner) (*nodedb.NodeDb, error) {
 	nodeDb, err := nodedb.NewNodeDb(
 		config.PriorityClasses,
-		config.MaxExtraNodesToConsider,
 		config.IndexedResources,
 		config.IndexedTaints,
 		config.IndexedNodeLabels,
diff --git a/internal/scheduler/scheduling_algo.go b/internal/scheduler/scheduling_algo.go
index a6e9ca4ab35..00587eb5cd9 100644
--- a/internal/scheduler/scheduling_algo.go
+++ b/internal/scheduler/scheduling_algo.go
@@ -393,7 +393,6 @@ func (l *FairSchedulingAlgo) scheduleOnExecutors(
 ) (*SchedulerResult, *schedulercontext.SchedulingContext, error) {
 	nodeDb, err := nodedb.NewNodeDb(
 		l.schedulingConfig.PriorityClasses,
-		l.schedulingConfig.MaxExtraNodesToConsider,
 		l.schedulingConfig.IndexedResources,
 		l.schedulingConfig.IndexedTaints,
 		l.schedulingConfig.IndexedNodeLabels,
diff --git a/internal/scheduler/scheduling_algo_test.go b/internal/scheduler/scheduling_algo_test.go
index f504e5d9b78..c0b2e07cd3d 100644
--- a/internal/scheduler/scheduling_algo_test.go
+++ b/internal/scheduler/scheduling_algo_test.go
@@ -530,7 +530,6 @@ func BenchmarkNodeDbConstruction(b *testing.B) {
 
 			nodeDb, err := nodedb.NewNodeDb(
 				schedulingConfig.PriorityClasses,
-				schedulingConfig.MaxExtraNodesToConsider,
 				schedulingConfig.IndexedResources,
 				schedulingConfig.IndexedTaints,
 				schedulingConfig.IndexedNodeLabels,
diff --git a/internal/scheduler/simulator/simulator.go b/internal/scheduler/simulator/simulator.go
index fa21182ed96..be72f9db882 100644
--- a/internal/scheduler/simulator/simulator.go
+++ b/internal/scheduler/simulator/simulator.go
@@ -234,7 +234,6 @@ func (s *Simulator) setupClusters() error {
 		for executorGroupIndex, executorGroup := range pool.ClusterGroups {
 			nodeDb, err := nodedb.NewNodeDb(
 				s.schedulingConfig.PriorityClasses,
-				s.schedulingConfig.MaxExtraNodesToConsider,
 				s.schedulingConfig.IndexedResources,
 				s.schedulingConfig.IndexedTaints,
 				s.schedulingConfig.IndexedNodeLabels,
diff --git a/internal/scheduler/submitcheck.go b/internal/scheduler/submitcheck.go
index ae1e6583f14..4d4a72b109b 100644
--- a/internal/scheduler/submitcheck.go
+++ b/internal/scheduler/submitcheck.go
@@ -268,7 +268,6 @@ func (srv *SubmitChecker) getSchedulingResult(gctx *schedulercontext.GangSchedul
 func (srv *SubmitChecker) constructNodeDb(executor *schedulerobjects.Executor) (*nodedb.NodeDb, error) {
 	nodeDb, err := nodedb.NewNodeDb(
 		srv.schedulingConfig.PriorityClasses,
-		0,
 		srv.schedulingConfig.IndexedResources,
 		srv.schedulingConfig.IndexedTaints,
 		srv.schedulingConfig.IndexedNodeLabels,
diff --git a/internal/scheduler/testfixtures/testfixtures.go b/internal/scheduler/testfixtures/testfixtures.go
index 2a41d81f95e..7fa0254504c 100644
--- a/internal/scheduler/testfixtures/testfixtures.go
+++ b/internal/scheduler/testfixtures/testfixtures.go
@@ -52,10 +52,9 @@ var (
 		"armada-preemptible-away": {Priority: 30000, Preemptible: true, AwayNodeTypes: []types.AwayNodeType{{Priority: 29000, WellKnownNodeTypeName: "gpu"}}},
 		"armada-preemptible":      {Priority: 30000, Preemptible: true},
 	}
-	TestDefaultPriorityClass         = PriorityClass3
-	TestPriorities                   = []int32{0, 1, 2, 3}
-	TestMaxExtraNodesToConsider uint = 1
-	TestResources                    = []schedulerconfiguration.ResourceType{
+	TestDefaultPriorityClass = PriorityClass3
+	TestPriorities           = []int32{0, 1, 2, 3}
+	TestResources            = []schedulerconfiguration.ResourceType{
 		{Name: "cpu", Resolution: resource.MustParse("1")},
 		{Name: "memory", Resolution: resource.MustParse("128Mi")},
 		{Name: "nvidia.com/gpu", Resolution: resource.MustParse("1")},
@@ -168,7 +167,6 @@ func TestSchedulingConfig() schedulerconfiguration.SchedulingConfig {
 		MaximumSchedulingBurst:         math.MaxInt,
 		MaximumPerQueueSchedulingRate:  math.Inf(1),
 		MaximumPerQueueSchedulingBurst: math.MaxInt,
-		MaxExtraNodesToConsider:        TestMaxExtraNodesToConsider,
 		IndexedResources:               TestResources,
 		IndexedNodeLabels:              TestIndexedNodeLabels,
 		IndexedTaints:                  TestIndexedTaints,