Fix per-pc resource limits

armadaproject · Jun 22, 2023 · f11417a · f11417a
1 parent f869cea
commit f11417a
Show file tree

Hide file tree

Showing 19 changed files with 768 additions and 596 deletions.
diff --git a/internal/armada/configuration/types.go b/internal/armada/configuration/types.go
@@ -233,12 +233,8 @@ type PriorityClass struct {
 	Priority int32
 	// If true, Armada may preempt jobs of this class to improve fairness.
 	Preemptible bool
-	// Limits resources assigned to jobs of priority equal to or lower than that of this priority class.
+	// Limits resources assigned to jobs of this priority class.
 	// Specifically, jobs of this priority class are only scheduled if doing so does not exceed this limit.
-	//
-	// For example, if priority is 10 and MaximumResourceFractionPerQueue is map[string]float64{"cpu": 0.3},
-	// jobs of this priority class are not scheduled if doing so would cause the total resources assigned
-	// to jobs of priority 10 or lower from the same queue to exceed 30% of the total.
 	MaximumResourceFractionPerQueue map[string]float64
 	// Per-pool override of MaximumResourceFractionPerQueue.
 	// If missing for a particular pool, MaximumResourceFractionPerQueue is used instead for that pool.

diff --git a/internal/armada/server/lease.go b/internal/armada/server/lease.go
@@ -40,6 +40,7 @@ import (
 	schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints"
 	schedulercontext "github.com/armadaproject/armada/internal/scheduler/context"
 	"github.com/armadaproject/armada/internal/scheduler/database"
+	"github.com/armadaproject/armada/internal/scheduler/interfaces"
 	schedulerinterfaces "github.com/armadaproject/armada/internal/scheduler/interfaces"
 	"github.com/armadaproject/armada/internal/scheduler/nodedb"
 	"github.com/armadaproject/armada/internal/scheduler/schedulerobjects"
@@ -282,10 +283,31 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 		})
 	}
 
+	// Map queue names to priority factor for all active queues, i.e.,
+	// all queues for which the jobs queue has not been deleted automatically by Redis.
+	queues, err := q.queueRepository.GetAllQueues()
+	if err != nil {
+		return nil, err
+	}
+	priorityFactorByQueue := make(map[string]float64, len(queues))
+	apiQueues := make([]*api.Queue, len(queues))
+	for i, queue := range queues {
+		priorityFactorByQueue[queue.Name] = float64(queue.PriorityFactor)
+		apiQueues[i] = &api.Queue{Name: queue.Name}
+	}
+	activeQueues, err := q.jobRepository.FilterActiveQueues(apiQueues)
+	if err != nil {
+		return nil, err
+	}
+	priorityFactorByActiveQueue := make(map[string]float64, len(activeQueues))
+	for _, queue := range activeQueues {
+		priorityFactorByActiveQueue[queue.Name] = priorityFactorByQueue[queue.Name]
+	}
+
 	// Nodes to be considered by the scheduler.
 	lastSeen := q.clock.Now()
 	nodes := make([]*schedulerobjects.Node, 0, len(req.Nodes))
-	allocatedByQueueForCluster := make(map[string]schedulerobjects.QuantityByPriorityAndResourceType)
+	allocatedByQueueAndPriorityClassForCluster := make(map[string]schedulerobjects.QuantityByTAndResourceType[string], len(queues))
 	jobIdsByGangId := make(map[string]map[string]bool)
 	gangIdByJobId := make(map[string]string)
 	nodeIdByJobId := make(map[string]string)
@@ -332,11 +354,9 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 		}
 
 		// Aggregate total resources allocated by queue for this cluster.
-		allocatedByQueueForCluster = scheduler.UpdateUsage(
-			allocatedByQueueForCluster,
-			jobs,
-			q.schedulingConfig.Preemption.PriorityClasses,
-			scheduler.Add,
+		allocatedByQueueAndPriorityClassForCluster = updateAllocatedByQueueAndPriorityClass(
+			allocatedByQueueAndPriorityClassForCluster,
+			add, jobs,
 		)
 
 		// Group gangs.
@@ -398,30 +418,36 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 		return nil, err
 	}
 
-	// Load executor reports for all clusters, and insert an updated report for this cluster.
+	// Load allocation reports for all executors from Redis.
 	reportsByExecutor, err := q.usageRepository.GetClusterQueueResourceUsage()
 	if err != nil {
 		return nil, err
 	}
-	executorReport := &schedulerobjects.ClusterResourceUsageReport{
+
+	// Insert an updated report for the current executor, which includes information received in this lease call.
+	currentExecutorReport := &schedulerobjects.ClusterResourceUsageReport{
 		Pool:             req.Pool,
 		Created:          q.clock.Now(),
-		ResourcesByQueue: make(map[string]*schedulerobjects.QueueClusterResourceUsage),
+		ResourcesByQueue: make(map[string]*schedulerobjects.QueueClusterResourceUsage, len(queues)),
 	}
-	for queue, allocated := range allocatedByQueueForCluster {
-		executorReport.ResourcesByQueue[queue] = &schedulerobjects.QueueClusterResourceUsage{
-			Created:             executorReport.Created,
-			Queue:               queue,
-			ExecutorId:          req.ClusterId,
-			ResourcesByPriority: allocated.DeepCopy(),
+	for queue, allocatedByPriorityClass := range allocatedByQueueAndPriorityClassForCluster {
+		currentExecutorReport.ResourcesByQueue[queue] = &schedulerobjects.QueueClusterResourceUsage{
+			Created:                      currentExecutorReport.Created,
+			Queue:                        queue,
+			ExecutorId:                   req.ClusterId,
+			ResourcesByPriorityClassName: armadamaps.DeepCopy(allocatedByPriorityClass),
 		}
 	}
-	reportsByExecutor[req.ClusterId] = executorReport
-	if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, executorReport); err != nil {
+	reportsByExecutor[req.ClusterId] = currentExecutorReport
+
+	// Write the updated report into Redis to make the information available to other replicas of the server.
+	if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, currentExecutorReport); err != nil {
 		return nil, errors.WithMessagef(err, "failed to update cluster usage for cluster %s", req.ClusterId)
 	}
-	allocatedByQueueForPool := q.aggregateUsage(reportsByExecutor, req.Pool)
-	log.Infof("allocated resources per queue for pool %s before scheduling: %v", req.Pool, allocatedByQueueForPool)
+
+	// Aggregate allocation across all clusters.
+	allocatedByQueueAndPriorityClassForPool := q.aggregateAllocationAcrossExecutor(reportsByExecutor, req.Pool)
+	log.Infof("allocated resources per queue for pool %s before scheduling: %v", req.Pool, allocatedByQueueAndPriorityClassForPool)
 
 	// Store executor details in Redis so they can be used by submit checks and the new scheduler.
 	if err := q.executorRepository.StoreExecutor(ctx, &schedulerobjects.Executor{
@@ -435,29 +461,7 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 		log.WithError(err).Warnf("could not store executor details for cluster %s", req.ClusterId)
 	}
 
-	// Map queue names to priority factor for all active queues, i.e.,
-	// all queues for which the jobs queue has not been deleted automatically by Redis.
-	queues, err := q.queueRepository.GetAllQueues()
-	if err != nil {
-		return nil, err
-	}
-	priorityFactorByQueue := make(map[string]float64, len(queues))
-	apiQueues := make([]*api.Queue, len(queues))
-	for i, queue := range queues {
-		priorityFactorByQueue[queue.Name] = float64(queue.PriorityFactor)
-		apiQueues[i] = &api.Queue{Name: queue.Name}
-	}
-	activeQueues, err := q.jobRepository.FilterActiveQueues(apiQueues)
-	if err != nil {
-		return nil, err
-	}
-	priorityFactorByActiveQueue := make(map[string]float64, len(activeQueues))
-	for _, queue := range activeQueues {
-		priorityFactorByActiveQueue[queue.Name] = priorityFactorByQueue[queue.Name]
-	}
-
-	// Give Schedule() a 3 second shorter deadline than ctx,
-	// to give it a chance to finish up before ctx is cancelled.
+	// Give Schedule() a deadline 3 seconds earlier than that of ctx, so that it can finish up before ctx expires.
 	if deadline, ok := ctx.Deadline(); ok {
 		var cancel context.CancelFunc
 		ctx, cancel = context.WithDeadline(ctx, deadline.Add(-3*time.Second))
@@ -473,7 +477,7 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 		schedulerobjects.ResourceList{Resources: totalCapacity},
 	)
 	for queue, priorityFactor := range priorityFactorByQueue {
-		if err := sctx.AddQueueSchedulingContext(queue, priorityFactor, allocatedByQueueForPool[queue]); err != nil {
+		if err := sctx.AddQueueSchedulingContext(queue, priorityFactor, allocatedByQueueAndPriorityClassForPool[queue]); err != nil {
 			return nil, err
 		}
 	}
@@ -629,38 +633,37 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 	}
 
 	// Update resource cluster report to account for preempted/leased jobs and write it to Redis.
-	allocatedByQueueForCluster = scheduler.UpdateUsage(
-		allocatedByQueueForCluster,
-		result.PreemptedJobs,
-		q.schedulingConfig.Preemption.PriorityClasses,
-		scheduler.Subtract,
+	allocatedByQueueAndPriorityClassForCluster = updateAllocatedByQueueAndPriorityClass(
+		allocatedByQueueAndPriorityClassForCluster,
+		subtract, result.PreemptedJobs,
 	)
-	for queue, m := range allocatedByQueueForCluster {
+	for queue, m := range allocatedByQueueAndPriorityClassForCluster {
 		// Any quantity in the negative indicates a resource accounting problem.
-		if !m.IsStrictlyNonNegative() {
-			log.Errorf("unexpected negative resource quantity for queue %s: %v", queue, m)
+		for _, rl := range m {
+			if !rl.IsStrictlyNonNegative() {
+				return nil, errors.Errorf("unexpected negative resource quantity for queue %s: %v", queue, m)
+			}
 		}
 	}
-	allocatedByQueueForCluster = scheduler.UpdateUsage(
-		allocatedByQueueForCluster,
-		successfullyLeasedApiJobs,
-		q.schedulingConfig.Preemption.PriorityClasses,
-		scheduler.Add,
+	allocatedByQueueAndPriorityClassForCluster = updateAllocatedByQueueAndPriorityClass(
+		allocatedByQueueAndPriorityClassForCluster,
+		add, successfullyLeasedApiJobs,
 	)
-	executorReport.Created = q.clock.Now()
-	for queue, usage := range allocatedByQueueForCluster {
-		executorReport.ResourcesByQueue[queue] = &schedulerobjects.QueueClusterResourceUsage{
-			Created:             executorReport.Created,
-			Queue:               queue,
-			ExecutorId:          req.ClusterId,
-			ResourcesByPriority: usage.DeepCopy(),
+	currentExecutorReport.Created = q.clock.Now()
+	for queue, usage := range allocatedByQueueAndPriorityClassForCluster {
+		currentExecutorReport.ResourcesByQueue[queue] = &schedulerobjects.QueueClusterResourceUsage{
+			Created:                      currentExecutorReport.Created,
+			Queue:                        queue,
+			ExecutorId:                   req.ClusterId,
+			ResourcesByPriorityClassName: armadamaps.DeepCopy(usage),
 		}
 	}
-	if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, executorReport); err != nil {
+	if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, currentExecutorReport); err != nil {
 		logging.WithStacktrace(log, err).Errorf("failed to update cluster usage")
 	}
-	allocatedByQueueForPool = q.aggregateUsage(reportsByExecutor, req.Pool)
-	log.Infof("allocated resources per queue for pool %s after scheduling: %v", req.Pool, allocatedByQueueForPool)
+
+	allocatedByQueueAndPriorityClassForPool = q.aggregateAllocationAcrossExecutor(reportsByExecutor, req.Pool)
+	log.Infof("allocated resources per queue for pool %s after scheduling: %v", req.Pool, allocatedByQueueAndPriorityClassForPool)
 
 	// Optionally set node id selectors on scheduled jobs.
 	if q.schedulingConfig.Preemption.SetNodeIdSelector {
@@ -742,31 +745,65 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL
 	return successfullyLeasedApiJobs, nil
 }
 
-// aggregateUsage Creates a map of resource usage first by cluster and then by queue.
-// Clusters in pools other than pool are excluded.
-func (q *AggregatedQueueServer) aggregateUsage(reportsByCluster map[string]*schedulerobjects.ClusterResourceUsageReport, pool string) map[string]schedulerobjects.QuantityByPriorityAndResourceType {
-	const activeClusterExpiry = 10 * time.Minute
+type addOrSubtract int
+
+const (
+	add addOrSubtract = iota
+	subtract
+)
+
+func updateAllocatedByQueueAndPriorityClass[T interfaces.LegacySchedulerJob](allocatedByQueueAndPriorityClass map[string]schedulerobjects.QuantityByTAndResourceType[string], op addOrSubtract, jobs []T) map[string]schedulerobjects.QuantityByTAndResourceType[string] {
+	if allocatedByQueueAndPriorityClass == nil {
+		allocatedByQueueAndPriorityClass = make(map[string]schedulerobjects.QuantityByTAndResourceType[string], 256)
+	}
+	for _, job := range jobs {
+		allocatedByPriorityClassName := allocatedByQueueAndPriorityClass[job.GetQueue()]
+		if allocatedByPriorityClassName == nil {
+			allocatedByPriorityClassName = make(map[string]schedulerobjects.ResourceList)
+			allocatedByQueueAndPriorityClass[job.GetQueue()] = allocatedByPriorityClassName
+		}
+		allocated := allocatedByPriorityClassName[job.GetPriorityClassName()]
+		if op == add {
+			allocated.AddV1ResourceList(job.GetResourceRequirements().Requests)
+		} else if op == subtract {
+			allocated.SubV1ResourceList(job.GetResourceRequirements().Requests)
+		} else {
+			panic(fmt.Sprintf("unknown op %d", op))
+		}
+		allocatedByPriorityClassName[job.GetPriorityClassName()] = allocated
+	}
+	return allocatedByQueueAndPriorityClass
+}
+
+func (q *AggregatedQueueServer) aggregateAllocationAcrossExecutor(reportsByExecutor map[string]*schedulerobjects.ClusterResourceUsageReport, pool string) map[string]schedulerobjects.QuantityByTAndResourceType[string] {
 	now := q.clock.Now()
-	aggregatedUsageByQueue := make(map[string]schedulerobjects.QuantityByPriorityAndResourceType)
-	for _, clusterReport := range reportsByCluster {
-		if clusterReport.Pool != pool {
-			// Separate resource accounting per pool.
+	allocatedByQueueAndPriorityClass := make(map[string]schedulerobjects.QuantityByTAndResourceType[string])
+	for _, executorReport := range reportsByExecutor {
+		if executorReport.Pool != pool {
+			// Only consider executors in the specified pool.
 			continue
 		}
-		if !clusterReport.Created.Add(activeClusterExpiry).After(now) {
-			// Stale report; omit.
-			continue
+		if q.schedulingConfig.ExecutorTimeout != 0 {
+			reportAge := now.Sub(executorReport.Created)
+			if reportAge > q.schedulingConfig.ExecutorTimeout {
+				// Stale report; omit.
+				continue
+			}
 		}
-		for queue, report := range clusterReport.ResourcesByQueue {
-			quantityByPriorityAndResourceType, ok := aggregatedUsageByQueue[queue]
-			if !ok {
-				quantityByPriorityAndResourceType = make(schedulerobjects.QuantityByPriorityAndResourceType)
-				aggregatedUsageByQueue[queue] = quantityByPriorityAndResourceType
+		for queue, queueReport := range executorReport.ResourcesByQueue {
+			allocatedByPriorityClass := allocatedByQueueAndPriorityClass[queue]
+			if allocatedByPriorityClass == nil {
+				allocatedByPriorityClass = make(map[string]schedulerobjects.ResourceList)
+				allocatedByQueueAndPriorityClass[queue] = allocatedByPriorityClass
+			}
+			for priorityClassName, allocated := range queueReport.ResourcesByPriorityClassName {
+				rl := allocatedByPriorityClass[priorityClassName]
+				rl.Add(allocated)
+				allocatedByPriorityClass[priorityClassName] = rl
 			}
-			quantityByPriorityAndResourceType.Add(report.ResourcesByPriority)
 		}
 	}
-	return aggregatedUsageByQueue
+	return allocatedByQueueAndPriorityClass
 }
 
 func (q *AggregatedQueueServer) decompressJobOwnershipGroups(jobs []*api.Job) error {

diff --git a/internal/executor/utilisation/cluster_utilisation.go b/internal/executor/utilisation/cluster_utilisation.go
@@ -281,8 +281,8 @@ func groupPodsByNodes(pods []*v1.Pod) map[string][]*v1.Pod {
 	return podsByNodes
 }
 
-func allocatedByPriorityAndResourceTypeFromPods(pods []*v1.Pod) schedulerobjects.QuantityByPriorityAndResourceType {
-	rv := make(schedulerobjects.QuantityByPriorityAndResourceType)
+func allocatedByPriorityAndResourceTypeFromPods(pods []*v1.Pod) schedulerobjects.QuantityByTAndResourceType[int32] {
+	rv := make(schedulerobjects.QuantityByTAndResourceType[int32])
 	for _, pod := range pods {
 		var priority int32 = 0
 		if pod.Spec.Priority != nil {

diff --git a/internal/scheduler/common.go b/internal/scheduler/common.go
@@ -116,46 +116,6 @@ func JobsSummary(jobs []interfaces.LegacySchedulerJob) string {
 	)
 }
 
-type AddOrSubtract int
-
-const (
-	Add AddOrSubtract = iota
-	Subtract
-)
-
-func UpdateUsage[S ~[]E, E interfaces.LegacySchedulerJob](
-	usage map[string]schedulerobjects.QuantityByPriorityAndResourceType,
-	jobs S,
-	priorityClasses map[string]configuration.PriorityClass,
-	addOrSubtract AddOrSubtract,
-) map[string]schedulerobjects.QuantityByPriorityAndResourceType {
-	if usage == nil {
-		usage = make(map[string]schedulerobjects.QuantityByPriorityAndResourceType)
-	}
-	for _, job := range jobs {
-		req := PodRequirementFromLegacySchedulerJob(job, priorityClasses)
-		if req == nil {
-			continue
-		}
-		requests := schedulerobjects.ResourceListFromV1ResourceList(req.ResourceRequirements.Requests)
-		queue := job.GetQueue()
-		m := usage[queue]
-		if m == nil {
-			m = make(schedulerobjects.QuantityByPriorityAndResourceType)
-		}
-		switch addOrSubtract {
-		case Add:
-			m.Add(schedulerobjects.QuantityByPriorityAndResourceType{req.Priority: requests})
-		case Subtract:
-			m.Sub(schedulerobjects.QuantityByPriorityAndResourceType{req.Priority: requests})
-		default:
-			panic(fmt.Sprintf("invalid operation %d", addOrSubtract))
-		}
-		usage[queue] = m
-	}
-	return usage
-}
-
 func jobSchedulingContextsFromJobs[T interfaces.LegacySchedulerJob](jobs []T, executorId string, priorityClasses map[string]configuration.PriorityClass) []*schedulercontext.JobSchedulingContext {
 	if jobs == nil {
 		return nil