Skip to content

Commit

Permalink
Merge branch 'master' into fix_pod_issue_handling_pods_hanging
Browse files Browse the repository at this point in the history
  • Loading branch information
JamesMurkin authored Jun 27, 2023
2 parents 2fe4596 + 7542cd0 commit fc86e58
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 35 deletions.
14 changes: 14 additions & 0 deletions internal/scheduler/jobdb/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,20 @@ func NewJob(
cancelled bool,
created int64,
) *Job {
// Initialise the annotation and nodeSelector maps if nil.
// Since those need to be mutated in-place.
if schedulingInfo != nil {
for _, req := range schedulingInfo.ObjectRequirements {
if podReq := req.GetPodRequirements(); podReq != nil {
if podReq.Annotations == nil {
podReq.Annotations = make(map[string]string)
}
if podReq.NodeSelector == nil {
podReq.NodeSelector = make(map[string]string)
}
}
}
}
return &Job{
id: jobId,
jobset: jobset,
Expand Down
17 changes: 3 additions & 14 deletions internal/scheduler/preempting_queue_scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -861,22 +861,11 @@ func defaultPostEvictFunc(ctx context.Context, job interfaces.LegacySchedulerJob
}

// Add node selector ensuring this job is only re-scheduled onto the node it was evicted from.
req := PodRequirementFromLegacySchedulerJob(job, nil)
if req.NodeSelector == nil {
nodeSelector := job.GetNodeSelector()
if nodeSelector == nil {
log := ctxlogrus.Extract(ctx)
log.Errorf("error evicting job %s: nodeSelector not initialised", job.GetId())
} else {
req.NodeSelector[schedulerconfig.NodeIdLabel] = node.Id
nodeSelector[schedulerconfig.NodeIdLabel] = node.Id
}

// Add a toleration to allow the job to be re-scheduled even if node is unschedulable.
//
// TODO: Because req is allocated by GetJobSchedulingInfo() if job is an api.Job, this toleration may not persist.
// In practice, this isn't an issue now since we don't check static requirements for evicted jobs.
if node.Unschedulable {
req.Tolerations = append(req.Tolerations, nodedb.UnschedulableToleration())
}

// We've changed the scheduling requirements and must clear any cached key.
req.ClearCachedSchedulingKey()
}
2 changes: 0 additions & 2 deletions internal/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -344,8 +344,6 @@ func (s *Scheduler) createSchedulingInfoWithNodeAntiAffinityForAttemptedRuns(job
}
}
podSchedulingRequirement.Affinity = newAffinity
podSchedulingRequirement.ClearCachedSchedulingKey()

return newSchedulingInfo, nil
}

Expand Down
19 changes: 0 additions & 19 deletions internal/scheduler/schedulerobjects/podutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,25 +212,6 @@ func (skg *PodRequirementsSerialiser) AppendResourceList(out []byte, resourceLis
return out
}

// ClearCachedSchedulingKey clears any cached scheduling keys.
// Necessary after changing scheduling requirements to avoid inconsistency.
func (jobSchedulingInfo *JobSchedulingInfo) ClearCachedSchedulingKey() {
if jobSchedulingInfo == nil {
return
}
for _, objReq := range jobSchedulingInfo.ObjectRequirements {
if req := objReq.GetPodRequirements(); req != nil {
req.ClearCachedSchedulingKey()
}
}
}

// ClearCachedSchedulingKey clears any cached scheduling key.
// Necessary after changing scheduling requirements to avoid inconsistency.
func (req *PodRequirements) ClearCachedSchedulingKey() {
req.CachedSchedulingKey = nil
}

func lessToleration(a, b v1.Toleration) bool {
if a.Key < b.Key {
return true
Expand Down

0 comments on commit fc86e58

Please sign in to comment.