This repository has been archived by the owner on Apr 2, 2023. It is now read-only.

Sla restart instances #12

Open · wants to merge 2 commits into master
5 changes: 5 additions & 0 deletions jobUpdate.go
@@ -284,3 +284,8 @@ func (j *JobUpdate) PartitionPolicy(reschedule bool, delay int64) *JobUpdate {
	})
	return j
}

func (j *JobUpdate) SlaPolicy(sla *aurora.SlaPolicy) *JobUpdate {
	j.task.task.SlaPolicy = sla
	return j
}
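
For context (not part of the diff), here is a minimal sketch of how the new builder method could be used from a caller. It assumes the same `realis`/`aurora` imports the rest of this repository already uses and a hypothetical existing task config `cfg`; `aurora.SlaPolicy` is a thrift union, so exactly one policy field is set:

```go
// Sketch only: build an SLA-aware update from an existing task config and
// attach a percentage-based policy via the new SlaPolicy builder method.
func buildSlaUpdate(cfg *aurora.TaskConfig) *realis.JobUpdate {
	return realis.JobUpdateFromConfig(cfg).
		InstanceCount(3).
		SlaAware(true).
		SlaPolicy(&aurora.SlaPolicy{
			PercentageSlaPolicy: &aurora.PercentageSlaPolicy{
				Percentage:   95, // keep at least 95% of instances healthy during the update
				DurationSecs: 1800,
			},
		})
}
```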
46 changes: 46 additions & 0 deletions realis.go
@@ -522,6 +522,52 @@ func (c *Client) RestartInstances(key aurora.JobKey, instances ...int32) error {
	return nil
}

// SlaRestartInstances restarts the specified instances in an SLA-aware manner.
// If slaPolicy is nil, the existing SlaPolicy of the job's task config is used.
func (c *Client) SlaRestartInstances(jobKey aurora.JobKey,
	slaPolicy *aurora.SlaPolicy,
	instances ...int32) (*aurora.StartJobUpdateResult_, error) {
	c.logger.DebugPrintf("SlaRestartInstances Thrift Payload: %v %+v %v\n", slaPolicy, jobKey, instances)

	if len(instances) == 0 {
		c.logger.DebugPrintf("no instances specified, skipping restart")
		return nil, nil
	}

	jobSummary, err := c.GetJobSummary(jobKey.Role)
	if err != nil {
		return nil, err
	}
	var jobConfig *aurora.JobConfiguration
	for _, s := range jobSummary.Summaries {
		if s.Job.Key.Environment == jobKey.Environment && s.Job.Key.Name == jobKey.Name {
			jobConfig = s.Job
		}
	}
	if jobConfig == nil {
		return nil, fmt.Errorf("failed to find %v", jobKey)
	}

	// create job update request
	jobUpdate := JobUpdateFromConfig(jobConfig.TaskConfig)
	jobUpdate.
		SlaAware(true).
		InstanceCount(jobConfig.InstanceCount)
Reviewer comment (Contributor):
We have to make a small change here so the scheduler actually sees the config as changed. Let's add a label called `io.github.aurora.restart` whose value is incremented by 1 from the previous value.
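
A possible shape for that suggestion, as a hedged sketch rather than a final implementation: it assumes the generated `aurora.TaskConfig` exposes its labels as a `[]*aurora.Metadata` of key/value pairs, uses the standard `strconv` package, and assumes that bumping the value is enough for the scheduler to register a config change. The helper name is an assumption, not part of this diff.

```go
// Sketch of the suggested label bump (helper name and field access are assumptions).
const restartLabel = "io.github.aurora.restart"

func bumpRestartLabel(task *aurora.TaskConfig) {
	for _, m := range task.Metadata {
		if m.Key == restartLabel {
			prev, err := strconv.Atoi(m.Value)
			if err != nil {
				prev = 0 // previous value was not numeric; restart the counter
			}
			m.Value = strconv.Itoa(prev + 1)
			return
		}
	}
	// Label not present yet: start the counter at 1.
	task.Metadata = append(task.Metadata, &aurora.Metadata{Key: restartLabel, Value: "1"})
}
```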

	if slaPolicy != nil {
		jobUpdate.SlaPolicy(slaPolicy)
	}
	for _, v := range instances {
		jobUpdate.AddInstanceRange(v, v)
Reviewer comment (Contributor):
I don't think we have to set this to anything if we're going to operate on all instances. However, one case here that's pretty interesting is a canary deployment. In fact, that may break this entire idea, since each task is tied to a different config in that case :/

	}

	msg := fmt.Sprintf("SlaRestartInstances %v-%v via StartJobUpdate", jobKey, instances)
	if result, err := c.StartJobUpdate(jobUpdate, fmt.Sprintf("restart instances %v", instances)); err != nil {
		return nil, errors.Wrap(err, msg)
	} else {
		return result, nil
	}
}
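
For reference, a hypothetical caller sketch (the `client` variable and job key values are assumptions, not taken from this diff, and the standard `log`/`fmt` packages are assumed to be imported); passing `nil` for the policy exercises the documented fall-back to the SlaPolicy already on the task config:

```go
// Restart instances 0 and 1 of a job in an SLA-aware way, reusing the
// SlaPolicy already attached to the job's task config.
key := aurora.JobKey{Role: "vagrant", Environment: "prod", Name: "hello_world"}

result, err := client.SlaRestartInstances(key, nil, 0, 1)
if err != nil {
	log.Fatalf("SLA-aware restart failed: %v", err)
}
if result != nil {
	fmt.Printf("update started with key %+v\n", result.GetKey())
}
```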

// Restarts all active tasks under a job configuration.
func (c *Client) RestartJob(key aurora.JobKey) error {

56 changes: 56 additions & 0 deletions realis_e2e_test.go
@@ -871,3 +871,59 @@ func TestRealisClient_GetJobSummary(t *testing.T) {

	assert.NoError(t, err)
}

func TestRealisClient_SlaRestartInstances(t *testing.T) {
	// Create a single job
	role := "vagrant"
	env := "prod"
	name := "slaRestartInstances"

	job := realis.NewJob().
		Environment(env).
		Role(role).
		Name(name).
		ThermosExecutor(thermosExec).
		CPU(.01).
		RAM(4).
		Disk(10).
		Tier("preferred").
		InstanceCount(3).
		IsService(true)

	// Needed to populate the task config correctly
	assert.NoError(t, job.BuildThermosPayload())

	var cpu = 3.5
	var ram int64 = 20480
	var disk int64 = 10240
	err := r.SetQuota(role, &cpu, &ram, &disk)
	assert.NoError(t, err)

	err = r.CreateJob(job)
	assert.NoError(t, err)

	// Wait until all instances are running.
	success, err := r.MonitorScheduleStatus(job.JobKey(),
		job.GetInstanceCount(),
		[]aurora.ScheduleStatus{aurora.ScheduleStatus_RUNNING},
		1*time.Second,
		150*time.Second)
	assert.True(t, success)
	assert.NoError(t, err)

	slaPolicy := &aurora.SlaPolicy{
		PercentageSlaPolicy: &aurora.PercentageSlaPolicy{
			Percentage:   50,
			DurationSecs: 0,
		},
	}

	t.Run("TestRealisClient_SlaRestartInstances", func(t *testing.T) {
		result, err := r.SlaRestartInstances(job.JobKey(), slaPolicy, 0)
		assert.NoError(t, err)
		assert.NotNil(t, result)

		assert.NoError(t, r.AbortJobUpdate(*result.GetKey(), "abort update to kill the job"))
		assert.NoError(t, r.KillJob(job.JobKey()))
	})
}