From ae1db4df67f6a472506b66b2b14008b7417926d2 Mon Sep 17 00:00:00 2001 From: Mauricio Araujo Date: Fri, 30 Sep 2022 11:54:23 -0500 Subject: [PATCH] Scale up correct number of instances (#7416) * Add helper function to compute instances to scale * Standardize types to int where possible, finish function to compute instances to scale * Fix bad rebase * Return 0 instead of 1 * Fix generate capacity map --- .../scaling_algorithms/default/assign_test.go | 8 ++--- .../scaling_algorithms/default/config.go | 8 ++--- .../scaling_algorithms/default/config_test.go | 8 ++--- .../scaling_algorithms/default/deploy.go | 9 +++-- .../scaling_algorithms/default/scaling.go | 9 ++--- .../scaling_algorithms/default/verify.go | 11 ++++-- .../scaling_algorithms/helpers/capacity.go | 36 +++++++++++++------ backend/services/subscriptions/types.go | 4 +-- 8 files changed, 58 insertions(+), 35 deletions(-) diff --git a/backend/services/scaling-service/scaling_algorithms/default/assign_test.go b/backend/services/scaling-service/scaling_algorithms/default/assign_test.go index 27f7f2d095b..f1619caa156 100644 --- a/backend/services/scaling-service/scaling_algorithms/default/assign_test.go +++ b/backend/services/scaling-service/scaling_algorithms/default/assign_test.go @@ -30,7 +30,7 @@ func TestMandelboxAssign(t *testing.T) { var tests = []struct { name string - capacity int64 + capacity int regions []string clientSHA, want string shouldBeAssigned bool @@ -72,7 +72,7 @@ func TestMandelboxAssign(t *testing.T) { Region: "us-east-1", IPAddress: "1.1.1.1/24", ClientSHA: "test-sha", - RemainingCapacity: int64(tt.capacity), + RemainingCapacity: tt.capacity, }, { ID: "test-assign-instance-2", @@ -83,7 +83,7 @@ func TestMandelboxAssign(t *testing.T) { Region: "us-west-1", IPAddress: "1.1.1.1/24", ClientSHA: "test-sha", - RemainingCapacity: int64(tt.capacity), + RemainingCapacity: tt.capacity, }, { ID: "test-assign-instance-3", @@ -94,7 +94,7 @@ func TestMandelboxAssign(t *testing.T) { Region: "ap-south-1", IPAddress: "1.1.1.1/24", ClientSHA: "test-sha", - RemainingCapacity: int64(tt.capacity), + RemainingCapacity: tt.capacity, }, } diff --git a/backend/services/scaling-service/scaling_algorithms/default/config.go b/backend/services/scaling-service/scaling_algorithms/default/config.go index 36e110f0cc4..3399dc57054 100644 --- a/backend/services/scaling-service/scaling_algorithms/default/config.go +++ b/backend/services/scaling-service/scaling_algorithms/default/config.go @@ -17,7 +17,7 @@ var ( // defaultInstanceBuffer is the number of instances with space to run // mandelboxes. This value is used when deciding how many instances to // scale up if we don't have enough capacity. - defaultInstanceBuffer = 1 + defaultInstanceBuffer int = 1 // desiredFreeMandelboxesPerRegion is the number of free mandelboxes we always // want available in a region. This value is set per-region and it represents // the free mandelboxes we want on each. @@ -90,9 +90,9 @@ var ( // generateInstanceCapacityMap uses the global instanceTypeToGPUNum and instanceTypeToVCPUNum maps // to generate the maximum mandelbox capacity for each instance type in the intersection // of their keys. -func generateInstanceCapacityMap(instanceToGPUMap, instanceToVCPUMap map[string]int) map[string]int64 { +func generateInstanceCapacityMap(instanceToGPUMap, instanceToVCPUMap map[string]int) map[string]int { // Initialize the instance capacity map - capacityMap := map[string]int64{} + capacityMap := map[string]int{} for instanceType, gpuNum := range instanceToGPUMap { // Only populate for instances that are in both maps vcpuNum, ok := instanceToVCPUMap[instanceType] @@ -100,7 +100,7 @@ func generateInstanceCapacityMap(instanceToGPUMap, instanceToVCPUMap map[string] continue } min := utils.Min(gpuNum*constants.MaxMandelboxesPerGPU, vcpuNum/VCPUsPerMandelbox) - capacityMap[instanceType] = int64(min) + capacityMap[instanceType] = min } return capacityMap } diff --git a/backend/services/scaling-service/scaling_algorithms/default/config_test.go b/backend/services/scaling-service/scaling_algorithms/default/config_test.go index f45b40f35e4..a84d4174f6c 100644 --- a/backend/services/scaling-service/scaling_algorithms/default/config_test.go +++ b/backend/services/scaling-service/scaling_algorithms/default/config_test.go @@ -14,7 +14,7 @@ func TestGenerateInstanceCapacityMap(t *testing.T) { name string gpuMap map[string]int vcpuMap map[string]int - expected map[string]int64 + expected map[string]int }{ { name: "gpu limited", @@ -26,7 +26,7 @@ func TestGenerateInstanceCapacityMap(t *testing.T) { "a": 1000, "b": 2000, }, - expected: map[string]int64{ + expected: map[string]int{ "a": constants.MaxMandelboxesPerGPU, "b": 2 * constants.MaxMandelboxesPerGPU, }, @@ -41,7 +41,7 @@ func TestGenerateInstanceCapacityMap(t *testing.T) { "a": 4, "b": 8, }, - expected: map[string]int64{ + expected: map[string]int{ "a": 4 / VCPUsPerMandelbox, "b": 8 / VCPUsPerMandelbox, }, @@ -56,7 +56,7 @@ func TestGenerateInstanceCapacityMap(t *testing.T) { "c": 4, "d": 8, }, - expected: map[string]int64{}, + expected: map[string]int{}, }, } diff --git a/backend/services/scaling-service/scaling_algorithms/default/deploy.go b/backend/services/scaling-service/scaling_algorithms/default/deploy.go index 89efa0b56d8..293a23f0947 100644 --- a/backend/services/scaling-service/scaling_algorithms/default/deploy.go +++ b/backend/services/scaling-service/scaling_algorithms/default/deploy.go @@ -63,6 +63,11 @@ func (s *DefaultScalingAlgorithm) UpgradeImage(scalingCtx context.Context, event fakeMandelboxesForDb []subscriptions.Mandelbox ) + // Compute the size of the buffer we want to start. We pass in a current capacity of 0 + // because we want a full buffer of instances for the new version. + // TODO: change to a different instance type once we support more cloud providers/types + instancesToScale := helpers.ComputeInstancesToScale(desiredFreeMandelboxesPerRegion[event.Region], 0, instanceCapacity["g4dn.2xlarge"]) + // If we are running on a local or testing environment, spinup "fake" instances to avoid // creating them on a cloud provider. In any other case we call the host handler to create // them on the cloud provider for us. @@ -70,7 +75,7 @@ func (s *DefaultScalingAlgorithm) UpgradeImage(scalingCtx context.Context, event logger.Infow("Running on localdev so scaling up fake instances.", contextFields) instancesForDb, fakeMandelboxesForDb = helpers.SpinUpFakeInstances(defaultInstanceBuffer, newImage.ImageID, event.Region) } else { - instancesForDb, err = s.Host.SpinUpInstances(scalingCtx, int32(defaultInstanceBuffer), maxWaitTimeReady, newImage) + instancesForDb, err = s.Host.SpinUpInstances(scalingCtx, int32(instancesToScale), maxWaitTimeReady, newImage) if err != nil { return utils.MakeError("failed to create instance buffer: %s", err) } @@ -78,7 +83,7 @@ func (s *DefaultScalingAlgorithm) UpgradeImage(scalingCtx context.Context, event // Set the instance capacity field and add to the slice // that will be passed to the database. for i := 0; i < len(instancesForDb); i++ { - instancesForDb[i].RemainingCapacity = int64(instanceCapacity[instancesForDb[i].Type]) + instancesForDb[i].RemainingCapacity = instanceCapacity[instancesForDb[i].Type] } } diff --git a/backend/services/scaling-service/scaling_algorithms/default/scaling.go b/backend/services/scaling-service/scaling_algorithms/default/scaling.go index 67630bc22ab..56d2a535dba 100644 --- a/backend/services/scaling-service/scaling_algorithms/default/scaling.go +++ b/backend/services/scaling-service/scaling_algorithms/default/scaling.go @@ -69,7 +69,7 @@ func (s *DefaultScalingAlgorithm) ScaleDownIfNecessary(scalingCtx context.Contex // Extra capacity is considered once we have a full instance worth of capacity // more than the desired free mandelboxes. TODO: Set the instance type once we // have support for more instance types. For now default to `g4dn.2xlarge`. - extraCapacity := int64(desiredFreeMandelboxesPerRegion[event.Region]) + (int64(defaultInstanceBuffer) * instanceCapacity["g4dn.2xlarge"]) + extraCapacity := desiredFreeMandelboxesPerRegion[event.Region] + (defaultInstanceBuffer * instanceCapacity["g4dn.2xlarge"]) // Acquire lock on protected from scale down map s.protectedMapLock.Lock() @@ -205,9 +205,6 @@ func (s *DefaultScalingAlgorithm) ScaleUpIfNecessary(instancesToScale int, scali logger.Infow("Starting scale up action.", contextFields) defer logger.Infow("Finished scale up action.", contextFields) - // Try scale up in given region - instanceNum := int32(instancesToScale) - var ( // Slice that will hold the instances and pass them to the dbclient instancesForDb []subscriptions.Instance @@ -224,7 +221,7 @@ func (s *DefaultScalingAlgorithm) ScaleUpIfNecessary(instancesToScale int, scali instancesForDb, fakeMandelboxesForDb = helpers.SpinUpFakeInstances(instancesToScale, image.ImageID, event.Region) } else { // Call the host handler to handle the instance spinup in the cloud provider - instancesForDb, err = s.Host.SpinUpInstances(scalingCtx, instanceNum, maxWaitTimeReady, image) + instancesForDb, err = s.Host.SpinUpInstances(scalingCtx, int32(instancesToScale), maxWaitTimeReady, image) if err != nil { return utils.MakeError("failed to spin up instances: %s", err) } @@ -235,7 +232,7 @@ func (s *DefaultScalingAlgorithm) ScaleUpIfNecessary(instancesToScale int, scali } for i := 0; i < len(instancesForDb); i++ { - instancesForDb[i].RemainingCapacity = int64(instanceCapacity[instancesForDb[i].Type]) + instancesForDb[i].RemainingCapacity = instanceCapacity[instancesForDb[i].Type] logger.Infow(utils.Sprintf("Created tagged instance with ID %s", instancesForDb[i].ID), contextFields) } } diff --git a/backend/services/scaling-service/scaling_algorithms/default/verify.go b/backend/services/scaling-service/scaling_algorithms/default/verify.go index 000bca1677f..78d32677c33 100644 --- a/backend/services/scaling-service/scaling_algorithms/default/verify.go +++ b/backend/services/scaling-service/scaling_algorithms/default/verify.go @@ -88,10 +88,15 @@ func (s *DefaultScalingAlgorithm) VerifyCapacity(scalingCtx context.Context, eve // We consider the expected mandelbox capacity (active instances + starting instances) // to account for warmup time and so that we don't scale up unnecesary instances. - if mandelboxCapacity < int64(desiredFreeMandelboxesPerRegion[event.Region]) { + if mandelboxCapacity < desiredFreeMandelboxesPerRegion[event.Region] { + + // TODO: Change to a different instance type once we support more types or cloud providers + instancesToScale := helpers.ComputeInstancesToScale(desiredFreeMandelboxesPerRegion[event.Region], mandelboxCapacity, instanceCapacity["g4dn.2xlarge"]) + logger.Infow(utils.Sprintf("Current mandelbox capacity of %d is less than desired %d. Scaling up %d instances to satisfy minimum desired capacity.", - mandelboxCapacity, desiredFreeMandelboxesPerRegion[event.Region], defaultInstanceBuffer), contextFields) - err = s.ScaleUpIfNecessary(defaultInstanceBuffer, scalingCtx, event, latestImage) + mandelboxCapacity, desiredFreeMandelboxesPerRegion[event.Region], instancesToScale), contextFields) + + err = s.ScaleUpIfNecessary(instancesToScale, scalingCtx, event, latestImage) if err != nil { return err } diff --git a/backend/services/scaling-service/scaling_algorithms/helpers/capacity.go b/backend/services/scaling-service/scaling_algorithms/helpers/capacity.go index 6c88ac13eb7..062cfcb3dfd 100644 --- a/backend/services/scaling-service/scaling_algorithms/helpers/capacity.go +++ b/backend/services/scaling-service/scaling_algorithms/helpers/capacity.go @@ -9,33 +9,33 @@ some functions that make it easier to test the scaling service locally. package helpers import ( + "math" + "github.com/whisthq/whist/backend/services/subscriptions" ) // ComputeRealMandelboxCapacity is a helper function to compute the real mandelbox capacity. // Real capcity is the number of free mandelboxes available on instances with active status. -func ComputeRealMandelboxCapacity(imageID string, activeInstances []subscriptions.Instance) int64 { - var ( - realMandelboxCapacity int - ) +func ComputeRealMandelboxCapacity(imageID string, activeInstances []subscriptions.Instance) int { + var realMandelboxCapacity int // Loop over active instances (status ACTIVE), only consider the ones with the current image for _, instance := range activeInstances { if instance.ImageID == imageID { - realMandelboxCapacity += int(instance.RemainingCapacity) + realMandelboxCapacity += instance.RemainingCapacity } } - return int64(realMandelboxCapacity) + return realMandelboxCapacity } // ComputeExpectedMandelboxCapacity is a helper function to compute the expected mandelbox capacity. // Expected capacity is the number of free mandelboxes available on instances with active status and // in starting instances. -func ComputeExpectedMandelboxCapacity(imageID string, activeInstances []subscriptions.Instance, startingInstances []subscriptions.Instance) int64 { +func ComputeExpectedMandelboxCapacity(imageID string, activeInstances []subscriptions.Instance, startingInstances []subscriptions.Instance) int { var ( - realMandelboxCapacity int64 - expectedMandelboxCapacity int64 + realMandelboxCapacity int + expectedMandelboxCapacity int ) // Get the capacity from active instances @@ -49,5 +49,21 @@ func ComputeExpectedMandelboxCapacity(imageID string, activeInstances []subscrip } expectedMandelboxCapacity += realMandelboxCapacity - return int64(expectedMandelboxCapacity) + return expectedMandelboxCapacity +} + +// ComputeInstancesToScale computes the number of instances we want to scale, according to the current number of desired mandelboxes. +// desiredMandelboxes: the number of free mandelboxes we want at all times. Set in the config db. +// currentCapacity: the current free mandelboxes in the database. Obtained by querying database and computing capacity. +// instanceCapacity: the mandelbox capacity for the specific instance type in use (e.g. a "g4dn.2xlarge" instance can run 2 mandelboxes). +func ComputeInstancesToScale(desiredMandelboxes int, currentCapacity int, instanceCapacity int) int { + // This will never happen realistically, but we should try to handle all cases. + if instanceCapacity <= 0 { + return 0 + } + + desiredCapacity := desiredMandelboxes - currentCapacity + instancesToScale := math.Ceil(float64(desiredCapacity) / float64(instanceCapacity)) + + return int(instancesToScale) } diff --git a/backend/services/subscriptions/types.go b/backend/services/subscriptions/types.go index 90d462b9b6e..6b7da9c6016 100644 --- a/backend/services/subscriptions/types.go +++ b/backend/services/subscriptions/types.go @@ -102,7 +102,7 @@ type Instance struct { ClientSHA string `json:"client_sha"` // Commit hash IPAddress string `json:"ip_addr"` // Public IPv4 address Type string `json:"instance_type"` // Instance type - RemainingCapacity int64 `json:"remaining_capacity"` // Number of mandelboxes that can be allocated + RemainingCapacity int `json:"remaining_capacity"` // Number of mandelboxes that can be allocated Status string `json:"status"` // Current status of the instance CreatedAt time.Time `json:"created_at"` // Timestamp when the instance was creates UpdatedAt time.Time `json:"updated_at"` // Timestamp when the instance was last updated @@ -215,7 +215,7 @@ func ToInstances(dbInstances []WhistInstance) []Instance { ClientSHA: string(instance.ClientSHA), IPAddress: instance.IPAddress, Type: string(instance.Type), - RemainingCapacity: int64(instance.RemainingCapacity), + RemainingCapacity: int(instance.RemainingCapacity), Status: string(instance.Status), CreatedAt: instance.CreatedAt, UpdatedAt: instance.UpdatedAt,