Skip to content

Commit

Permalink
fix(modelgateway): Allow modelgateway consumers for transient error (#…
Browse files Browse the repository at this point in the history
…6017)

* skip removing schedule failed models from consumers

* include model progressing in skip

* adjust note in code
  • Loading branch information
sakoush authored Oct 30, 2024
1 parent ca025b3 commit 659d4ba
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions scheduler/pkg/kafka/gateway/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,16 @@ func (kc *KafkaSchedulerClient) SubscribeModelEvents() error {

logger.Infof("Received event name %s version %d state %s", event.ModelName, latestVersionStatus.Version, latestVersionStatus.State.State.String())

// if the model is in a schedule-failed or progressing state and a consumer already exists, we skip the removal.
// this prevents the consumer from being removed during transient failures of the control plane,
// so that the data plane can potentially continue to serve requests
if latestVersionStatus.GetState().GetState() == scheduler.ModelStatus_ScheduleFailed || latestVersionStatus.GetState().GetState() == scheduler.ModelStatus_ModelProgressing {
if kc.consumerManager.Exists(event.ModelName) {
logger.Warnf("Model %s schedule failed / progressing and consumer exists, skipping from removal", event.ModelName)
continue
}
}

// if there are available replicas then we add the consumer for the model
// note that this will also get triggered if the model is already added but there is a status change (e.g. due to scale up),
// in which case it is a no-op
Expand Down

0 comments on commit 659d4ba

Please sign in to comment.