From 533cf03defb5da9f74682a4d41baea3dba8f8da0 Mon Sep 17 00:00:00 2001 From: Niall D <4562759+driev@users.noreply.github.com> Date: Fri, 25 Oct 2024 14:47:52 +0100 Subject: [PATCH 1/4] exposing the deleted resource ttl as a CLI param --- scheduler/cmd/scheduler/main.go | 10 +++-- scheduler/pkg/server/server_test.go | 44 +++++++++++++------- scheduler/pkg/store/experiment/db.go | 19 +++++---- scheduler/pkg/store/experiment/db_test.go | 18 ++++---- scheduler/pkg/store/experiment/store.go | 6 +-- scheduler/pkg/store/experiment/store_test.go | 8 ++-- scheduler/pkg/store/pipeline/db.go | 30 +++++++------ scheduler/pkg/store/pipeline/db_test.go | 25 ++++++----- scheduler/pkg/store/pipeline/store.go | 4 +- scheduler/pkg/store/pipeline/store_test.go | 4 +- scheduler/pkg/store/utils/utils.go | 4 +- 11 files changed, 95 insertions(+), 77 deletions(-) diff --git a/scheduler/cmd/scheduler/main.go b/scheduler/cmd/scheduler/main.go index 132025bcea..4ca8afb717 100644 --- a/scheduler/cmd/scheduler/main.go +++ b/scheduler/cmd/scheduler/main.go @@ -55,10 +55,11 @@ var ( tracingConfigPath string dbPath string nodeID string - allowPlaintxt bool //scheduler server + allowPlaintxt bool // scheduler server autoscalingDisabled bool kafkaConfigPath string schedulerReadyTimeoutSeconds uint + deletedResourceTTL uint ) const ( @@ -115,6 +116,9 @@ func init() { // Timeout for scheduler to be ready flag.UintVar(&schedulerReadyTimeoutSeconds, "scheduler-ready-timeout-seconds", 300, "Timeout for scheduler to be ready") + + // This TTL is set in badger DB + flag.UintVar(&deletedResourceTTL, "deleted-resource-ttl", 1440, "TTL for deleted experiments and pipelines (in minutes)") } func getNamespace() string { @@ -211,11 +215,11 @@ func main() { // Do here after other services created so eventHub events will be handled on pipeline/experiment load // If we start earlier events will be sent but not received by services that start listening "late" to eventHub if dbPath != "" { - err := ps.InitialiseOrRestoreDB(dbPath) + err := ps.InitialiseOrRestoreDB(dbPath, deletedResourceTTL) if err != nil { log.WithError(err).Fatalf("Failed to initialise pipeline db at %s", dbPath) } - err = es.InitialiseOrRestoreDB(dbPath) + err = es.InitialiseOrRestoreDB(dbPath, deletedResourceTTL) if err != nil { log.WithError(err).Fatalf("Failed to initialise experiment db at %s", dbPath) } diff --git a/scheduler/pkg/server/server_test.go b/scheduler/pkg/server/server_test.go index 26f0623c54..e5e05915cb 100644 --- a/scheduler/pkg/server/server_test.go +++ b/scheduler/pkg/server/server_test.go @@ -127,7 +127,8 @@ func TestLoadModel(t *testing.T) { }, }, }, - model: &pb.Model{Meta: &pb.MetaData{Name: "model1"}, + model: &pb.Model{ + Meta: &pb.MetaData{Name: "model1"}, ModelSpec: &pb.ModelSpec{ Uri: "gs://model", Requirements: []string{"sklearn"}, @@ -384,8 +385,11 @@ func TestUnloadModel(t *testing.T) { { name: "Simple", req: []*pba.AgentSubscribeRequest{ - {ServerName: "server1", ReplicaIdx: 0, Shared: true, AvailableMemoryBytes: 1000, - ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn"}}}}, + { + ServerName: "server1", ReplicaIdx: 0, Shared: true, AvailableMemoryBytes: 1000, + ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn"}}, + }, + }, model: &pb.Model{Meta: &pb.MetaData{Name: "model1"}, ModelSpec: &pb.ModelSpec{Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory}, DeploymentSpec: 
&pb.DeploymentSpec{Replicas: 1}}, code: codes.OK, modelState: store.ModelTerminated, @@ -393,8 +397,11 @@ func TestUnloadModel(t *testing.T) { { name: "Multiple", req: []*pba.AgentSubscribeRequest{ - {ServerName: "server1", ReplicaIdx: 0, Shared: true, AvailableMemoryBytes: 1000, - ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn", "xgboost"}}}}, + { + ServerName: "server1", ReplicaIdx: 0, Shared: true, AvailableMemoryBytes: 1000, + ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn", "xgboost"}}, + }, + }, model: &pb.Model{Meta: &pb.MetaData{Name: "model1"}, ModelSpec: &pb.ModelSpec{Uri: "gs://model", Requirements: []string{"sklearn", "xgboost"}, MemoryBytes: &smallMemory}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, code: codes.OK, modelState: store.ModelTerminated, @@ -402,10 +409,15 @@ func TestUnloadModel(t *testing.T) { { name: "TwoReplicas", req: []*pba.AgentSubscribeRequest{ - {ServerName: "server1", ReplicaIdx: 0, Shared: true, AvailableMemoryBytes: 1000, - ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn"}}}, - {ServerName: "server1", ReplicaIdx: 1, Shared: true, AvailableMemoryBytes: 1000, - ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn"}}}}, + { + ServerName: "server1", ReplicaIdx: 0, Shared: true, AvailableMemoryBytes: 1000, + ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn"}}, + }, + { + ServerName: "server1", ReplicaIdx: 1, Shared: true, AvailableMemoryBytes: 1000, + ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn"}}, + }, + }, model: &pb.Model{Meta: &pb.MetaData{Name: "model1"}, ModelSpec: &pb.ModelSpec{Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, code: codes.OK, modelState: store.ModelTerminated, @@ -413,10 +425,14 @@ func TestUnloadModel(t *testing.T) { { name: "NotExist", req: []*pba.AgentSubscribeRequest{ - {ServerName: "server1", ReplicaIdx: 0, Shared: true, AvailableMemoryBytes: 1000, - ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn"}}}}, + { + ServerName: "server1", ReplicaIdx: 0, Shared: true, AvailableMemoryBytes: 1000, + ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, Capabilities: []string{"sklearn"}}, + }, + }, model: nil, - code: codes.FailedPrecondition}, + code: codes.FailedPrecondition, + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { @@ -518,7 +534,6 @@ func TestLoadPipeline(t *testing.T) { } }) } - } func TestUnloadPipeline(t *testing.T) { @@ -562,7 +577,7 @@ func TestUnloadPipeline(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { path := fmt.Sprintf("%s/db", t.TempDir()) - _ = test.server.pipelineHandler.(*pipeline.PipelineStore).InitialiseOrRestoreDB(path) + _ = test.server.pipelineHandler.(*pipeline.PipelineStore).InitialiseOrRestoreDB(path, 10) if test.loadReq != nil { err := test.server.pipelineHandler.AddPipeline(test.loadReq.Pipeline) g.Expect(err).To(BeNil()) @@ -575,7 +590,6 @@ func TestUnloadPipeline(t *testing.T) { } }) } - } func TestPipelineStatus(t *testing.T) { diff --git a/scheduler/pkg/store/experiment/db.go 
b/scheduler/pkg/store/experiment/db.go index e41f3bd87f..e2c41a380e 100644 --- a/scheduler/pkg/store/experiment/db.go +++ b/scheduler/pkg/store/experiment/db.go @@ -10,6 +10,8 @@ the Change License after the Change Date as each is defined in accordance with t package experiment import ( + "time" + "github.com/dgraph-io/badger/v3" "github.com/sirupsen/logrus" "google.golang.org/protobuf/proto" @@ -25,19 +27,21 @@ const ( ) type ExperimentDBManager struct { - db *badger.DB - logger logrus.FieldLogger + db *badger.DB + logger logrus.FieldLogger + deletedResourceTTL time.Duration } -func newExperimentDbManager(path string, logger logrus.FieldLogger) (*ExperimentDBManager, error) { +func newExperimentDbManager(path string, logger logrus.FieldLogger, deletedResourceTTL uint) (*ExperimentDBManager, error) { db, err := utils.Open(path, logger, "experimentDb") if err != nil { return nil, err } edb := &ExperimentDBManager{ - db: db, - logger: logger, + db: db, + logger: logger, + deletedResourceTTL: time.Duration(deletedResourceTTL * uint(time.Minute)), } version, err := edb.getVersion() @@ -73,9 +77,8 @@ func (edb *ExperimentDBManager) save(experiment *Experiment) error { return err }) } else { - ttl := utils.DeletedResourceTTL return edb.db.Update(func(txn *badger.Txn) error { - e := badger.NewEntry([]byte(experiment.Name), experimentBytes).WithTTL(ttl) + e := badger.NewEntry([]byte(experiment.Name), experimentBytes).WithTTL(edb.deletedResourceTTL) err = txn.SetEntry(e) return err }) @@ -119,7 +122,7 @@ func (edb *ExperimentDBManager) restore( } experiment := CreateExperimentFromSnapshot(&snapshot) if experiment.Deleted { - experiment.DeletedAt = utils.GetDeletedAt(item) + experiment.DeletedAt = utils.GetDeletedAt(item, edb.deletedResourceTTL) err = stopExperimentCb(experiment) } else { // otherwise attempt to start the experiment diff --git a/scheduler/pkg/store/experiment/db_test.go b/scheduler/pkg/store/experiment/db_test.go index a184ac0505..44f99abe4a 100644 --- a/scheduler/pkg/store/experiment/db_test.go +++ b/scheduler/pkg/store/experiment/db_test.go @@ -103,7 +103,7 @@ func TestSaveWithTTL(t *testing.T) { path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() - db, err := newExperimentDbManager(getExperimentDbFolder(path), logger) + db, err := newExperimentDbManager(getExperimentDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) experiment.DeletedAt = time.Now() err = db.save(experiment) @@ -297,7 +297,7 @@ func TestSaveAndRestore(t *testing.T) { t.Run(test.name, func(t *testing.T) { path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() - db, err := newExperimentDbManager(getExperimentDbFolder(path), logger) + db, err := newExperimentDbManager(getExperimentDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) for _, p := range test.experiments { err := db.save(p) @@ -307,7 +307,7 @@ func TestSaveAndRestore(t *testing.T) { g.Expect(err).To(BeNil()) es := NewExperimentServer(log.New(), nil, nil, nil) - err = es.InitialiseOrRestoreDB(path) + err = es.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) for idx, p := range test.experiments { if !test.errors[idx] { @@ -369,7 +369,7 @@ func TestSaveAndRestoreDeletedExperiments(t *testing.T) { g.Expect(test.experiment.Deleted).To(BeTrue(), "this is a test for deleted experiments") path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() - edb, err := newExperimentDbManager(getExperimentDbFolder(path), logger) + edb, err := newExperimentDbManager(getExperimentDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) if 
!test.withTTL { err = saveWithOutTTL(&test.experiment, edb.db) @@ -382,7 +382,7 @@ func TestSaveAndRestoreDeletedExperiments(t *testing.T) { g.Expect(err).To(BeNil()) es := NewExperimentServer(log.New(), nil, nil, nil) - err = es.InitialiseOrRestoreDB(path) + err = es.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) if !test.withTTL { @@ -528,7 +528,7 @@ func TestGetExperimentFromDB(t *testing.T) { t.Run(test.name, func(t *testing.T) { path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() - db, err := newExperimentDbManager(getExperimentDbFolder(path), logger) + db, err := newExperimentDbManager(getExperimentDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) for _, p := range test.experiments { err := db.save(p) @@ -668,7 +668,7 @@ func TestDeleteExperimentFromDB(t *testing.T) { t.Run(test.name, func(t *testing.T) { path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() - db, err := newExperimentDbManager(getExperimentDbFolder(path), logger) + db, err := newExperimentDbManager(getExperimentDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) for _, p := range test.experiments { err := db.save(p) @@ -834,7 +834,7 @@ func TestMigrateFromV1ToV2(t *testing.T) { _ = db.Close() // migrate - edb, err := newExperimentDbManager(getExperimentDbFolder(path), logger) + edb, err := newExperimentDbManager(getExperimentDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) g.Expect(err).To(BeNil()) @@ -846,7 +846,7 @@ func TestMigrateFromV1ToV2(t *testing.T) { // check that we have no experiments in the db format es := NewExperimentServer(log.New(), nil, nil, nil) - err = es.InitialiseOrRestoreDB(path) + err = es.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) g.Expect(len(es.experiments)).To(Equal(0)) }) diff --git a/scheduler/pkg/store/experiment/store.go b/scheduler/pkg/store/experiment/store.go index 247946a203..f5cf49de66 100644 --- a/scheduler/pkg/store/experiment/store.go +++ b/scheduler/pkg/store/experiment/store.go @@ -108,7 +108,7 @@ func (es *ExperimentStore) addExperimentInMap(experiment *Experiment) error { } } -func (es *ExperimentStore) InitialiseOrRestoreDB(path string) error { +func (es *ExperimentStore) InitialiseOrRestoreDB(path string, deletedResourceTTL uint) error { logger := es.logger.WithField("func", "initialiseDB") experimentDbPath := getExperimentDbFolder(path) logger.Infof("Initialise DB at %s", experimentDbPath) @@ -116,7 +116,7 @@ func (es *ExperimentStore) InitialiseOrRestoreDB(path string) error { if err != nil { return err } - db, err := newExperimentDbManager(experimentDbPath, es.logger) + db, err := newExperimentDbManager(experimentDbPath, es.logger, deletedResourceTTL) if err != nil { return err } @@ -485,7 +485,7 @@ func (es *ExperimentStore) cleanupDeletedExperiments() { es.logger.Warnf("could not update DB TTL for experiment: %s", experiment.Name) } } - } else if experiment.DeletedAt.Add(utils.DeletedResourceTTL).Before(time.Now()) { + } else if experiment.DeletedAt.Add(es.db.deletedResourceTTL).Before(time.Now()) { delete(es.experiments, experiment.Name) } } diff --git a/scheduler/pkg/store/experiment/store_test.go b/scheduler/pkg/store/experiment/store_test.go index b6fc5a27b8..7ee6e83bb0 100644 --- a/scheduler/pkg/store/experiment/store_test.go +++ b/scheduler/pkg/store/experiment/store_test.go @@ -195,7 +195,7 @@ func TestStartExperiment(t *testing.T) { g.Expect(err).To(BeNil()) server := NewExperimentServer(logger, eventHub, fakeModelStore{}, fakePipelineStore{}) // init db - _ = server.InitialiseOrRestoreDB(path) + _ = 
server.InitialiseOrRestoreDB(path, 10) for _, ea := range test.experiments { err := server.StartExperiment(ea.experiment) if ea.fail { @@ -262,7 +262,7 @@ func TestStopExperiment(t *testing.T) { path := fmt.Sprintf("%s/db", t.TempDir()) // init db - err := test.store.InitialiseOrRestoreDB(path) + err := test.store.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) for _, p := range test.store.experiments { err := test.store.db.save(p) @@ -413,7 +413,7 @@ func TestRestoreExperiments(t *testing.T) { experiments: make(map[string]*Experiment), } // init db - err := store.InitialiseOrRestoreDB(path) + err := store.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) for _, p := range test.experiments { err := store.db.save(p) @@ -422,7 +422,7 @@ func TestRestoreExperiments(t *testing.T) { _ = store.db.Stop() // restore from db now that we have state on disk - _ = store.InitialiseOrRestoreDB(path) + _ = store.InitialiseOrRestoreDB(path, 10) for _, p := range test.experiments { experimentFromDB, _ := store.db.get(p.Name) diff --git a/scheduler/pkg/store/pipeline/db.go b/scheduler/pkg/store/pipeline/db.go index f778105796..f2708b8683 100644 --- a/scheduler/pkg/store/pipeline/db.go +++ b/scheduler/pkg/store/pipeline/db.go @@ -10,6 +10,8 @@ the Change License after the Change Date as each is defined in accordance with t package pipeline import ( + "time" + "github.com/dgraph-io/badger/v3" "github.com/sirupsen/logrus" "google.golang.org/protobuf/proto" @@ -25,19 +27,21 @@ const ( ) type PipelineDBManager struct { - db *badger.DB - logger logrus.FieldLogger + db *badger.DB + logger logrus.FieldLogger + deletedResourceTTL time.Duration } -func newPipelineDbManager(path string, logger logrus.FieldLogger) (*PipelineDBManager, error) { +func newPipelineDbManager(path string, logger logrus.FieldLogger, deletedResourceTTL uint) (*PipelineDBManager, error) { db, err := utils.Open(path, logger, "pipelineDb") if err != nil { return nil, err } pdb := &PipelineDBManager{ - db: db, - logger: logger, + db: db, + logger: logger, + deletedResourceTTL: time.Duration(deletedResourceTTL * uint(time.Minute)), } version, err := pdb.getVersion() @@ -57,22 +61,20 @@ func newPipelineDbManager(path string, logger logrus.FieldLogger) (*PipelineDBMa } // a nil ttl will save the pipeline indefinitely -func save(pipeline *Pipeline, db *badger.DB) error { +func (pdb *PipelineDBManager) save(pipeline *Pipeline) error { pipelineProto := CreatePipelineSnapshotFromPipeline(pipeline) pipelineBytes, err := proto.Marshal(pipelineProto) if err != nil { return err } if !pipeline.Deleted { - return db.Update(func(txn *badger.Txn) error { + return pdb.db.Update(func(txn *badger.Txn) error { err = txn.Set([]byte(pipeline.Name), pipelineBytes) return err }) } else { - // useful for testing - ttl := utils.DeletedResourceTTL - return db.Update(func(txn *badger.Txn) error { - e := badger.NewEntry([]byte(pipeline.Name), pipelineBytes).WithTTL(ttl) + return pdb.db.Update(func(txn *badger.Txn) error { + e := badger.NewEntry([]byte(pipeline.Name), pipelineBytes).WithTTL(pdb.deletedResourceTTL) err = txn.SetEntry(e) return err }) @@ -83,10 +85,6 @@ func (pdb *PipelineDBManager) Stop() error { return utils.Stop(pdb.db) } -func (pdb *PipelineDBManager) save(pipeline *Pipeline) error { - return save(pipeline, pdb.db) -} - // TODO: delete unused pipelines from the store as for now it increases indefinitely func (pdb *PipelineDBManager) delete(name string) error { return utils.Delete(pdb.db, name) @@ -125,7 +123,7 @@ func (pdb 
*PipelineDBManager) restore(createPipelineCb func(pipeline *Pipeline)) } if pipeline.Deleted { - pipeline.DeletedAt = utils.GetDeletedAt(item) + pipeline.DeletedAt = utils.GetDeletedAt(item, pdb.deletedResourceTTL) } createPipelineCb(pipeline) diff --git a/scheduler/pkg/store/pipeline/db_test.go b/scheduler/pkg/store/pipeline/db_test.go index 12d01680fd..1aeb15249b 100644 --- a/scheduler/pkg/store/pipeline/db_test.go +++ b/scheduler/pkg/store/pipeline/db_test.go @@ -55,7 +55,7 @@ func TestSaveWithTTL(t *testing.T) { path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() - db, err := newPipelineDbManager(getPipelineDbFolder(path), logger) + db, err := newPipelineDbManager(getPipelineDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) err = db.save(pipeline) g.Expect(err).To(BeNil()) @@ -171,7 +171,7 @@ func TestSaveAndRestore(t *testing.T) { t.Run(test.name, func(t *testing.T) { path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() - db, err := newPipelineDbManager(getPipelineDbFolder(path), logger) + db, err := newPipelineDbManager(getPipelineDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) for _, p := range test.pipelines { err := db.save(p) @@ -181,7 +181,7 @@ func TestSaveAndRestore(t *testing.T) { g.Expect(err).To(BeNil()) ps := NewPipelineStore(log.New(), nil, fakeModelStore{status: map[string]store.ModelState{}}) - err = ps.InitialiseOrRestoreDB(path) + err = ps.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) for _, p := range test.pipelines { g.Expect(cmp.Equal(p, ps.pipelines[p.Name])).To(BeTrue()) @@ -242,7 +242,7 @@ func TestSaveAndRestoreDeletedPipelines(t *testing.T) { g.Expect(test.pipeline.Deleted).To(BeTrue(), "this is a test for deleted pipelines") path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() - pdb, err := newPipelineDbManager(getPipelineDbFolder(path), logger) + pdb, err := newPipelineDbManager(getPipelineDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) if !test.withTTL { err = saveWithOutTTL(&test.pipeline, pdb.db) @@ -255,7 +255,7 @@ func TestSaveAndRestoreDeletedPipelines(t *testing.T) { g.Expect(err).To(BeNil()) ps := NewPipelineStore(log.New(), nil, fakeModelStore{status: map[string]store.ModelState{}}) - err = ps.InitialiseOrRestoreDB(path) + err = ps.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) if !test.withTTL { @@ -334,7 +334,7 @@ func TestGetPipelineFromDB(t *testing.T) { t.Run(test.name, func(t *testing.T) { path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() - db, err := newPipelineDbManager(getPipelineDbFolder(path), logger) + db, err := newPipelineDbManager(getPipelineDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) for _, p := range test.pipelines { err := db.save(p) @@ -409,7 +409,7 @@ func TestDeletePipelineFromDB(t *testing.T) { t.Run(test.name, func(t *testing.T) { path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() - db, err := newPipelineDbManager(getPipelineDbFolder(path), logger) + db, err := newPipelineDbManager(getPipelineDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) for _, p := range test.pipelines { err := db.save(p) @@ -527,18 +527,17 @@ func TestMigrateFromV1ToV2(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { path := fmt.Sprintf("%s/db", t.TempDir()) - logger := log.New() - db, err := utils.Open(getPipelineDbFolder(path), logger, "pipelineDb") + ps := NewPipelineStore(log.New(), nil, fakeModelStore{status: map[string]store.ModelState{}}) + err := ps.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) for 
_, p := range test.pipelines { - err := save(p, db) + err := ps.db.save(p) g.Expect(err).To(BeNil()) } - err = db.Close() + err = ps.db.db.Close() g.Expect(err).To(BeNil()) - ps := NewPipelineStore(log.New(), nil, fakeModelStore{status: map[string]store.ModelState{}}) - err = ps.InitialiseOrRestoreDB(path) + err = ps.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) // make sure we still have the pipelines diff --git a/scheduler/pkg/store/pipeline/store.go b/scheduler/pkg/store/pipeline/store.go index d9c056b5dd..53dbfe6148 100644 --- a/scheduler/pkg/store/pipeline/store.go +++ b/scheduler/pkg/store/pipeline/store.go @@ -81,7 +81,7 @@ func getPipelineDbFolder(basePath string) string { return filepath.Join(basePath, pipelineDbFolder) } -func (ps *PipelineStore) InitialiseOrRestoreDB(path string) error { +func (ps *PipelineStore) InitialiseOrRestoreDB(path string, deletedResourceTTL uint) error { logger := ps.logger.WithField("func", "initialiseDB") pipelineDbPath := getPipelineDbFolder(path) logger.Infof("Initialise DB at %s", pipelineDbPath) @@ -89,7 +89,7 @@ func (ps *PipelineStore) InitialiseOrRestoreDB(path string) error { if err != nil { return err } - db, err := newPipelineDbManager(pipelineDbPath, ps.logger) + db, err := newPipelineDbManager(pipelineDbPath, ps.logger, deletedResourceTTL) if err != nil { return err } diff --git a/scheduler/pkg/store/pipeline/store_test.go b/scheduler/pkg/store/pipeline/store_test.go index 51e53bf6e5..44192201e8 100644 --- a/scheduler/pkg/store/pipeline/store_test.go +++ b/scheduler/pkg/store/pipeline/store_test.go @@ -234,7 +234,7 @@ func TestAddPipeline(t *testing.T) { t.Run(test.name, func(t *testing.T) { logger := logrus.New() path := fmt.Sprintf("%s/db", t.TempDir()) - db, _ := newPipelineDbManager(getPipelineDbFolder(path), logger) + db, _ := newPipelineDbManager(getPipelineDbFolder(path), logger, 10) test.store.db = db err := test.store.AddPipeline(test.proto) if test.err == nil { @@ -385,7 +385,7 @@ func TestRemovePipeline(t *testing.T) { t.Run(test.name, func(t *testing.T) { logger := logrus.New() path := fmt.Sprintf("%s/db", t.TempDir()) - db, _ := newPipelineDbManager(getPipelineDbFolder(path), logger) + db, _ := newPipelineDbManager(getPipelineDbFolder(path), logger, 10) test.store.db = db err := test.store.RemovePipeline(test.pipelineName) if test.err == nil { diff --git a/scheduler/pkg/store/utils/utils.go b/scheduler/pkg/store/utils/utils.go index 5d0b3bd0bc..5857668d97 100644 --- a/scheduler/pkg/store/utils/utils.go +++ b/scheduler/pkg/store/utils/utils.go @@ -48,11 +48,11 @@ func SaveVersion(db *badger.DB, version string) error { }) } -func GetDeletedAt(item *badger.Item) time.Time { +func GetDeletedAt(item *badger.Item, ttl time.Duration) time.Time { if item.ExpiresAt() == 0 { return time.Time{} } else { - return time.Unix(int64(item.ExpiresAt()), 0).Add(-DeletedResourceTTL) + return time.Unix(int64(item.ExpiresAt()), 0).Add(-ttl) } } From a3708c3e406bc1a7050f68a4cbaa00fd003edf9d Mon Sep 17 00:00:00 2001 From: Niall D <4562759+driev@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:31:34 +0000 Subject: [PATCH 2/4] switching to seconds and adding a little test for the cleanup functions --- scheduler/cmd/scheduler/main.go | 8 ++++---- scheduler/pkg/store/experiment/db.go | 2 +- scheduler/pkg/store/experiment/store_test.go | 6 +++++- scheduler/pkg/store/pipeline/db.go | 2 +- scheduler/pkg/store/pipeline/db_test.go | 5 ++--- scheduler/pkg/store/pipeline/store.go | 2 +- scheduler/pkg/store/pipeline/store_test.go | 6 +++++- 
scheduler/pkg/store/utils/utils.go | 1 - 8 files changed, 19 insertions(+), 13 deletions(-) diff --git a/scheduler/cmd/scheduler/main.go b/scheduler/cmd/scheduler/main.go index 4ca8afb717..e5f2e46948 100644 --- a/scheduler/cmd/scheduler/main.go +++ b/scheduler/cmd/scheduler/main.go @@ -59,7 +59,7 @@ var ( autoscalingDisabled bool kafkaConfigPath string schedulerReadyTimeoutSeconds uint - deletedResourceTTL uint + deletedResourceTTLSeconds uint ) const ( @@ -118,7 +118,7 @@ func init() { flag.UintVar(&schedulerReadyTimeoutSeconds, "scheduler-ready-timeout-seconds", 300, "Timeout for scheduler to be ready") // This TTL is set in badger DB - flag.UintVar(&deletedResourceTTL, "deleted-resource-ttl", 1440, "TTL for deleted experiments and pipelines (in minutes)") + flag.UintVar(&deletedResourceTTLSeconds, "deleted-resource-ttl-seconds", 86400, "TTL for deleted experiments and pipelines (in seconds)") } func getNamespace() string { @@ -215,11 +215,11 @@ func main() { // Do here after other services created so eventHub events will be handled on pipeline/experiment load // If we start earlier events will be sent but not received by services that start listening "late" to eventHub if dbPath != "" { - err := ps.InitialiseOrRestoreDB(dbPath, deletedResourceTTL) + err := ps.InitialiseOrRestoreDB(dbPath, deletedResourceTTLSeconds) if err != nil { log.WithError(err).Fatalf("Failed to initialise pipeline db at %s", dbPath) } - err = es.InitialiseOrRestoreDB(dbPath, deletedResourceTTL) + err = es.InitialiseOrRestoreDB(dbPath, deletedResourceTTLSeconds) if err != nil { log.WithError(err).Fatalf("Failed to initialise experiment db at %s", dbPath) } diff --git a/scheduler/pkg/store/experiment/db.go b/scheduler/pkg/store/experiment/db.go index e2c41a380e..391a6859f6 100644 --- a/scheduler/pkg/store/experiment/db.go +++ b/scheduler/pkg/store/experiment/db.go @@ -41,7 +41,7 @@ func newExperimentDbManager(path string, logger logrus.FieldLogger, deletedResou edb := &ExperimentDBManager{ db: db, logger: logger, - deletedResourceTTL: time.Duration(deletedResourceTTL * uint(time.Minute)), + deletedResourceTTL: time.Duration(deletedResourceTTL * uint(time.Second)), } version, err := edb.getVersion() diff --git a/scheduler/pkg/store/experiment/store_test.go b/scheduler/pkg/store/experiment/store_test.go index 7ee6e83bb0..870dfd579b 100644 --- a/scheduler/pkg/store/experiment/store_test.go +++ b/scheduler/pkg/store/experiment/store_test.go @@ -262,7 +262,7 @@ func TestStopExperiment(t *testing.T) { path := fmt.Sprintf("%s/db", t.TempDir()) // init db - err := test.store.InitialiseOrRestoreDB(path, 10) + err := test.store.InitialiseOrRestoreDB(path, 1) g.Expect(err).To(BeNil()) for _, p := range test.store.experiments { err := test.store.db.save(p) @@ -288,6 +288,10 @@ func TestStopExperiment(t *testing.T) { // check db experimentFromDB, _ := test.store.db.get(test.experimentName) g.Expect(experimentFromDB.Deleted).To(BeTrue()) + + time.Sleep(1 * time.Second) + test.store.cleanupDeletedExperiments() + g.Expect(test.store.experiments[test.experimentName]).To(BeNil()) } }) } diff --git a/scheduler/pkg/store/pipeline/db.go b/scheduler/pkg/store/pipeline/db.go index f2708b8683..0bdbaa8f9f 100644 --- a/scheduler/pkg/store/pipeline/db.go +++ b/scheduler/pkg/store/pipeline/db.go @@ -41,7 +41,7 @@ func newPipelineDbManager(path string, logger logrus.FieldLogger, deletedResourc pdb := &PipelineDBManager{ db: db, logger: logger, - deletedResourceTTL: time.Duration(deletedResourceTTL * uint(time.Minute)), + deletedResourceTTL: 
time.Duration(deletedResourceTTL * uint(time.Second)), } version, err := pdb.getVersion() diff --git a/scheduler/pkg/store/pipeline/db_test.go b/scheduler/pkg/store/pipeline/db_test.go index 1aeb15249b..ce3ca86cdb 100644 --- a/scheduler/pkg/store/pipeline/db_test.go +++ b/scheduler/pkg/store/pipeline/db_test.go @@ -21,7 +21,6 @@ import ( "google.golang.org/protobuf/proto" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/utils" ) func TestSaveWithTTL(t *testing.T) { @@ -51,7 +50,7 @@ func TestSaveWithTTL(t *testing.T) { Deleted: true, } ttl := time.Duration(time.Second) - pipeline.DeletedAt = time.Now().Add(-utils.DeletedResourceTTL).Add(ttl) + pipeline.DeletedAt = time.Now().Add(ttl) path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() @@ -247,7 +246,7 @@ func TestSaveAndRestoreDeletedPipelines(t *testing.T) { if !test.withTTL { err = saveWithOutTTL(&test.pipeline, pdb.db) } else { - test.pipeline.DeletedAt = time.Now().Add(-utils.DeletedResourceTTL) + test.pipeline.DeletedAt = time.Now() err = pdb.save(&test.pipeline) } g.Expect(err).To(BeNil()) diff --git a/scheduler/pkg/store/pipeline/store.go b/scheduler/pkg/store/pipeline/store.go index 53dbfe6148..cd08ed553a 100644 --- a/scheduler/pkg/store/pipeline/store.go +++ b/scheduler/pkg/store/pipeline/store.go @@ -459,7 +459,7 @@ func (ps *PipelineStore) cleanupDeletedPipelines() { ps.logger.Warnf("could not update DB TTL for pipeline: %s", pipeline.Name) } } - } else if pipeline.DeletedAt.Add(utils.DeletedResourceTTL).Before(time.Now()) { + } else if pipeline.DeletedAt.Add(ps.db.deletedResourceTTL).Before(time.Now()) { delete(ps.pipelines, pipeline.Name) } } diff --git a/scheduler/pkg/store/pipeline/store_test.go b/scheduler/pkg/store/pipeline/store_test.go index 44192201e8..3d0b1d2870 100644 --- a/scheduler/pkg/store/pipeline/store_test.go +++ b/scheduler/pkg/store/pipeline/store_test.go @@ -12,6 +12,7 @@ package pipeline import ( "fmt" "testing" + "time" . 
"github.com/onsi/gomega" "github.com/sirupsen/logrus" @@ -385,7 +386,7 @@ func TestRemovePipeline(t *testing.T) { t.Run(test.name, func(t *testing.T) { logger := logrus.New() path := fmt.Sprintf("%s/db", t.TempDir()) - db, _ := newPipelineDbManager(getPipelineDbFolder(path), logger, 10) + db, _ := newPipelineDbManager(getPipelineDbFolder(path), logger, 1) test.store.db = db err := test.store.RemovePipeline(test.pipelineName) if test.err == nil { @@ -408,6 +409,9 @@ func TestRemovePipeline(t *testing.T) { g.Expect(actualPipeline.LastVersion).To(Equal(expectedPipeline.LastVersion)) g.Expect(len(actualPipeline.Versions)).To(Equal(len(expectedPipeline.Versions))) g.Expect(len(actualPipeline.Versions)).To(Equal(len(expectedPipeline.Versions))) + time.Sleep(1 * time.Second) + test.store.cleanupDeletedPipelines() + g.Expect(test.store.pipelines[test.pipelineName]).To(BeNil()) } else { g.Expect(err.Error()).To(Equal(test.err.Error())) } diff --git a/scheduler/pkg/store/utils/utils.go b/scheduler/pkg/store/utils/utils.go index 5857668d97..489de3d497 100644 --- a/scheduler/pkg/store/utils/utils.go +++ b/scheduler/pkg/store/utils/utils.go @@ -18,7 +18,6 @@ import ( const ( VersionKey = "__version_key__" - DeletedResourceTTL time.Duration = time.Duration(24 * time.Hour) DeletedResourceCleanupFrequency time.Duration = time.Duration(10 * time.Minute) ) From 4cf864426a0b2d58bf3f67062f11600ffbd1904c Mon Sep 17 00:00:00 2001 From: Niall D <4562759+driev@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:14:39 +0000 Subject: [PATCH 3/4] merge v2 --- docs-gb/architecture/README.md | 58 +++-- docs-gb/examples/pipeline-examples.md | 1 + docs-gb/kubernetes/service-meshes/traefik.md | 234 +++++++++++++++++- operator/scheduler/client.go | 33 ++- operator/scheduler/control_plane.go | 4 +- operator/scheduler/experiment.go | 12 +- operator/scheduler/model.go | 12 +- operator/scheduler/pipeline.go | 12 +- operator/scheduler/server.go | 8 +- scheduler/config/envoy.yaml | 7 +- .../src/main/kotlin/io/seldon/dataflow/Cli.kt | 23 +- .../main/kotlin/io/seldon/dataflow/Main.kt | 4 +- .../io/seldon/dataflow/PipelineSubscriber.kt | 4 + .../src/main/resources/local.properties | 1 + .../test/kotlin/io/seldon/dataflow/CliTest.kt | 31 +++ scheduler/pkg/agent/client.go | 12 +- .../agent/modelserver_controlplane/oip/v2.go | 8 - scheduler/pkg/agent/server.go | 18 +- scheduler/pkg/agent/server_test.go | 142 ++++++++++- scheduler/pkg/envoy/server/server.go | 4 + scheduler/pkg/kafka/dataflow/server.go | 28 ++- scheduler/pkg/kafka/dataflow/server_test.go | 136 +++++++++- scheduler/pkg/kafka/gateway/client.go | 29 ++- scheduler/pkg/kafka/gateway/worker.go | 8 - scheduler/pkg/kafka/pipeline/status/client.go | 19 +- scheduler/pkg/server/server.go | 5 + scheduler/pkg/store/experiment/db_test.go | 14 +- scheduler/pkg/store/experiment/store.go | 2 +- scheduler/pkg/store/pipeline/db_test.go | 12 + scheduler/pkg/store/pipeline/store.go | 2 +- scheduler/pkg/util/constants.go | 13 +- scheduler/pkg/util/grpc.go | 38 +++ 32 files changed, 808 insertions(+), 126 deletions(-) create mode 100644 scheduler/pkg/util/grpc.go diff --git a/docs-gb/architecture/README.md b/docs-gb/architecture/README.md index 48247cd482..bb73260360 100644 --- a/docs-gb/architecture/README.md +++ b/docs-gb/architecture/README.md @@ -1,36 +1,62 @@ # Architecture -The current set of components used in Seldon Core 2 is shown below: +Seldon Core 2 uses a microservice architecture where each service has limited and well-defined responsibilities working together to orchestrate 
scalable and fault-tolerant ML serving and management. These components communicate internally using gRPC and they can be scaled independently. Seldon Core 2 services can be split into two categories: + +* **Control Plane** services are responsible for managing the operations and configurations of your ML models and workflows. This includes functionality to instantiate new inference servers, load models, update new versions of models, configure model experiments and pipelines, and expose endpoints that may receive inference requests. The main control plane component is the **Scheduler**, which is responsible for managing the loading and unloading of resources (models, pipelines, experiments) onto the respective components. + +* **Data Plane** services are responsible for managing the flow of data between components or models. Core 2 supports REST and gRPC payloads that follow the Open Inference Protocol (OIP). The main data plane service is **Envoy**, which acts as a single ingress for all data plane traffic and routes data to the relevant servers internally (e.g. Seldon MLServer or NVIDIA Triton pods). + +{% hint style="info" %} +**Note**: Because the Core 2 architecture separates control plane and data plane responsibilities, when control plane services are down (e.g. the Scheduler), data plane inference can still be served. In this manner the system is more resilient to failures. For example, an outage of control plane services does not impact the ability of the system to respond to end user traffic. Core 2 can be provisioned to be **highly available** on the data plane path. +{% endhint %} + + +The current set of services used in Seldon Core 2 is shown below. Following the diagram, we will describe each control plane and data plane service. ![architecture](../images/architecture.png) -The core components are: +## Control Plane -* Scheduler : manages the load and unload of models, pipelines, explainers and experiments. -* Pipeline gateway : handles REST/gRPC calls to pipelines. -* Dataflow engine : handles the flow of data between components in a pipeline. -* Model gateway : handles the flow of data from models to inference requests on servers and passes on the responses. -* Agent : manages the loading and unloading of models on a server and access to the server over REST/gRPC. -* Envoy : manages the proxying of requests to the correct servers including load balancing. +### Scheduler +This service manages the loading and unloading of Models, Pipelines and Experiments on the relevant microservices. It is also responsible for matching Models with available Servers in a way that optimises infrastructure use. In the current design we can only have _one_ instance of the Scheduler as its internal state is persisted on disk. -All the above are Kubernetes agnostic and can run locally, e.g. on Docker Compose. +When the Scheduler (re)starts there is a synchronisation flow to coordinate the startup process and to attempt to wait for expected Model Servers to connect before proceeding with control plane operations. This is important so that ongoing data plane operations are not interrupted. This then introduces a delay on any control plane operations until the process has finished (including control plane resource status updates). This synchronisation process has a timeout, which has a default of 10 minutes. It can be changed by setting the `scheduler.schedulerReadyTimeoutSeconds` value in the seldon-core-v2-components Helm chart. -We also provide a Kubernetes Operator to allow Kubernetes usage.
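+As a minimal sketch of how that timeout might be overridden (the `scheduler.schedulerReadyTimeoutSeconds` key is the documented setting; the surrounding values-file layout for the seldon-core-v2-components chart is an assumption):
+
+```yaml
+# values-override.yaml -- hypothetical Helm values override; only the key path
+# scheduler.schedulerReadyTimeoutSeconds is taken from the documentation above.
+scheduler:
+  schedulerReadyTimeoutSeconds: 600  # e.g. wait up to 10 minutes for known Model Servers to reconnect
+```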
+### Agent +This service manages the loading and unloading of models on a server and access to the server over REST/gRPC. It acts as a reverse proxy to connect end users with the actual Model Servers. In this way the system collects stats and metrics about data plane inferences, which helps with observability and scaling. -### Controller +We also provide a Kubernetes Operator to allow Kubernetes usage. This is implemented in the Controller Manager microservice, which manages CRD reconciliation with the Scheduler. Currently Core 2 supports _one_ instance of the Controller. -Kafka is used as the backbone for Pipelines allowing a decentralized, synchronous and asynchronous usage. +{% hint style="info" %} +**Note**: All services besides the Controller are Kubernetes agnostic and can run locally, e.g. on Docker Compose. +{% endhint %} -## Kafka +## Data Plane -Kafka is used as the backbone for allowing Pipelines of Models to be connected together into arbitrary directed acyclic graphs. Models can be reused in different Pipelines. The flow of data between models is handled by the dataflow engine using [KStreams](https://docs.confluent.io/platform/current/streams/concepts.html). +### Pipeline Gateway +This service handles REST/gRPC calls to Pipelines. It translates synchronous requests into Kafka operations, producing a message on the relevant input topic for a Pipeline and consuming from the output topic to return inference results back to the users. -![kafka](../images/kafka.png) +### Model Gateway +This service handles the flow of data between Kafka and the inference servers: it consumes inference requests for models from Kafka, forwards them to the relevant servers, and passes the responses back via Kafka. -## Dataflow Architecture +### Dataflow Engine +This service handles the flow of data between components in a pipeline, using Kafka Streams. It enables Core 2 to chain and join Models together to provide complex Pipelines. -Seldon V2 follows a dataflow design paradigm and it's part of the current movement for data centric machine learning. By taking a decentralized route that focuses on the flow of data users can have more flexibility and insight in building complex applications containing machine learning and traditional components. This contrasts with a more centralized orchestration more traditional in service orientated architectures. +### Envoy +This service manages the proxying of requests to the correct servers including load balancing. +## Dataflow Architecture and Pipelines +To support the movement towards data-centric machine learning, Seldon Core 2 follows a dataflow paradigm. By taking a decentralized route that focuses on the flow of data, users can have more flexibility and insight as they build and manage complex AI applications in production. This contrasts with more centralized orchestration approaches where data is secondary. ![dataflow](../images/dataflow.png) +### Kafka +Kafka is used as the backbone for Pipelines, allowing decentralized, synchronous and asynchronous usage. This enables Models to be connected together into arbitrary directed acyclic graphs. Models can be reused in different Pipelines. The flow of data between models is handled by the dataflow engine using [Kafka Streams](https://docs.confluent.io/platform/current/streams/concepts.html). ![kafka](../images/kafka.png) By focusing on the data we allow users to join various flows together using stream joining concepts as shown below.
![joins](../images/joins.png) diff --git a/docs-gb/examples/pipeline-examples.md b/docs-gb/examples/pipeline-examples.md index 55f9a94ea1..647838e37f 100644 --- a/docs-gb/examples/pipeline-examples.md +++ b/docs-gb/examples/pipeline-examples.md @@ -31,6 +31,7 @@ spec: requirements: - tensorflow memory: 100Ki +--- apiVersion: mlops.seldon.io/v1alpha1 kind: Model metadata: diff --git a/docs-gb/kubernetes/service-meshes/traefik.md b/docs-gb/kubernetes/service-meshes/traefik.md index 472d6799eb..2b2f1cf8df 100644 --- a/docs-gb/kubernetes/service-meshes/traefik.md +++ b/docs-gb/kubernetes/service-meshes/traefik.md @@ -11,15 +11,239 @@ We will run through some examples as shown in the notebook `service-meshes/traef * Traefik IngressRoute * Traefik Middleware for adding a header -```{literalinclude} ../../../../../../service-meshes/traefik/static/single-model.yaml -:language: yaml +```yaml +apiVersion: v1 +kind: Service +metadata: + name: myapps + namespace: seldon-mesh +spec: + ports: + - name: web + port: 80 + protocol: TCP + selector: + app: traefik-ingress-lb + type: LoadBalancer +--- +apiVersion: mlops.seldon.io/v1alpha1 +kind: Model +metadata: + name: iris + namespace: seldon-mesh +spec: + requirements: + - sklearn + storageUri: gs://seldon-models/mlserver/iris +--- +apiVersion: traefik.containo.us/v1alpha1 +kind: IngressRoute +metadata: + name: iris + namespace: seldon-mesh +spec: + entryPoints: + - web + routes: + - kind: Rule + match: PathPrefix(`/`) + middlewares: + - name: iris-header + services: + - name: seldon-mesh + port: 80 + scheme: h2c +--- +apiVersion: traefik.containo.us/v1alpha1 +kind: Middleware +metadata: + name: iris-header + namespace: seldon-mesh +spec: + headers: + customRequestHeaders: + seldon-model: iris ``` ## Traffic Split -```{warning} -Traffic splitting does not presently work due to this [issue](https://github.com/emissary-ingress/emissary/issues/4062). We recommend you use a Seldon Experiment instead. +{% hint style="warning" %} +**Warning** Traffic splitting does not presently work due to this [issue](https://github.com/emissary-ingress/emissary/issues/4062). We recommend you use a Seldon Experiment instead. 
+{% endhint %} + +## Traefik Examples + +Assumes + + * You have installed Traefik as per their [docs](https://doc.traefik.io/traefik/getting-started/install-traefik/#use-the-helm-chart) into namespace traefik-v2 + + Tested with traefik-10.19.4 + + + +```python +INGRESS_IP=!kubectl get svc traefik -n traefik-v2 -o jsonpath='{.status.loadBalancer.ingress[0].ip}' +INGRESS_IP=INGRESS_IP[0] +import os +os.environ['INGRESS_IP'] = INGRESS_IP +INGRESS_IP ``` -```{include} ../../../../../../service-meshes/traefik/README.md + + + + '172.21.255.1' + + + +### Traefik Single Model Example + + +```python +!kustomize build config/single-model ``` + + apiVersion: v1 + kind: Service + metadata: + name: myapps + namespace: seldon-mesh + spec: + ports: + - name: web + port: 80 + protocol: TCP + selector: + app: traefik-ingress-lb + type: LoadBalancer + --- + apiVersion: mlops.seldon.io/v1alpha1 + kind: Model + metadata: + name: iris + namespace: seldon-mesh + spec: + requirements: + - sklearn + storageUri: gs://seldon-models/mlserver/iris + --- + apiVersion: traefik.containo.us/v1alpha1 + kind: IngressRoute + metadata: + name: iris + namespace: seldon-mesh + spec: + entryPoints: + - web + routes: + - kind: Rule + match: PathPrefix(`/`) + middlewares: + - name: iris-header + services: + - name: seldon-mesh + port: 80 + scheme: h2c + --- + apiVersion: traefik.containo.us/v1alpha1 + kind: Middleware + metadata: + name: iris-header + namespace: seldon-mesh + spec: + headers: + customRequestHeaders: + seldon-model: iris + + + +```python +!kustomize build config/single-model | kubectl apply -f - +``` + + service/myapps created + model.mlops.seldon.io/iris created + ingressroute.traefik.containo.us/iris created + middleware.traefik.containo.us/iris-header created + + + +```python +!kubectl wait --for condition=ready --timeout=300s model --all -n seldon-mesh +``` + + model.mlops.seldon.io/iris condition met + + + +```python +!curl -v http://${INGRESS_IP}/v2/models/iris/infer -H "Content-Type: application/json" \ + -d '{"inputs": [{"name": "predict", "shape": [1, 4], "datatype": "FP32", "data": [[1, 2, 3, 4]]}]}' +``` + + * Trying 172.21.255.1... 
+ * Connected to 172.21.255.1 (172.21.255.1) port 80 (#0) + > POST /v2/models/iris/infer HTTP/1.1 + > Host: 172.21.255.1 + > User-Agent: curl/7.47.0 + > Accept: */* + > Content-Type: application/json + > Content-Length: 94 + > + * upload completely sent off: 94 out of 94 bytes + < HTTP/1.1 200 OK + < Content-Length: 196 + < Content-Type: application/json + < Date: Sat, 16 Apr 2022 15:53:27 GMT + < Seldon-Route: iris_1 + < Server: envoy + < X-Envoy-Upstream-Service-Time: 895 + < + * Connection #0 to host 172.21.255.1 left intact + {"model_name":"iris_1","model_version":"1","id":"0dccf477-78fa-4a11-92ff-4d7e4f1cdda8","parameters":null,"outputs":[{"name":"predict","shape":[1],"datatype":"INT64","parameters":null,"data":[2]}]} + + +```python +!grpcurl -d '{"model_name":"iris","inputs":[{"name":"input","contents":{"fp32_contents":[1,2,3,4]},"datatype":"FP32","shape":[1,4]}]}' \ + -plaintext \ + -import-path ../../apis \ + -proto ../../apis/mlops/v2_dataplane/v2_dataplane.proto \ + ${INGRESS_IP}:80 inference.GRPCInferenceService/ModelInfer +``` + + { + "modelName": "iris_1", + "modelVersion": "1", + "outputs": [ + { + "name": "predict", + "datatype": "INT64", + "shape": [ + "1" + ], + "contents": { + "int64Contents": [ + "2" + ] + } + } + ] + } + + + +```python +!kustomize build config/single-model | kubectl delete -f - +``` + + service "myapps" deleted + model.mlops.seldon.io "iris" deleted + ingressroute.traefik.containo.us "iris" deleted + middleware.traefik.containo.us "iris-header" deleted + + + +```python + +``` + diff --git a/operator/scheduler/client.go b/operator/scheduler/client.go index a2d0142ade..98492d31c4 100644 --- a/operator/scheduler/client.go +++ b/operator/scheduler/client.go @@ -33,11 +33,16 @@ import ( const ( // these 2 constants in combination with the backoff exponential function will give us a max backoff of 13.5 minutes - SchedulerConnectMaxRetries = 12 - SchedulerConnectBackoffScalar = 200 * time.Millisecond - ClientKeapAliveTime = 60 * time.Second - ClientKeapAliveTimeout = 2 * time.Second - ClientKeapAlivePermit = true + schedulerConnectMaxRetries = 100 + schedulerConnectBackoffScalar = 200 * time.Millisecond + // these keep alive settings need to match the scheduler counterpart in scheduler/pkg/util/constants.go + clientKeepAliveTime = 60 * time.Second + clientKeepAliveTimeout = 2 * time.Second + clientKeepAlivePermit = false + // backoff + backoffMaxElapsedTime = 0 // Never stop due to large time between calls + backOffMaxInterval = time.Second * 15 + backOffInitialInterval = time.Second ) type SchedulerClient struct { @@ -229,9 +234,9 @@ func (s *SchedulerClient) connectToScheduler(host string, namespace string, plai } } kacp := keepalive.ClientParameters{ - Time: ClientKeapAliveTime, - Timeout: ClientKeapAliveTimeout, - PermitWithoutStream: ClientKeapAlivePermit, + Time: clientKeepAliveTime, + Timeout: clientKeepAliveTimeout, + PermitWithoutStream: clientKeepAlivePermit, } retryOpts := []grpc_retry.CallOption{ @@ -249,7 +254,7 @@ func (s *SchedulerClient) connectToScheduler(host string, namespace string, plai opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials())) s.logger.Info("Running scheduler client in plain text mode", "port", port) } - opts = append(opts, grpc.WithStreamInterceptor(grpc_retry.StreamClientInterceptor(retryOpts...))) + // we dont have backoff retry on the grpc streams as we handle this in the event handlers opts = append(opts, grpc.WithUnaryInterceptor(grpc_retry.UnaryClientInterceptor(retryOpts...))) opts = 
append(opts, grpc.WithKeepaliveParams(kacp)) s.logger.Info("Dialing scheduler", "host", host, "port", port) @@ -313,7 +318,7 @@ func retryFn( logFailure := func(err error, delay time.Duration) { logger.Error(err, "Scheduler not ready") } - backOffExp := backoff.NewExponentialBackOff() + backOffExp := getClientExponentialBackoff() fnWithArgs := func() error { grpcClient := scheduler.NewSchedulerClient(conn) return fn(context.Background(), grpcClient, namespace) @@ -325,3 +330,11 @@ func retryFn( } return nil } + +func getClientExponentialBackoff() *backoff.ExponentialBackOff { + backOffExp := backoff.NewExponentialBackOff() + backOffExp.MaxElapsedTime = backoffMaxElapsedTime + backOffExp.MaxInterval = backOffMaxInterval + backOffExp.InitialInterval = backOffInitialInterval + return backOffExp +} diff --git a/operator/scheduler/control_plane.go b/operator/scheduler/control_plane.go index 9dbf18a590..9834cfea98 100644 --- a/operator/scheduler/control_plane.go +++ b/operator/scheduler/control_plane.go @@ -31,8 +31,8 @@ func (s *SchedulerClient) SubscribeControlPlaneEvents(ctx context.Context, grpcC stream, err := grpcClient.SubscribeControlPlane( ctx, &scheduler.ControlPlaneSubscriptionRequest{SubscriberName: "seldon manager"}, - grpc_retry.WithMax(SchedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + grpc_retry.WithMax(schedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), ) if err != nil { return err diff --git a/operator/scheduler/experiment.go b/operator/scheduler/experiment.go index c2b4275800..50be2e6cf8 100644 --- a/operator/scheduler/experiment.go +++ b/operator/scheduler/experiment.go @@ -43,8 +43,8 @@ func (s *SchedulerClient) StartExperiment(ctx context.Context, experiment *v1alp _, err = grpcClient.StartExperiment( ctx, req, - grpc_retry.WithMax(SchedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + grpc_retry.WithMax(schedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), ) return s.checkErrorRetryable(experiment.Kind, experiment.Name, err), err } @@ -66,8 +66,8 @@ func (s *SchedulerClient) StopExperiment(ctx context.Context, experiment *v1alph _, err = grpcClient.StopExperiment( ctx, req, - grpc_retry.WithMax(SchedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + grpc_retry.WithMax(schedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), ) return s.checkErrorRetryable(experiment.Kind, experiment.Name, err), err } @@ -79,8 +79,8 @@ func (s *SchedulerClient) SubscribeExperimentEvents(ctx context.Context, grpcCli stream, err := grpcClient.SubscribeExperimentStatus( ctx, &scheduler.ExperimentSubscriptionRequest{SubscriberName: "seldon manager"}, - grpc_retry.WithMax(SchedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + grpc_retry.WithMax(schedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), ) if err != nil { return err diff --git a/operator/scheduler/model.go b/operator/scheduler/model.go index 767aabd84f..f49217be38 100644 --- a/operator/scheduler/model.go +++ b/operator/scheduler/model.go @@ -60,8 +60,8 @@ func (s *SchedulerClient) LoadModel(ctx context.Context, model *v1alpha1.Model, _, 
err = grpcClient.LoadModel( ctx, &loadModelRequest, - grpc_retry.WithMax(SchedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + grpc_retry.WithMax(schedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), ) if err != nil { return s.checkErrorRetryable(model.Kind, model.Name, err), err @@ -102,8 +102,8 @@ func (s *SchedulerClient) UnloadModel(ctx context.Context, model *v1alpha1.Model _, err = grpcClient.UnloadModel( ctx, modelRef, - grpc_retry.WithMax(SchedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + grpc_retry.WithMax(schedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), ) if err != nil { return s.checkErrorRetryable(model.Kind, model.Name, err), err @@ -117,8 +117,8 @@ func (s *SchedulerClient) SubscribeModelEvents(ctx context.Context, grpcClient s stream, err := grpcClient.SubscribeModelStatus( ctx, &scheduler.ModelSubscriptionRequest{SubscriberName: "seldon manager"}, - grpc_retry.WithMax(SchedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + grpc_retry.WithMax(schedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), ) if err != nil { return err diff --git a/operator/scheduler/pipeline.go b/operator/scheduler/pipeline.go index 586e84103d..fce4af998c 100644 --- a/operator/scheduler/pipeline.go +++ b/operator/scheduler/pipeline.go @@ -42,8 +42,8 @@ func (s *SchedulerClient) LoadPipeline(ctx context.Context, pipeline *v1alpha1.P _, err = grpcClient.LoadPipeline( ctx, &req, - grpc_retry.WithMax(SchedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + grpc_retry.WithMax(schedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), ) return s.checkErrorRetryable(pipeline.Kind, pipeline.Name, err), err } @@ -62,8 +62,8 @@ func (s *SchedulerClient) UnloadPipeline(ctx context.Context, pipeline *v1alpha1 _, err = grpcClient.UnloadPipeline( ctx, &req, - grpc_retry.WithMax(SchedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + grpc_retry.WithMax(schedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), ) if err != nil { return err, s.checkErrorRetryable(pipeline.Kind, pipeline.Name, err) @@ -85,8 +85,8 @@ func (s *SchedulerClient) SubscribePipelineEvents(ctx context.Context, grpcClien stream, err := grpcClient.SubscribePipelineStatus( ctx, &scheduler.PipelineSubscriptionRequest{SubscriberName: "seldon manager"}, - grpc_retry.WithMax(SchedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + grpc_retry.WithMax(schedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), ) if err != nil { return err diff --git a/operator/scheduler/server.go b/operator/scheduler/server.go index af1f88fc19..8ecf39bf17 100644 --- a/operator/scheduler/server.go +++ b/operator/scheduler/server.go @@ -64,8 +64,8 @@ func (s *SchedulerClient) ServerNotify(ctx context.Context, grpcClient scheduler _, err := grpcClient.ServerNotify( ctx, request, - grpc_retry.WithMax(SchedulerConnectMaxRetries), - 
grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + grpc_retry.WithMax(schedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), ) if err != nil { logger.Error(err, "Failed to send notify server to scheduler") @@ -82,8 +82,8 @@ func (s *SchedulerClient) SubscribeServerEvents(ctx context.Context, grpcClient stream, err := grpcClient.SubscribeServerStatus( ctx, &scheduler.ServerSubscriptionRequest{SubscriberName: "seldon manager"}, - grpc_retry.WithMax(SchedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), + grpc_retry.WithMax(schedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), ) if err != nil { return err diff --git a/scheduler/config/envoy.yaml b/scheduler/config/envoy.yaml index 5275cc8b15..fba6fa0de1 100644 --- a/scheduler/config/envoy.yaml +++ b/scheduler/config/envoy.yaml @@ -11,7 +11,12 @@ static_resources: socket_address: address: seldon-scheduler port_value: 9002 - http2_protocol_options: {} + http2_protocol_options: { + connection_keepalive: { + interval: 60s, + timeout: 2s, + } + } name: xds_cluster - connect_timeout: 0.250s type: LOGICAL_DNS diff --git a/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Cli.kt b/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Cli.kt index 1e5485aaea..94a5e393a5 100644 --- a/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Cli.kt +++ b/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Cli.kt @@ -11,6 +11,7 @@ package io.seldon.dataflow import com.natpryce.konfig.CommandLineOption import com.natpryce.konfig.Configuration +import com.natpryce.konfig.ConfigurationMap import com.natpryce.konfig.ConfigurationProperties import com.natpryce.konfig.EnvironmentVariables import com.natpryce.konfig.Key @@ -25,6 +26,8 @@ import io.klogging.Level import io.klogging.noCoLogger import io.seldon.dataflow.kafka.security.KafkaSaslMechanisms import io.seldon.dataflow.kafka.security.KafkaSecurityProtocols +import java.net.InetAddress +import java.util.UUID object Cli { private const val ENV_VAR_PREFIX = "SELDON_" @@ -34,6 +37,7 @@ object Cli { val logLevelApplication = Key("log.level.app", enumType(*Level.values())) val logLevelKafka = Key("log.level.kafka", enumType(*Level.values())) val namespace = Key("pod.namespace", stringType) + val dataflowReplicaId = Key("dataflow.replica.id", stringType) // Seldon components val upstreamHost = Key("upstream.host", stringType) @@ -75,6 +79,7 @@ object Cli { logLevelApplication, logLevelKafka, namespace, + dataflowReplicaId, upstreamHost, upstreamPort, kafkaBootstrapServers, @@ -105,10 +110,26 @@ object Cli { fun configWith(rawArgs: Array): Configuration { val fromProperties = ConfigurationProperties.fromResource("local.properties") + val fromSystem = getSystemConfig() val fromEnv = EnvironmentVariables(prefix = ENV_VAR_PREFIX) val fromArgs = parseArguments(rawArgs) - return fromArgs overriding fromEnv overriding fromProperties + return fromArgs overriding fromEnv overriding fromSystem overriding fromProperties + } + + private fun getSystemConfig(): Configuration { + val dataflowIdPair = this.dataflowReplicaId to getNewDataflowId() + return ConfigurationMap(dataflowIdPair) + } + + fun getNewDataflowId(assignRandomUuid: Boolean = false): String { + if (!assignRandomUuid) { + try { + return InetAddress.getLocalHost().hostName + } catch (_: Exception) { + } + } + return 
"seldon-dataflow-engine-" + UUID.randomUUID().toString() } private fun parseArguments(rawArgs: Array): Configuration { diff --git a/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Main.kt b/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Main.kt index 8d4a899eaa..b064a974f5 100644 --- a/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Main.kt +++ b/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Main.kt @@ -102,9 +102,11 @@ object Main { describeRetries = config[Cli.topicDescribeRetries], describeRetryDelayMillis = config[Cli.topicDescribeRetryDelayMillis], ) + val subscriberId = config[Cli.dataflowReplicaId] + val subscriber = PipelineSubscriber( - "seldon-dataflow-engine", + subscriberId, kafkaProperties, kafkaAdminProperties, kafkaStreamsParams, diff --git a/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/PipelineSubscriber.kt b/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/PipelineSubscriber.kt index f1d3de4066..b9a2c16a30 100644 --- a/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/PipelineSubscriber.kt +++ b/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/PipelineSubscriber.kt @@ -37,6 +37,7 @@ import kotlinx.coroutines.flow.onCompletion import kotlinx.coroutines.flow.onEach import kotlinx.coroutines.runBlocking import java.util.concurrent.ConcurrentHashMap +import java.util.concurrent.TimeUnit import io.klogging.logger as coLogger @OptIn(FlowPreview::class) @@ -60,6 +61,9 @@ class PipelineSubscriber( .defaultServiceConfig(grpcServiceConfig) .usePlaintext() // Use TLS .enableRetry() + // these keep alive settings need to match the go counterpart in scheduler/pkg/util/constants.go + .keepAliveTime(60L, TimeUnit.SECONDS) + .keepAliveTimeout(2L, TimeUnit.SECONDS) .build() private val client = ChainerGrpcKt.ChainerCoroutineStub(channel) diff --git a/scheduler/data-flow/src/main/resources/local.properties b/scheduler/data-flow/src/main/resources/local.properties index 46a7218380..68b3bd408d 100644 --- a/scheduler/data-flow/src/main/resources/local.properties +++ b/scheduler/data-flow/src/main/resources/local.properties @@ -1,5 +1,6 @@ log.level.app=INFO log.level.kafka=WARN +dataflow.replica.id=seldon-dataflow-engine kafka.bootstrap.servers=localhost:9092 kafka.consumer.prefix= kafka.security.protocol=PLAINTEXT diff --git a/scheduler/data-flow/src/test/kotlin/io/seldon/dataflow/CliTest.kt b/scheduler/data-flow/src/test/kotlin/io/seldon/dataflow/CliTest.kt index 9011ff3d4c..52a97fa4b5 100644 --- a/scheduler/data-flow/src/test/kotlin/io/seldon/dataflow/CliTest.kt +++ b/scheduler/data-flow/src/test/kotlin/io/seldon/dataflow/CliTest.kt @@ -16,9 +16,15 @@ import org.junit.jupiter.params.provider.Arguments import org.junit.jupiter.params.provider.Arguments.arguments import org.junit.jupiter.params.provider.MethodSource import strikt.api.expectCatching +import strikt.api.expectThat +import strikt.assertions.hasLength import strikt.assertions.isEqualTo +import strikt.assertions.isNotEqualTo import strikt.assertions.isSuccess +import strikt.assertions.startsWith +import java.util.UUID import java.util.stream.Stream +import kotlin.test.Test internal class CliTest { @DisplayName("Passing auth mechanism via cli argument") @@ -36,6 +42,31 @@ internal class CliTest { .isEqualTo(expectedMechanism) } + @Test + fun `should handle dataflow replica id`() { + val cliDefault = Cli.configWith(arrayOf()) + val testReplicaId = "dataflow-id-1" + val cli = Cli.configWith(arrayOf("--dataflow-replica-id", testReplicaId)) + + 
expectThat(cliDefault[Cli.dataflowReplicaId]) { + isNotEqualTo("seldon-dataflow-engine") + } + expectThat(cli[Cli.dataflowReplicaId]) { + isEqualTo(testReplicaId) + } + + // test random Uuid (v4) + val expectedReplicaIdPrefix = "seldon-dataflow-engine-" + val uuidStringLength = 36 + val randomReplicaUuid = Cli.getNewDataflowId(true) + expectThat(randomReplicaUuid) { + startsWith(expectedReplicaIdPrefix) + hasLength(expectedReplicaIdPrefix.length + uuidStringLength) + } + expectCatching { UUID.fromString(randomReplicaUuid.removePrefix(expectedReplicaIdPrefix)) } + .isSuccess() + } + companion object { @JvmStatic private fun saslMechanisms(): Stream { diff --git a/scheduler/pkg/agent/client.go b/scheduler/pkg/agent/client.go index 7698df52ab..79a388266d 100644 --- a/scheduler/pkg/agent/client.go +++ b/scheduler/pkg/agent/client.go @@ -24,7 +24,6 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" - "google.golang.org/grpc/keepalive" "google.golang.org/protobuf/encoding/protojson" "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" @@ -237,8 +236,7 @@ func (c *Client) Start() error { logFailure := func(err error, delay time.Duration) { c.logger.WithError(err).Errorf("Scheduler not ready") } - backOffExp := backoff.NewExponentialBackOff() - backOffExp.MaxElapsedTime = 0 // Never stop due to large time between calls + backOffExp := util.GetClientExponentialBackoff() err := backoff.RetryNotify(c.StartService, backOffExp, logFailure) if err != nil { c.logger.WithError(err).Fatal("Failed to start client") @@ -417,11 +415,7 @@ func (c *Client) getConnection(host string, plainTxtPort int, tlsPort int) (*grp logger.Infof("Connecting (non-blocking) to scheduler at %s:%d", host, port) - kacp := keepalive.ClientParameters{ - Time: util.ClientKeapAliveTime, - Timeout: util.ClientKeapAliveTimeout, - PermitWithoutStream: util.ClientKeapAlivePermit, - } + kacp := util.GetClientKeepAliveParameters() opts := []grpc.DialOption{ grpc.WithTransportCredentials(transCreds), @@ -453,7 +447,7 @@ func (c *Client) StartService() error { Shared: true, AvailableMemoryBytes: c.stateManager.GetAvailableMemoryBytesWithOverCommit(), }, - grpc_retry.WithMax(1), + grpc_retry.WithMax(util.MaxGRPCRetriesOnStream), ) // TODO make configurable if err != nil { return err diff --git a/scheduler/pkg/agent/modelserver_controlplane/oip/v2.go b/scheduler/pkg/agent/modelserver_controlplane/oip/v2.go index 136de4e171..39fad22bf0 100644 --- a/scheduler/pkg/agent/modelserver_controlplane/oip/v2.go +++ b/scheduler/pkg/agent/modelserver_controlplane/oip/v2.go @@ -19,7 +19,6 @@ import ( "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" - "google.golang.org/grpc/keepalive" "google.golang.org/grpc/status" v2 "github.com/seldonio/seldon-core/apis/go/v2/mlops/v2_dataplane" @@ -51,19 +50,12 @@ func CreateV2GrpcConnection(v2Config V2Config) (*grpc.ClientConn, error) { grpc_retry.WithMax(v2Config.GRPRetryMaxCount), } - kacp := keepalive.ClientParameters{ - Time: util.ClientKeapAliveTime, - Timeout: util.ClientKeapAliveTimeout, - PermitWithoutStream: util.ClientKeapAlivePermit, - } - opts := []grpc.DialOption{ grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(v2Config.GRPCMaxMsgSizeBytes), grpc.MaxCallSendMsgSize(v2Config.GRPCMaxMsgSizeBytes)), grpc.WithStreamInterceptor(grpc_retry.StreamClientInterceptor(retryOpts...)), 
grpc.WithUnaryInterceptor(grpc_retry.UnaryClientInterceptor(retryOpts...)), grpc.WithStatsHandler(otelgrpc.NewClientHandler()), - grpc.WithKeepaliveParams(kacp), } conn, err := grpc.NewClient(fmt.Sprintf("%s:%d", v2Config.Host, v2Config.GRPCPort), opts...) if err != nil { diff --git a/scheduler/pkg/agent/server.go b/scheduler/pkg/agent/server.go index 5388b8d552..6744916ee9 100644 --- a/scheduler/pkg/agent/server.go +++ b/scheduler/pkg/agent/server.go @@ -112,6 +112,7 @@ type Server struct { certificateStore *seldontls.CertificateStore waiter *modelRelocatedWaiter // waiter for when we want to drain a particular server replica autoscalingServiceEnabled bool + agentMutex sync.Map // to force a serial order per agent (serverName, replicaIdx) } type SchedulerAgent interface { @@ -138,6 +139,7 @@ func NewAgentServer( scheduler: scheduler, waiter: newModelRelocatedWaiter(), autoscalingServiceEnabled: autoscalingServiceEnabled, + agentMutex: sync.Map{}, } hub.RegisterModelEventHandler( @@ -165,12 +167,16 @@ func (s *Server) startServer(port uint, secure bool) error { if err != nil { return err } + + kaep := util.GetServerKeepAliveEnforcementPolicy() + opts := []grpc.ServerOption{} if secure { opts = append(opts, grpc.Creds(s.certificateStore.CreateServerTransportCredentials())) } opts = append(opts, grpc.MaxConcurrentStreams(grpcMaxConcurrentStreams)) opts = append(opts, grpc.StatsHandler(otelgrpc.NewServerHandler())) + opts = append(opts, grpc.KeepaliveEnforcementPolicy(kaep)) grpcServer := grpc.NewServer(opts...) pb.RegisterAgentServiceServer(grpcServer, s) s.logger.Printf("Agent server running on %d mtls:%v", port, secure) @@ -379,12 +385,20 @@ func (s *Server) ModelScalingTrigger(stream pb.AgentService_ModelScalingTriggerS func (s *Server) Subscribe(request *pb.AgentSubscribeRequest, stream pb.AgentService_SubscribeServer) error { logger := s.logger.WithField("func", "Subscribe") + key := ServerKey{serverName: request.ServerName, replicaIdx: request.ReplicaIdx} + + // this is forcing a serial order per agent (serverName, replicaIdx) + // in general this will make sure that a given agent disconnects fully before another agent is allowed to connect + mu, _ := s.agentMutex.LoadOrStore(key, &sync.Mutex{}) + mu.(*sync.Mutex).Lock() + defer mu.(*sync.Mutex).Unlock() + logger.Infof("Received subscribe request from %s:%d", request.ServerName, request.ReplicaIdx) fin := make(chan bool) s.mutex.Lock() - s.agents[ServerKey{serverName: request.ServerName, replicaIdx: request.ReplicaIdx}] = &AgentSubscriber{ + s.agents[key] = &AgentSubscriber{ finished: fin, stream: stream, } @@ -410,7 +424,7 @@ func (s *Server) Subscribe(request *pb.AgentSubscribeRequest, stream pb.AgentSer case <-ctx.Done(): logger.Infof("Client replica %s:%d has disconnected", request.ServerName, request.ReplicaIdx) s.mutex.Lock() - delete(s.agents, ServerKey{serverName: request.ServerName, replicaIdx: request.ReplicaIdx}) + delete(s.agents, key) s.mutex.Unlock() s.removeServerReplicaImpl(request.GetServerName(), int(request.GetReplicaIdx())) // this is non-blocking beyond rescheduling models on removed server return nil diff --git a/scheduler/pkg/agent/server_test.go b/scheduler/pkg/agent/server_test.go index 088c6e6237..75b6adf56b 100644 --- a/scheduler/pkg/agent/server_test.go +++ b/scheduler/pkg/agent/server_test.go @@ -10,21 +10,40 @@ the Change License after the Change Date as each is defined in accordance with t package agent import ( + "context" "fmt" + "sync" "testing" "time" . 
"github.com/onsi/gomega" log "github.com/sirupsen/logrus" "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pbs "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" + testing_utils "github.com/seldonio/seldon-core/scheduler/v2/pkg/internal/testing_utils" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) +type mockScheduler struct { +} + +var _ scheduler.Scheduler = (*mockScheduler)(nil) + +func (s mockScheduler) Schedule(_ string) error { + return nil +} + +func (s mockScheduler) ScheduleFailedModels() ([]string, error) { + return nil, nil +} + type mockStore struct { models map[string]*store.ModelSnapshot } @@ -91,7 +110,7 @@ func (m *mockStore) UpdateModelState(modelKey string, version uint32, serverKey } func (m *mockStore) AddServerReplica(request *pb.AgentSubscribeRequest) error { - panic("implement me") + return nil } func (m *mockStore) ServerNotify(request *pbs.ServerNotify) error { @@ -99,7 +118,7 @@ func (m *mockStore) ServerNotify(request *pbs.ServerNotify) error { } func (m *mockStore) RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) { - panic("implement me") + return nil, nil } func (m *mockStore) DrainServerReplica(serverName string, replicaIdx int) ([]string, error) { @@ -943,3 +962,122 @@ func TestAutoscalingEnabled(t *testing.T) { } } + +func TestSubscribe(t *testing.T) { + log.SetLevel(log.DebugLevel) + g := NewGomegaWithT(t) + + type ag struct { + id uint32 + doClose bool + } + type test struct { + name string + agents []ag + expectedAgentsCount int + expectedAgentsCountAfterClose int + } + tests := []test{ + { + name: "simple", + agents: []ag{ + {1, true}, {2, true}, + }, + expectedAgentsCount: 2, + expectedAgentsCountAfterClose: 0, + }, + { + name: "simple - no close", + agents: []ag{ + {1, true}, {2, false}, + }, + expectedAgentsCount: 2, + expectedAgentsCountAfterClose: 1, + }, + { + name: "duplicates", + agents: []ag{ + {1, true}, {1, false}, + }, + expectedAgentsCount: 1, + expectedAgentsCountAfterClose: 1, + }, + { + name: "duplicates with all close", + agents: []ag{ + {1, true}, {1, true}, {1, true}, + }, + expectedAgentsCount: 1, + expectedAgentsCountAfterClose: 0, + }, + } + + getStream := func(id uint32, context context.Context, port int) *grpc.ClientConn { + conn, _ := grpc.NewClient(fmt.Sprintf(":%d", port), grpc.WithTransportCredentials(insecure.NewCredentials())) + grpcClient := agent.NewAgentServiceClient(conn) + _, _ = grpcClient.Subscribe( + context, + &agent.AgentSubscribeRequest{ + ServerName: "dummy", + ReplicaIdx: id, + ReplicaConfig: &agent.ReplicaConfig{}, + Shared: true, + AvailableMemoryBytes: 0, + }, + ) + return conn + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + logger := log.New() + eventHub, err := coordinator.NewEventHub(logger) + g.Expect(err).To(BeNil()) + server := NewAgentServer(logger, &mockStore{}, mockScheduler{}, eventHub, false) + port, err := testing_utils.GetFreePortForTest() + if err != nil { + t.Fatal(err) + } + err = server.startServer(uint(port), false) + if err != nil { + t.Fatal(err) + } + time.Sleep(100 * time.Millisecond) + + mu := sync.Mutex{} + streams := make([]*grpc.ClientConn, 0) + for _, a := range test.agents { + go func(id uint32) { + conn := getStream(id, context.Background(), 
port) + mu.Lock() + streams = append(streams, conn) + mu.Unlock() + }(a.id) + } + + maxCount := 10 + count := 0 + for len(server.agents) != test.expectedAgentsCount && count < maxCount { + time.Sleep(100 * time.Millisecond) + count++ + } + g.Expect(len(server.agents)).To(Equal(test.expectedAgentsCount)) + + for idx, s := range streams { + go func(idx int, s *grpc.ClientConn) { + if test.agents[idx].doClose { + s.Close() + } + }(idx, s) + } + + count = 0 + for len(server.agents) != test.expectedAgentsCountAfterClose && count < maxCount { + time.Sleep(100 * time.Millisecond) + count++ + } + g.Expect(len(server.agents)).To(Equal(test.expectedAgentsCountAfterClose)) + + server.StopAgentStreams() + }) + } +} diff --git a/scheduler/pkg/envoy/server/server.go b/scheduler/pkg/envoy/server/server.go index a46eaff52c..cb9d1bceed 100644 --- a/scheduler/pkg/envoy/server/server.go +++ b/scheduler/pkg/envoy/server/server.go @@ -24,6 +24,8 @@ import ( "google.golang.org/grpc" seldontls "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" + + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) const ( @@ -66,12 +68,14 @@ func (x *XDSServer) StartXDSServer(port uint) error { return err } } + kaep := util.GetServerKeepAliveEnforcementPolicy() secure := x.certificateStore != nil var grpcOptions []grpc.ServerOption if secure { grpcOptions = append(grpcOptions, grpc.Creds(x.certificateStore.CreateServerTransportCredentials())) } grpcOptions = append(grpcOptions, grpc.MaxConcurrentStreams(grpcMaxConcurrentStreams)) + grpcOptions = append(grpcOptions, grpc.KeepaliveEnforcementPolicy(kaep)) grpcServer := grpc.NewServer(grpcOptions...) lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) diff --git a/scheduler/pkg/kafka/dataflow/server.go b/scheduler/pkg/kafka/dataflow/server.go index 23ab122364..aeed2ce299 100644 --- a/scheduler/pkg/kafka/dataflow/server.go +++ b/scheduler/pkg/kafka/dataflow/server.go @@ -44,6 +44,7 @@ type ChainerServer struct { pipelineHandler pipeline.PipelineHandler topicNamer *kafka.TopicNamer loadBalancer util.LoadBalancer + chainerMutex sync.Map chainer.UnimplementedChainerServer } @@ -66,6 +67,7 @@ func NewChainerServer(logger log.FieldLogger, eventHub *coordinator.EventHub, pi pipelineHandler: pipelineHandler, topicNamer: topicNamer, loadBalancer: loadBalancer, + chainerMutex: sync.Map{}, } eventHub.RegisterPipelineEventHandler( @@ -82,8 +84,12 @@ func (c *ChainerServer) StartGrpcServer(agentPort uint) error { if err != nil { log.Fatalf("failed to create listener: %v", err) } + + kaep := util.GetServerKeepAliveEnforcementPolicy() + var grpcOptions []grpc.ServerOption grpcOptions = append(grpcOptions, grpc.MaxConcurrentStreams(grpcMaxConcurrentStreams)) + grpcOptions = append(grpcOptions, grpc.KeepaliveEnforcementPolicy(kaep)) grpcServer := grpc.NewServer(grpcOptions...) 
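The enforcement policy wired into this server (and into the agent, xDS and scheduler gRPC servers in the other hunks of this patch) makes the server close connections from clients whose keepalive pings arrive more often than its MinTime, which gRPC reports as a GOAWAY with "too_many_pings". That is why the patch keeps the 60s/2s values in one place and mirrors them in the Kotlin PipelineSubscriber and the Envoy connection_keepalive settings. The standalone Go sketch below shows the two halves together; it is illustrative only and not code from this patch, with values copied from util/constants.go.

```go
package main

import (
	"log"
	"net"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/keepalive"
)

func main() {
	// Server side: refuse keepalive pings that arrive more often than every
	// 60s, matching the policy the patch attaches to its gRPC servers.
	srv := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
		MinTime:             60 * time.Second,
		PermitWithoutStream: false,
	}))
	lis, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		log.Fatal(err)
	}
	go func() { _ = srv.Serve(lis) }()
	defer srv.Stop()

	// Client side: ping at the same 60s interval with a 2s timeout, matching
	// the Kotlin subscriber's keepAliveTime/keepAliveTimeout and Envoy's
	// connection_keepalive block. A client pinging more aggressively than the
	// server's MinTime would eventually be disconnected.
	conn, err := grpc.NewClient(lis.Addr().String(),
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithKeepaliveParams(keepalive.ClientParameters{
			Time:                60 * time.Second,
			Timeout:             2 * time.Second,
			PermitWithoutStream: false,
		}),
	)
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()
	log.Printf("client for %s configured with matching keepalive settings", lis.Addr())
}
```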
chainer.RegisterChainerServer(grpcServer, c) c.logger.Printf("Chainer server running on %d", agentPort) @@ -121,17 +127,25 @@ func (c *ChainerServer) PipelineUpdateEvent(ctx context.Context, message *chaine func (c *ChainerServer) SubscribePipelineUpdates(req *chainer.PipelineSubscriptionRequest, stream chainer.Chainer_SubscribePipelineUpdatesServer) error { logger := c.logger.WithField("func", "SubscribePipelineStatus") + + key := req.GetName() + // this is forcing a serial order per dataflow-engine + // in general this will make sure that a given dataflow-engine disconnects fully before another dataflow-engine is allowed to connect + mu, _ := c.chainerMutex.LoadOrStore(key, &sync.Mutex{}) + mu.(*sync.Mutex).Lock() + defer mu.(*sync.Mutex).Unlock() + logger.Infof("Received subscribe request from %s", req.GetName()) fin := make(chan bool) c.mu.Lock() - c.streams[req.Name] = &ChainerSubscription{ - name: req.Name, + c.streams[key] = &ChainerSubscription{ + name: key, stream: stream, fin: fin, } - c.loadBalancer.AddServer(req.Name) + c.loadBalancer.AddServer(key) c.mu.Unlock() // Handle addition of new server @@ -144,13 +158,13 @@ func (c *ChainerServer) SubscribePipelineUpdates(req *chainer.PipelineSubscripti for { select { case <-fin: - logger.Infof("Closing stream for %s", req.GetName()) + logger.Infof("Closing stream for %s", key) return nil case <-ctx.Done(): - logger.Infof("Stream disconnected %s", req.GetName()) + logger.Infof("Stream disconnected %s", key) c.mu.Lock() - c.loadBalancer.RemoveServer(req.Name) - delete(c.streams, req.Name) + c.loadBalancer.RemoveServer(key) + delete(c.streams, key) c.mu.Unlock() // Handle removal of server c.rebalance() diff --git a/scheduler/pkg/kafka/dataflow/server_test.go b/scheduler/pkg/kafka/dataflow/server_test.go index d7f0860f4b..e52780a9a1 100644 --- a/scheduler/pkg/kafka/dataflow/server_test.go +++ b/scheduler/pkg/kafka/dataflow/server_test.go @@ -10,19 +10,23 @@ the Change License after the Change Date as each is defined in accordance with t package dataflow import ( + "context" "fmt" "os" + "sync" "testing" "time" . 
"github.com/onsi/gomega" log "github.com/sirupsen/logrus" "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" "github.com/seldonio/seldon-core/apis/go/v2/mlops/chainer" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" + testing_utils "github.com/seldonio/seldon-core/scheduler/v2/pkg/internal/testing_utils" "github.com/seldonio/seldon-core/scheduler/v2/pkg/kafka" "github.com/seldonio/seldon-core/scheduler/v2/pkg/kafka/config" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" @@ -321,7 +325,7 @@ func TestPipelineRollingUpgradeEvents(t *testing.T) { } // to allow events to propagate - time.Sleep(500 * time.Millisecond) + time.Sleep(700 * time.Millisecond) if test.connection { if test.loadReqV2 != nil { @@ -634,6 +638,136 @@ func TestPipelineRebalance(t *testing.T) { } } +func TestPipelineSubscribe(t *testing.T) { + g := NewGomegaWithT(t) + type ag struct { + id uint32 + doClose bool + } + + type test struct { + name string + agents []ag + expectedAgentsCount int + expectedAgentsCountAfterClose int + } + + tests := []test{ + { + name: "single connection", + agents: []ag{ + {id: 1, doClose: true}, + }, + expectedAgentsCount: 1, + expectedAgentsCountAfterClose: 0, + }, + { + name: "multiple connection - one not closed", + agents: []ag{ + {id: 1, doClose: false}, {id: 2, doClose: true}, + }, + expectedAgentsCount: 2, + expectedAgentsCountAfterClose: 1, + }, + { + name: "multiple connection - not closed", + agents: []ag{ + {id: 1, doClose: false}, {id: 2, doClose: false}, + }, + expectedAgentsCount: 2, + expectedAgentsCountAfterClose: 2, + }, + { + name: "multiple connection - closed", + agents: []ag{ + {id: 1, doClose: true}, {id: 2, doClose: true}, + }, + expectedAgentsCount: 2, + expectedAgentsCountAfterClose: 0, + }, + { + name: "multiple connection - duplicate", + agents: []ag{ + {id: 1, doClose: true}, {id: 1, doClose: true}, {id: 1, doClose: true}, + }, + expectedAgentsCount: 1, + expectedAgentsCountAfterClose: 0, + }, + { + name: "multiple connection - duplicate not closed", + agents: []ag{ + {id: 1, doClose: true}, {id: 1, doClose: false}, {id: 1, doClose: true}, + }, + expectedAgentsCount: 1, + expectedAgentsCountAfterClose: 1, + }, + } + + getStream := func(id uint32, context context.Context, port int) *grpc.ClientConn { + conn, _ := grpc.NewClient(fmt.Sprintf(":%d", port), grpc.WithTransportCredentials(insecure.NewCredentials())) + grpcClient := chainer.NewChainerClient(conn) + _, _ = grpcClient.SubscribePipelineUpdates( + context, + &chainer.PipelineSubscriptionRequest{ + Name: fmt.Sprintf("agent-%d", id), + }, + ) + return conn + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + serverName := "dummy" + s, _ := createTestScheduler(t, serverName) + port, err := testing_utils.GetFreePortForTest() + if err != nil { + t.Fatal(err) + } + go func() { + _ = s.StartGrpcServer(uint(port)) + }() + + time.Sleep(100 * time.Millisecond) + + mu := sync.Mutex{} + streams := make([]*grpc.ClientConn, 0) + for _, a := range test.agents { + go func(id uint32) { + conn := getStream(id, context.Background(), port) + mu.Lock() + streams = append(streams, conn) + mu.Unlock() + }(a.id) + } + + maxCount := 10 + count := 0 + for len(s.streams) != test.expectedAgentsCount && count < maxCount { + time.Sleep(100 * time.Millisecond) + count++ + } + g.Expect(len(s.streams)).To(Equal(test.expectedAgentsCount)) + + for idx, s := range streams { + go func(idx int, s *grpc.ClientConn) { + if 
test.agents[idx].doClose { + s.Close() + } + }(idx, s) + } + + count = 0 + for len(s.streams) != test.expectedAgentsCountAfterClose && count < maxCount { + time.Sleep(100 * time.Millisecond) + count++ + } + g.Expect(len(s.streams)).To(Equal(test.expectedAgentsCountAfterClose)) + + s.StopSendPipelineEvents() + }) + } +} + type stubChainerServer struct { msgs chan *chainer.PipelineUpdateMessage grpc.ServerStream diff --git a/scheduler/pkg/kafka/gateway/client.go b/scheduler/pkg/kafka/gateway/client.go index def18eb33f..f2c5c60131 100644 --- a/scheduler/pkg/kafka/gateway/client.go +++ b/scheduler/pkg/kafka/gateway/client.go @@ -22,7 +22,6 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" - "google.golang.org/grpc/keepalive" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" seldontls "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" @@ -81,11 +80,7 @@ func (kc *KafkaSchedulerClient) ConnectToScheduler(host string, plainTxtPort int port = tlsPort } - kacp := keepalive.ClientParameters{ - Time: util.ClientKeapAliveTime, - Timeout: util.ClientKeapAliveTimeout, - PermitWithoutStream: util.ClientKeapAlivePermit, - } + kacp := util.GetClientKeepAliveParameters() // note: retry is done in the caller opts := []grpc.DialOption{ @@ -123,11 +118,7 @@ func (kc *KafkaSchedulerClient) Start() error { logFailure := func(err error, delay time.Duration) { kc.logger.WithError(err).Errorf("Scheduler not ready") } - backOffExp := backoff.NewExponentialBackOff() - // Set some reasonable settings for trying to reconnect to scheduler - backOffExp.MaxElapsedTime = 0 // Never stop due to large time between calls - backOffExp.MaxInterval = time.Second * 15 - backOffExp.InitialInterval = time.Second + backOffExp := util.GetClientExponentialBackoff() err := backoff.RetryNotify(kc.SubscribeModelEvents, backOffExp, logFailure) if err != nil { kc.logger.WithError(err).Fatal("Failed to start modelgateway client") @@ -141,7 +132,11 @@ func (kc *KafkaSchedulerClient) SubscribeModelEvents() error { logger := kc.logger.WithField("func", "SubscribeModelEvents") grpcClient := scheduler.NewSchedulerClient(kc.conn) logger.Info("Subscribing to model status events") - stream, errSub := grpcClient.SubscribeModelStatus(context.Background(), &scheduler.ModelSubscriptionRequest{SubscriberName: SubscriberName}, grpc_retry.WithMax(100)) + stream, errSub := grpcClient.SubscribeModelStatus( + context.Background(), + &scheduler.ModelSubscriptionRequest{SubscriberName: SubscriberName}, + grpc_retry.WithMax(util.MaxGRPCRetriesOnStream), + ) if errSub != nil { return errSub } @@ -164,6 +159,16 @@ func (kc *KafkaSchedulerClient) SubscribeModelEvents() error { logger.Infof("Received event name %s version %d state %s", event.ModelName, latestVersionStatus.Version, latestVersionStatus.State.State.String()) + // if the model is in a failed state and the consumer exists then we skip the removal + // this is to prevent the consumer from being removed during transient failures of the control plane + // in this way data plane can potentially continue to serve requests + if latestVersionStatus.GetState().GetState() == scheduler.ModelStatus_ScheduleFailed || latestVersionStatus.GetState().GetState() == scheduler.ModelStatus_ModelProgressing { + if kc.consumerManager.Exists(event.ModelName) { + logger.Warnf("Model %s schedule failed / progressing and consumer exists, skipping from removal", event.ModelName) + continue + } + } + // if there are available replicas then 
we add the consumer for the model // note that this will also get triggered if the model is already added but there is a status change (e.g. due to scale up) // and in the case then it is a no-op diff --git a/scheduler/pkg/kafka/gateway/worker.go b/scheduler/pkg/kafka/gateway/worker.go index 3852556fdd..6b544c01a1 100644 --- a/scheduler/pkg/kafka/gateway/worker.go +++ b/scheduler/pkg/kafka/gateway/worker.go @@ -32,7 +32,6 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" - "google.golang.org/grpc/keepalive" "google.golang.org/grpc/metadata" "google.golang.org/protobuf/proto" @@ -124,12 +123,6 @@ func (iw *InferWorker) getGrpcClient(host string, port int) (v2.GRPCInferenceSer creds = insecure.NewCredentials() } - kacp := keepalive.ClientParameters{ - Time: util.ClientKeapAliveTime, - Timeout: util.ClientKeapAliveTimeout, - PermitWithoutStream: util.ClientKeapAlivePermit, - } - opts := []grpc.DialOption{ grpc.WithTransportCredentials(creds), grpc.WithDefaultCallOptions( @@ -142,7 +135,6 @@ func (iw *InferWorker) getGrpcClient(host string, port int) (v2.GRPCInferenceSer grpc.WithUnaryInterceptor( grpc_retry.UnaryClientInterceptor(retryOpts...), ), - grpc.WithKeepaliveParams(kacp), } conn, err := grpc.NewClient(fmt.Sprintf("%s:%d", host, port), opts...) diff --git a/scheduler/pkg/kafka/pipeline/status/client.go b/scheduler/pkg/kafka/pipeline/status/client.go index df77b4d91d..e9191bc4e1 100644 --- a/scheduler/pkg/kafka/pipeline/status/client.go +++ b/scheduler/pkg/kafka/pipeline/status/client.go @@ -22,7 +22,6 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" - "google.golang.org/grpc/keepalive" "google.golang.org/protobuf/encoding/protojson" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" @@ -87,11 +86,7 @@ func (pc *PipelineSchedulerClient) connectToScheduler(host string, plainTxtPort port = tlsPort } - kacp := keepalive.ClientParameters{ - Time: util.ClientKeapAliveTime, - Timeout: util.ClientKeapAliveTimeout, - PermitWithoutStream: util.ClientKeapAlivePermit, - } + kacp := util.GetClientKeepAliveParameters() // note: retry is done in the caller opts := []grpc.DialOption{ @@ -129,11 +124,7 @@ func (pc *PipelineSchedulerClient) Start(host string, plainTxtPort int, tlsPort logFailure := func(err error, delay time.Duration) { logger.WithError(err).Errorf("Scheduler not ready") } - backOffExp := backoff.NewExponentialBackOff() - // Set some reasonable settings for trying to reconnect to scheduler - backOffExp.MaxElapsedTime = 0 // Never stop due to large time between calls - backOffExp.MaxInterval = time.Second * 15 - backOffExp.InitialInterval = time.Second + backOffExp := util.GetClientExponentialBackoff() err = backoff.RetryNotify(pc.SubscribePipelineEvents, backOffExp, logFailure) if err != nil { logger.WithError(err).Fatal("Failed to start pipeline gateway client") @@ -147,7 +138,11 @@ func (pc *PipelineSchedulerClient) SubscribePipelineEvents() error { logger := pc.logger.WithField("func", "SubscribePipelineEvents") grpcClient := scheduler.NewSchedulerClient(pc.conn) logger.Info("Subscribing to pipeline status events") - stream, errSub := grpcClient.SubscribePipelineStatus(context.Background(), &scheduler.PipelineSubscriptionRequest{SubscriberName: SubscriberName}, grpc_retry.WithMax(100)) + stream, errSub := grpcClient.SubscribePipelineStatus( + context.Background(), + &scheduler.PipelineSubscriptionRequest{SubscriberName: 
SubscriberName}, + grpc_retry.WithMax(util.MaxGRPCRetriesOnStream), + ) if errSub != nil { return errSub } diff --git a/scheduler/pkg/server/server.go b/scheduler/pkg/server/server.go index 3f74c40da1..a9a369989c 100644 --- a/scheduler/pkg/server/server.go +++ b/scheduler/pkg/server/server.go @@ -32,6 +32,7 @@ import ( "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/experiment" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" "github.com/seldonio/seldon-core/scheduler/v2/pkg/synchroniser" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) const ( @@ -131,12 +132,16 @@ func (s *SchedulerServer) startServer(port uint, secure bool) error { if err != nil { return err } + + kaep := util.GetServerKeepAliveEnforcementPolicy() + opts := []grpc.ServerOption{} if secure { opts = append(opts, grpc.Creds(s.certificateStore.CreateServerTransportCredentials())) } opts = append(opts, grpc.MaxConcurrentStreams(grpcMaxConcurrentStreams)) opts = append(opts, grpc.StatsHandler(otelgrpc.NewServerHandler())) + opts = append(opts, grpc.KeepaliveEnforcementPolicy(kaep)) grpcServer := grpc.NewServer(opts...) pb.RegisterSchedulerServer(grpcServer, s) s.logger.Printf("Scheduler server running on %d mtls:%v", port, secure) diff --git a/scheduler/pkg/store/experiment/db_test.go b/scheduler/pkg/store/experiment/db_test.go index 44f99abe4a..b8085a1e46 100644 --- a/scheduler/pkg/store/experiment/db_test.go +++ b/scheduler/pkg/store/experiment/db_test.go @@ -12,7 +12,6 @@ package experiment import ( "fmt" "testing" - "time" "github.com/dgraph-io/badger/v3" "github.com/google/go-cmp/cmp" @@ -105,7 +104,6 @@ func TestSaveWithTTL(t *testing.T) { logger := log.New() db, err := newExperimentDbManager(getExperimentDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) - experiment.DeletedAt = time.Now() err = db.save(experiment) g.Expect(err).To(BeNil()) @@ -117,6 +115,18 @@ func TestSaveWithTTL(t *testing.T) { g.Expect(err).To(BeNil()) g.Expect(item.ExpiresAt()).ToNot(BeZero()) + // check that the resource can be "undeleted" + experiment.Deleted = false + err = db.save(experiment) + g.Expect(err).To(BeNil()) + + err = db.db.View(func(txn *badger.Txn) error { + item, err = txn.Get(([]byte(experiment.Name))) + return err + }) + g.Expect(err).To(BeNil()) + g.Expect(item.ExpiresAt()).To(BeZero()) + err = db.Stop() g.Expect(err).To(BeNil()) } diff --git a/scheduler/pkg/store/experiment/store.go b/scheduler/pkg/store/experiment/store.go index f5cf49de66..d4ba9ed918 100644 --- a/scheduler/pkg/store/experiment/store.go +++ b/scheduler/pkg/store/experiment/store.go @@ -474,7 +474,6 @@ func (es *ExperimentStore) GetExperiments() ([]*Experiment, error) { func (es *ExperimentStore) cleanupDeletedExperiments() { es.mu.Lock() defer es.mu.Unlock() - es.logger.Info("cleaning up deleted experiments") for _, experiment := range es.experiments { if experiment.Deleted { if experiment.DeletedAt.IsZero() { @@ -487,6 +486,7 @@ func (es *ExperimentStore) cleanupDeletedExperiments() { } } else if experiment.DeletedAt.Add(es.db.deletedResourceTTL).Before(time.Now()) { delete(es.experiments, experiment.Name) + es.logger.Info("cleaning up deleted experiment: %s", experiment.Name) } } } diff --git a/scheduler/pkg/store/pipeline/db_test.go b/scheduler/pkg/store/pipeline/db_test.go index ce3ca86cdb..414cf90638 100644 --- a/scheduler/pkg/store/pipeline/db_test.go +++ b/scheduler/pkg/store/pipeline/db_test.go @@ -67,6 +67,18 @@ func TestSaveWithTTL(t *testing.T) { g.Expect(err).To(BeNil()) 
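Both TestSaveWithTTL tests (the experiment variant above and the pipeline variant in this hunk) pin down the intended Badger behaviour: a deleted resource is written with a TTL so the entry expires on its own, and re-saving it with Deleted set back to false writes a plain entry, which clears the expiry. The save implementations themselves are not reproduced in these hunks, so the sketch below only illustrates the assumed shape; the Resource type, the saveWithOptionalTTL helper, the JSON encoding and the 10-minute TTL are placeholders rather than code from this patch.

```go
package main

import (
	"encoding/json"
	"log"
	"time"

	"github.com/dgraph-io/badger/v3"
)

// Resource stands in for the pipeline/experiment snapshots the scheduler
// persists; only the fields relevant to the TTL behaviour are included.
type Resource struct {
	Name    string
	Deleted bool
}

// saveWithOptionalTTL writes deleted resources with a TTL and everything else
// as a plain entry; the plain write overwrites any earlier TTL, so
// item.ExpiresAt() drops back to zero after an "undelete".
func saveWithOptionalTTL(db *badger.DB, r *Resource, ttl time.Duration) error {
	return db.Update(func(txn *badger.Txn) error {
		val, err := json.Marshal(r) // placeholder encoding
		if err != nil {
			return err
		}
		key := []byte(r.Name)
		if r.Deleted {
			return txn.SetEntry(badger.NewEntry(key, val).WithTTL(ttl))
		}
		return txn.Set(key, val)
	})
}

func main() {
	db, err := badger.Open(badger.DefaultOptions("").WithInMemory(true))
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	r := &Resource{Name: "pipeline-1", Deleted: true}
	_ = saveWithOptionalTTL(db, r, 10*time.Minute)
	_ = db.View(func(txn *badger.Txn) error {
		item, err := txn.Get([]byte(r.Name))
		if err != nil {
			return err
		}
		log.Printf("deleted save: ExpiresAt=%d (non-zero)", item.ExpiresAt())
		return nil
	})

	r.Deleted = false
	_ = saveWithOptionalTTL(db, r, 10*time.Minute)
	_ = db.View(func(txn *badger.Txn) error {
		item, err := txn.Get([]byte(r.Name))
		if err != nil {
			return err
		}
		log.Printf("undeleted save: ExpiresAt=%d (zero)", item.ExpiresAt())
		return nil
	})
}
```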
g.Expect(item.ExpiresAt()).ToNot(BeZero()) + // check that the resource can be "undeleted" + pipeline.Deleted = false + err = db.save(pipeline) + g.Expect(err).To(BeNil()) + + err = db.db.View(func(txn *badger.Txn) error { + item, err = txn.Get(([]byte(pipeline.Name))) + return err + }) + g.Expect(err).To(BeNil()) + g.Expect(item.ExpiresAt()).To(BeZero()) + err = db.Stop() g.Expect(err).To(BeNil()) } diff --git a/scheduler/pkg/store/pipeline/store.go b/scheduler/pkg/store/pipeline/store.go index cd08ed553a..36ddc16405 100644 --- a/scheduler/pkg/store/pipeline/store.go +++ b/scheduler/pkg/store/pipeline/store.go @@ -448,7 +448,6 @@ func (ps *PipelineStore) handleModelEvents(event coordinator.ModelEventMsg) { func (ps *PipelineStore) cleanupDeletedPipelines() { ps.mu.Lock() defer ps.mu.Unlock() - ps.logger.Info("cleaning up deleted pipelines") for _, pipeline := range ps.pipelines { if pipeline.Deleted { if pipeline.DeletedAt.IsZero() { @@ -461,6 +460,7 @@ func (ps *PipelineStore) cleanupDeletedPipelines() { } } else if pipeline.DeletedAt.Add(ps.db.deletedResourceTTL).Before(time.Now()) { delete(ps.pipelines, pipeline.Name) + ps.logger.Info("cleaning up deleted pipeline: %s", pipeline.Name) } } } diff --git a/scheduler/pkg/util/constants.go b/scheduler/pkg/util/constants.go index 971c4a860c..ea837db8ce 100644 --- a/scheduler/pkg/util/constants.go +++ b/scheduler/pkg/util/constants.go @@ -36,7 +36,14 @@ const ( const ( EnvoyUpdateDefaultBatchWait = 250 * time.Millisecond - ClientKeapAliveTime = 60 * time.Second - ClientKeapAliveTimeout = 2 * time.Second - ClientKeapAlivePermit = true + // note that we keep client and server keepalive times the same + // they need to match counterparts in controller client: operator/scheduler/client.go + // and dataflow-engine: data-flow/src/main/kotlin/io/seldon/dataflow/PipelineSubscriber.kt + gRPCKeepAliveTime = 60 * time.Second + clientKeepAliveTimeout = 2 * time.Second + gRPCKeepAlivePermit = false + MaxGRPCRetriesOnStream = 100 // this is at the grpc library level + backOffExpMaxElapsedTime = 0 // Never stop due to large time between calls + backOffExpMaxInterval = time.Second * 15 + backOffExpInitialInterval = time.Second ) diff --git a/scheduler/pkg/util/grpc.go b/scheduler/pkg/util/grpc.go new file mode 100644 index 0000000000..96aed017da --- /dev/null +++ b/scheduler/pkg/util/grpc.go @@ -0,0 +1,38 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed by +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. 
+*/ + +package util + +import ( + "github.com/cenkalti/backoff/v4" + "google.golang.org/grpc/keepalive" +) + +func GetClientKeepAliveParameters() keepalive.ClientParameters { + return keepalive.ClientParameters{ + Time: gRPCKeepAliveTime, + Timeout: clientKeepAliveTimeout, + PermitWithoutStream: gRPCKeepAlivePermit, + } +} + +func GetServerKeepAliveEnforcementPolicy() keepalive.EnforcementPolicy { + return keepalive.EnforcementPolicy{ + MinTime: gRPCKeepAliveTime, + PermitWithoutStream: gRPCKeepAlivePermit, + } +} + +func GetClientExponentialBackoff() *backoff.ExponentialBackOff { + backOffExp := backoff.NewExponentialBackOff() + backOffExp.MaxElapsedTime = backOffExpMaxElapsedTime + backOffExp.MaxInterval = backOffExpMaxInterval + backOffExp.InitialInterval = backOffExpInitialInterval + return backOffExp +} From e85a6a7a1e502d37de43791cfd27fc1bbbdc2f48 Mon Sep 17 00:00:00 2001 From: Niall D <4562759+driev@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:43:09 +0000 Subject: [PATCH 4/4] Revert "merge v2" This reverts commit 4cf864426a0b2d58bf3f67062f11600ffbd1904c. --- docs-gb/architecture/README.md | 58 ++--- docs-gb/examples/pipeline-examples.md | 1 - docs-gb/kubernetes/service-meshes/traefik.md | 234 +----------------- operator/scheduler/client.go | 33 +-- operator/scheduler/control_plane.go | 4 +- operator/scheduler/experiment.go | 12 +- operator/scheduler/model.go | 12 +- operator/scheduler/pipeline.go | 12 +- operator/scheduler/server.go | 8 +- scheduler/config/envoy.yaml | 7 +- .../src/main/kotlin/io/seldon/dataflow/Cli.kt | 23 +- .../main/kotlin/io/seldon/dataflow/Main.kt | 4 +- .../io/seldon/dataflow/PipelineSubscriber.kt | 4 - .../src/main/resources/local.properties | 1 - .../test/kotlin/io/seldon/dataflow/CliTest.kt | 31 --- scheduler/pkg/agent/client.go | 12 +- .../agent/modelserver_controlplane/oip/v2.go | 8 + scheduler/pkg/agent/server.go | 18 +- scheduler/pkg/agent/server_test.go | 142 +---------- scheduler/pkg/envoy/server/server.go | 4 - scheduler/pkg/kafka/dataflow/server.go | 28 +-- scheduler/pkg/kafka/dataflow/server_test.go | 136 +--------- scheduler/pkg/kafka/gateway/client.go | 29 +-- scheduler/pkg/kafka/gateway/worker.go | 8 + scheduler/pkg/kafka/pipeline/status/client.go | 19 +- scheduler/pkg/server/server.go | 5 - scheduler/pkg/store/experiment/db_test.go | 14 +- scheduler/pkg/store/experiment/store.go | 2 +- scheduler/pkg/store/pipeline/db_test.go | 12 - scheduler/pkg/store/pipeline/store.go | 2 +- scheduler/pkg/util/constants.go | 13 +- scheduler/pkg/util/grpc.go | 38 --- 32 files changed, 126 insertions(+), 808 deletions(-) delete mode 100644 scheduler/pkg/util/grpc.go diff --git a/docs-gb/architecture/README.md b/docs-gb/architecture/README.md index bb73260360..48247cd482 100644 --- a/docs-gb/architecture/README.md +++ b/docs-gb/architecture/README.md @@ -1,62 +1,36 @@ # Architecture -Seldon Core 2 uses a microservice architecture where each service has limited and well-defined responsibilities working together to orchestrate scalable and fault-tolerant ML serving and management. These components communicate internally using gRPC and they can be scaled independently. Seldon Core 2 services can be split into two categories: - -* **Control Plane** services are responsible for managing the operations and configurations of your ML models and workflows. This includes functionality to instantiate new inference servers, load models, update new versions of models, configure model experiments and pipelines, and expose endpoints that may receive inference requests. 
The main control plane component is the **Scheduler** that is responsible for managing the loading and unloading of resources (models, pipelines, experiments) onto the respective components. - -* **Data Plane** services are responsible for managing the flow of data between components or models. Core 2 supports REST and gRPC payloads that follow the Open Inference Protocol (OIP). The main data plane service is **Envoy**, which acts as a single ingress for all data plane load and routes data to the relevant servers internally (e.g. Seldon MLServer or NVidia Triton pods). - -{% hint style="info" %} -**Note**: Because Core 2 architecture separates control plane and data plane responsibilities, when control plane services are down (e.g. the Scheduler), data plane inference can still be served. In this manner the system is more resilient to failures. For example, an outage of control plane services does not impact the ability of the system to respond to end user traffic. Core 2 can be provisioned to be **highly available** on the data plane path. -{% endhint %} - - -The current set of services used in Seldon Core 2 is shown below. Following the diagram, we will describe each control plane and data plan service. +The current set of components used in Seldon Core 2 is shown below: ![architecture](../images/architecture.png) -## Control Plane - -### Scheduler -This service manages the loading and unloading of Models, Pipelines and Experiments on the relevant micro services. It is also responsible for matching Models with available Servers in a way that optimises infrastructure use. In the current design we can only have _one_ instance of the Scheduler as its internal state is persisted on disk. +The core components are: -When the Scheduler (re)starts there is a synchronisation flow to coordinate the startup process and to attempt to wait for expected Model Servers to connect before proceeding with control plane operations. This is important so that ongoing data plane operations are not interrupted. This then introduces a delay on any control plane operations until the process has finished (including control plan resources status updates). This synchronisation process has a timeout, which has a default of 10 minutes. It can be changed by setting helm seldon-core-v2-components value `scheduler.schedulerReadyTimeoutSeconds`. +* Scheduler : manages the load and unload of models, pipelines, explainers and experiments. +* Pipeline gateway : handles REST/gRPC calls to pipelines. +* Dataflow engine : handles the flow of data between components in a pipeline. +* Model gateway : handles the flow of data from models to inference requests on servers and passes on the responses. +* Agent : manages the loading and unloading of models on a server and access to the server over REST/gRPC. +* Envoy : manages the proxying of requests to the correct servers including load balancing. -### Agent -This service manages the loading and unloading of models on a server and access to the server over REST/gRPC. It acts as a reverse proxy to connect end users with the actual Model Servers. In this way the system collects stats and metrics about data plane inferences that helps with observability and scaling. +All the above are Kubernetes agnostic and can run locally, e.g. on Docker Compose. -### Controller -We also provide a Kubernetes Operator to allow Kubernetes usage. This is implemented in the Controller Manager microservice, which manages CRD reconciliation with Scheduler. 
Currently Core 2 supports _one_ instance of the Controller. +We also provide a Kubernetes Operator to allow Kubernetes usage. -{% hint style="info" %} -**Note**: All services besides the Controller are Kubernetes agnostic and can run locally, e.g. on Docker Compose. -{% endhint %} +Kafka is used as the backbone for Pipelines allowing a decentralized, synchronous and asynchronous usage. -## Data Plane +## Kafka -### Pipeline Gateway -This service handles REST/gRPC calls to Pipelines. It translates between synchronous requests to Kafka operations, producing a message on the relevant input topic for a Pipeline and consuming from the output topic to return inference results back to the users. +Kafka is used as the backbone for allowing Pipelines of Models to be connected together into arbitrary directed acyclic graphs. Models can be reused in different Pipelines. The flow of data between models is handled by the dataflow engine using [KStreams](https://docs.confluent.io/platform/current/streams/concepts.html). -### Model Gateway -This service handles the flow of data from models to inference requests on servers and passes on the responses via Kafka. - -### Dataflow Engine -This service handles the flow of data between components in a pipeline, using Kafka Streams. It enables Core 2 to chain and join Models together to provide complex Pipelines. - -### Envoy -This service manages the proxying of requests to the correct servers including load balancing. +![kafka](../images/kafka.png) -## Dataflow Architecture and Pipelines +## Dataflow Architecture -To support the movement towards data centric machine learning Seldon Core 2 follows a dataflow paradigm. By taking a decentralized route that focuses on the flow of data, users can have more flexibility and insight as they build and manage complex AI applications in production. This contrasts with more centralized orchestration approaches where data is secondary. +Seldon V2 follows a dataflow design paradigm and it's part of the current movement for data centric machine learning. By taking a decentralized route that focuses on the flow of data users can have more flexibility and insight in building complex applications containing machine learning and traditional components. This contrasts with a more centralized orchestration more traditional in service orientated architectures. ![dataflow](../images/dataflow.png) -### Kafka -Kafka is used as the backbone for Pipelines allowing decentralized, synchronous and asynchronous usage. This enables Models to be connected together into arbitrary directed acyclic graphs. Models can be reused in different Pipelines. The flow of data between models is handled by the dataflow engine using [Kafka Streams](https://docs.confluent.io/platform/current/streams/concepts.html). - -![kafka](../images/kafka.png) - By focusing on the data we allow users to join various flows together using stream joining concepts as shown below. 
![joins](../images/joins.png) diff --git a/docs-gb/examples/pipeline-examples.md b/docs-gb/examples/pipeline-examples.md index 647838e37f..55f9a94ea1 100644 --- a/docs-gb/examples/pipeline-examples.md +++ b/docs-gb/examples/pipeline-examples.md @@ -31,7 +31,6 @@ spec: requirements: - tensorflow memory: 100Ki ---- apiVersion: mlops.seldon.io/v1alpha1 kind: Model metadata: diff --git a/docs-gb/kubernetes/service-meshes/traefik.md b/docs-gb/kubernetes/service-meshes/traefik.md index 2b2f1cf8df..472d6799eb 100644 --- a/docs-gb/kubernetes/service-meshes/traefik.md +++ b/docs-gb/kubernetes/service-meshes/traefik.md @@ -11,239 +11,15 @@ We will run through some examples as shown in the notebook `service-meshes/traef * Traefik IngressRoute * Traefik Middleware for adding a header -```yaml -apiVersion: v1 -kind: Service -metadata: - name: myapps - namespace: seldon-mesh -spec: - ports: - - name: web - port: 80 - protocol: TCP - selector: - app: traefik-ingress-lb - type: LoadBalancer ---- -apiVersion: mlops.seldon.io/v1alpha1 -kind: Model -metadata: - name: iris - namespace: seldon-mesh -spec: - requirements: - - sklearn - storageUri: gs://seldon-models/mlserver/iris ---- -apiVersion: traefik.containo.us/v1alpha1 -kind: IngressRoute -metadata: - name: iris - namespace: seldon-mesh -spec: - entryPoints: - - web - routes: - - kind: Rule - match: PathPrefix(`/`) - middlewares: - - name: iris-header - services: - - name: seldon-mesh - port: 80 - scheme: h2c ---- -apiVersion: traefik.containo.us/v1alpha1 -kind: Middleware -metadata: - name: iris-header - namespace: seldon-mesh -spec: - headers: - customRequestHeaders: - seldon-model: iris +```{literalinclude} ../../../../../../service-meshes/traefik/static/single-model.yaml +:language: yaml ``` ## Traffic Split -{% hint style="warning" %} -**Warning** Traffic splitting does not presently work due to this [issue](https://github.com/emissary-ingress/emissary/issues/4062). We recommend you use a Seldon Experiment instead. -{% endhint %} - -## Traefik Examples - -Assumes - - * You have installed Traefik as per their [docs](https://doc.traefik.io/traefik/getting-started/install-traefik/#use-the-helm-chart) into namespace traefik-v2 - - Tested with traefik-10.19.4 - - - -```python -INGRESS_IP=!kubectl get svc traefik -n traefik-v2 -o jsonpath='{.status.loadBalancer.ingress[0].ip}' -INGRESS_IP=INGRESS_IP[0] -import os -os.environ['INGRESS_IP'] = INGRESS_IP -INGRESS_IP +```{warning} +Traffic splitting does not presently work due to this [issue](https://github.com/emissary-ingress/emissary/issues/4062). We recommend you use a Seldon Experiment instead. 
``` - - - - '172.21.255.1' - - - -### Traefik Single Model Example - - -```python -!kustomize build config/single-model +```{include} ../../../../../../service-meshes/traefik/README.md ``` - - apiVersion: v1 - kind: Service - metadata: - name: myapps - namespace: seldon-mesh - spec: - ports: - - name: web - port: 80 - protocol: TCP - selector: - app: traefik-ingress-lb - type: LoadBalancer - --- - apiVersion: mlops.seldon.io/v1alpha1 - kind: Model - metadata: - name: iris - namespace: seldon-mesh - spec: - requirements: - - sklearn - storageUri: gs://seldon-models/mlserver/iris - --- - apiVersion: traefik.containo.us/v1alpha1 - kind: IngressRoute - metadata: - name: iris - namespace: seldon-mesh - spec: - entryPoints: - - web - routes: - - kind: Rule - match: PathPrefix(`/`) - middlewares: - - name: iris-header - services: - - name: seldon-mesh - port: 80 - scheme: h2c - --- - apiVersion: traefik.containo.us/v1alpha1 - kind: Middleware - metadata: - name: iris-header - namespace: seldon-mesh - spec: - headers: - customRequestHeaders: - seldon-model: iris - - - -```python -!kustomize build config/single-model | kubectl apply -f - -``` - - service/myapps created - model.mlops.seldon.io/iris created - ingressroute.traefik.containo.us/iris created - middleware.traefik.containo.us/iris-header created - - - -```python -!kubectl wait --for condition=ready --timeout=300s model --all -n seldon-mesh -``` - - model.mlops.seldon.io/iris condition met - - - -```python -!curl -v http://${INGRESS_IP}/v2/models/iris/infer -H "Content-Type: application/json" \ - -d '{"inputs": [{"name": "predict", "shape": [1, 4], "datatype": "FP32", "data": [[1, 2, 3, 4]]}]}' -``` - - * Trying 172.21.255.1... - * Connected to 172.21.255.1 (172.21.255.1) port 80 (#0) - > POST /v2/models/iris/infer HTTP/1.1 - > Host: 172.21.255.1 - > User-Agent: curl/7.47.0 - > Accept: */* - > Content-Type: application/json - > Content-Length: 94 - > - * upload completely sent off: 94 out of 94 bytes - < HTTP/1.1 200 OK - < Content-Length: 196 - < Content-Type: application/json - < Date: Sat, 16 Apr 2022 15:53:27 GMT - < Seldon-Route: iris_1 - < Server: envoy - < X-Envoy-Upstream-Service-Time: 895 - < - * Connection #0 to host 172.21.255.1 left intact - {"model_name":"iris_1","model_version":"1","id":"0dccf477-78fa-4a11-92ff-4d7e4f1cdda8","parameters":null,"outputs":[{"name":"predict","shape":[1],"datatype":"INT64","parameters":null,"data":[2]}]} - - -```python -!grpcurl -d '{"model_name":"iris","inputs":[{"name":"input","contents":{"fp32_contents":[1,2,3,4]},"datatype":"FP32","shape":[1,4]}]}' \ - -plaintext \ - -import-path ../../apis \ - -proto ../../apis/mlops/v2_dataplane/v2_dataplane.proto \ - ${INGRESS_IP}:80 inference.GRPCInferenceService/ModelInfer -``` - - { - "modelName": "iris_1", - "modelVersion": "1", - "outputs": [ - { - "name": "predict", - "datatype": "INT64", - "shape": [ - "1" - ], - "contents": { - "int64Contents": [ - "2" - ] - } - } - ] - } - - - -```python -!kustomize build config/single-model | kubectl delete -f - -``` - - service "myapps" deleted - model.mlops.seldon.io "iris" deleted - ingressroute.traefik.containo.us "iris" deleted - middleware.traefik.containo.us "iris-header" deleted - - - -```python - -``` - diff --git a/operator/scheduler/client.go b/operator/scheduler/client.go index 98492d31c4..a2d0142ade 100644 --- a/operator/scheduler/client.go +++ b/operator/scheduler/client.go @@ -33,16 +33,11 @@ import ( const ( // these 2 constants in combination with the backoff exponential function will give us a max 
backoff of 13.5 minutes - schedulerConnectMaxRetries = 100 - schedulerConnectBackoffScalar = 200 * time.Millisecond - // these keep alive settings need to match the scheduler counterpart in scheduler/pkg/util/constants.go - clientKeepAliveTime = 60 * time.Second - clientKeepAliveTimeout = 2 * time.Second - clientKeepAlivePermit = false - // backoff - backoffMaxElapsedTime = 0 // Never stop due to large time between calls - backOffMaxInterval = time.Second * 15 - backOffInitialInterval = time.Second + SchedulerConnectMaxRetries = 12 + SchedulerConnectBackoffScalar = 200 * time.Millisecond + ClientKeapAliveTime = 60 * time.Second + ClientKeapAliveTimeout = 2 * time.Second + ClientKeapAlivePermit = true ) type SchedulerClient struct { @@ -234,9 +229,9 @@ func (s *SchedulerClient) connectToScheduler(host string, namespace string, plai } } kacp := keepalive.ClientParameters{ - Time: clientKeepAliveTime, - Timeout: clientKeepAliveTimeout, - PermitWithoutStream: clientKeepAlivePermit, + Time: ClientKeapAliveTime, + Timeout: ClientKeapAliveTimeout, + PermitWithoutStream: ClientKeapAlivePermit, } retryOpts := []grpc_retry.CallOption{ @@ -254,7 +249,7 @@ func (s *SchedulerClient) connectToScheduler(host string, namespace string, plai opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials())) s.logger.Info("Running scheduler client in plain text mode", "port", port) } - // we dont have backoff retry on the grpc streams as we handle this in the event handlers + opts = append(opts, grpc.WithStreamInterceptor(grpc_retry.StreamClientInterceptor(retryOpts...))) opts = append(opts, grpc.WithUnaryInterceptor(grpc_retry.UnaryClientInterceptor(retryOpts...))) opts = append(opts, grpc.WithKeepaliveParams(kacp)) s.logger.Info("Dialing scheduler", "host", host, "port", port) @@ -318,7 +313,7 @@ func retryFn( logFailure := func(err error, delay time.Duration) { logger.Error(err, "Scheduler not ready") } - backOffExp := getClientExponentialBackoff() + backOffExp := backoff.NewExponentialBackOff() fnWithArgs := func() error { grpcClient := scheduler.NewSchedulerClient(conn) return fn(context.Background(), grpcClient, namespace) @@ -330,11 +325,3 @@ func retryFn( } return nil } - -func getClientExponentialBackoff() *backoff.ExponentialBackOff { - backOffExp := backoff.NewExponentialBackOff() - backOffExp.MaxElapsedTime = backoffMaxElapsedTime - backOffExp.MaxInterval = backOffMaxInterval - backOffExp.InitialInterval = backOffInitialInterval - return backOffExp -} diff --git a/operator/scheduler/control_plane.go b/operator/scheduler/control_plane.go index 9834cfea98..9dbf18a590 100644 --- a/operator/scheduler/control_plane.go +++ b/operator/scheduler/control_plane.go @@ -31,8 +31,8 @@ func (s *SchedulerClient) SubscribeControlPlaneEvents(ctx context.Context, grpcC stream, err := grpcClient.SubscribeControlPlane( ctx, &scheduler.ControlPlaneSubscriptionRequest{SubscriberName: "seldon manager"}, - grpc_retry.WithMax(schedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), ) if err != nil { return err diff --git a/operator/scheduler/experiment.go b/operator/scheduler/experiment.go index 50be2e6cf8..c2b4275800 100644 --- a/operator/scheduler/experiment.go +++ b/operator/scheduler/experiment.go @@ -43,8 +43,8 @@ func (s *SchedulerClient) StartExperiment(ctx context.Context, experiment *v1alp _, err = 
grpcClient.StartExperiment( ctx, req, - grpc_retry.WithMax(schedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), ) return s.checkErrorRetryable(experiment.Kind, experiment.Name, err), err } @@ -66,8 +66,8 @@ func (s *SchedulerClient) StopExperiment(ctx context.Context, experiment *v1alph _, err = grpcClient.StopExperiment( ctx, req, - grpc_retry.WithMax(schedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), ) return s.checkErrorRetryable(experiment.Kind, experiment.Name, err), err } @@ -79,8 +79,8 @@ func (s *SchedulerClient) SubscribeExperimentEvents(ctx context.Context, grpcCli stream, err := grpcClient.SubscribeExperimentStatus( ctx, &scheduler.ExperimentSubscriptionRequest{SubscriberName: "seldon manager"}, - grpc_retry.WithMax(schedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), ) if err != nil { return err diff --git a/operator/scheduler/model.go b/operator/scheduler/model.go index f49217be38..767aabd84f 100644 --- a/operator/scheduler/model.go +++ b/operator/scheduler/model.go @@ -60,8 +60,8 @@ func (s *SchedulerClient) LoadModel(ctx context.Context, model *v1alpha1.Model, _, err = grpcClient.LoadModel( ctx, &loadModelRequest, - grpc_retry.WithMax(schedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), ) if err != nil { return s.checkErrorRetryable(model.Kind, model.Name, err), err @@ -102,8 +102,8 @@ func (s *SchedulerClient) UnloadModel(ctx context.Context, model *v1alpha1.Model _, err = grpcClient.UnloadModel( ctx, modelRef, - grpc_retry.WithMax(schedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), ) if err != nil { return s.checkErrorRetryable(model.Kind, model.Name, err), err @@ -117,8 +117,8 @@ func (s *SchedulerClient) SubscribeModelEvents(ctx context.Context, grpcClient s stream, err := grpcClient.SubscribeModelStatus( ctx, &scheduler.ModelSubscriptionRequest{SubscriberName: "seldon manager"}, - grpc_retry.WithMax(schedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), ) if err != nil { return err diff --git a/operator/scheduler/pipeline.go b/operator/scheduler/pipeline.go index fce4af998c..586e84103d 100644 --- a/operator/scheduler/pipeline.go +++ b/operator/scheduler/pipeline.go @@ -42,8 +42,8 @@ func (s *SchedulerClient) LoadPipeline(ctx context.Context, pipeline *v1alpha1.P _, err = grpcClient.LoadPipeline( ctx, &req, - grpc_retry.WithMax(schedulerConnectMaxRetries), - 
grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), ) return s.checkErrorRetryable(pipeline.Kind, pipeline.Name, err), err } @@ -62,8 +62,8 @@ func (s *SchedulerClient) UnloadPipeline(ctx context.Context, pipeline *v1alpha1 _, err = grpcClient.UnloadPipeline( ctx, &req, - grpc_retry.WithMax(schedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), ) if err != nil { return err, s.checkErrorRetryable(pipeline.Kind, pipeline.Name, err) @@ -85,8 +85,8 @@ func (s *SchedulerClient) SubscribePipelineEvents(ctx context.Context, grpcClien stream, err := grpcClient.SubscribePipelineStatus( ctx, &scheduler.PipelineSubscriptionRequest{SubscriberName: "seldon manager"}, - grpc_retry.WithMax(schedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), ) if err != nil { return err diff --git a/operator/scheduler/server.go b/operator/scheduler/server.go index 8ecf39bf17..af1f88fc19 100644 --- a/operator/scheduler/server.go +++ b/operator/scheduler/server.go @@ -64,8 +64,8 @@ func (s *SchedulerClient) ServerNotify(ctx context.Context, grpcClient scheduler _, err := grpcClient.ServerNotify( ctx, request, - grpc_retry.WithMax(schedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), ) if err != nil { logger.Error(err, "Failed to send notify server to scheduler") @@ -82,8 +82,8 @@ func (s *SchedulerClient) SubscribeServerEvents(ctx context.Context, grpcClient stream, err := grpcClient.SubscribeServerStatus( ctx, &scheduler.ServerSubscriptionRequest{SubscriberName: "seldon manager"}, - grpc_retry.WithMax(schedulerConnectMaxRetries), - grpc_retry.WithBackoff(grpc_retry.BackoffExponential(schedulerConnectBackoffScalar)), + grpc_retry.WithMax(SchedulerConnectMaxRetries), + grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), ) if err != nil { return err diff --git a/scheduler/config/envoy.yaml b/scheduler/config/envoy.yaml index fba6fa0de1..5275cc8b15 100644 --- a/scheduler/config/envoy.yaml +++ b/scheduler/config/envoy.yaml @@ -11,12 +11,7 @@ static_resources: socket_address: address: seldon-scheduler port_value: 9002 - http2_protocol_options: { - connection_keepalive: { - interval: 60s, - timeout: 2s, - } - } + http2_protocol_options: {} name: xds_cluster - connect_timeout: 0.250s type: LOGICAL_DNS diff --git a/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Cli.kt b/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Cli.kt index 94a5e393a5..1e5485aaea 100644 --- a/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Cli.kt +++ b/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Cli.kt @@ -11,7 +11,6 @@ package io.seldon.dataflow import com.natpryce.konfig.CommandLineOption import com.natpryce.konfig.Configuration -import com.natpryce.konfig.ConfigurationMap import com.natpryce.konfig.ConfigurationProperties import 
com.natpryce.konfig.EnvironmentVariables import com.natpryce.konfig.Key @@ -26,8 +25,6 @@ import io.klogging.Level import io.klogging.noCoLogger import io.seldon.dataflow.kafka.security.KafkaSaslMechanisms import io.seldon.dataflow.kafka.security.KafkaSecurityProtocols -import java.net.InetAddress -import java.util.UUID object Cli { private const val ENV_VAR_PREFIX = "SELDON_" @@ -37,7 +34,6 @@ object Cli { val logLevelApplication = Key("log.level.app", enumType(*Level.values())) val logLevelKafka = Key("log.level.kafka", enumType(*Level.values())) val namespace = Key("pod.namespace", stringType) - val dataflowReplicaId = Key("dataflow.replica.id", stringType) // Seldon components val upstreamHost = Key("upstream.host", stringType) @@ -79,7 +75,6 @@ object Cli { logLevelApplication, logLevelKafka, namespace, - dataflowReplicaId, upstreamHost, upstreamPort, kafkaBootstrapServers, @@ -110,26 +105,10 @@ object Cli { fun configWith(rawArgs: Array): Configuration { val fromProperties = ConfigurationProperties.fromResource("local.properties") - val fromSystem = getSystemConfig() val fromEnv = EnvironmentVariables(prefix = ENV_VAR_PREFIX) val fromArgs = parseArguments(rawArgs) - return fromArgs overriding fromEnv overriding fromSystem overriding fromProperties - } - - private fun getSystemConfig(): Configuration { - val dataflowIdPair = this.dataflowReplicaId to getNewDataflowId() - return ConfigurationMap(dataflowIdPair) - } - - fun getNewDataflowId(assignRandomUuid: Boolean = false): String { - if (!assignRandomUuid) { - try { - return InetAddress.getLocalHost().hostName - } catch (_: Exception) { - } - } - return "seldon-dataflow-engine-" + UUID.randomUUID().toString() + return fromArgs overriding fromEnv overriding fromProperties } private fun parseArguments(rawArgs: Array): Configuration { diff --git a/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Main.kt b/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Main.kt index b064a974f5..8d4a899eaa 100644 --- a/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Main.kt +++ b/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/Main.kt @@ -102,11 +102,9 @@ object Main { describeRetries = config[Cli.topicDescribeRetries], describeRetryDelayMillis = config[Cli.topicDescribeRetryDelayMillis], ) - val subscriberId = config[Cli.dataflowReplicaId] - val subscriber = PipelineSubscriber( - subscriberId, + "seldon-dataflow-engine", kafkaProperties, kafkaAdminProperties, kafkaStreamsParams, diff --git a/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/PipelineSubscriber.kt b/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/PipelineSubscriber.kt index b9a2c16a30..f1d3de4066 100644 --- a/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/PipelineSubscriber.kt +++ b/scheduler/data-flow/src/main/kotlin/io/seldon/dataflow/PipelineSubscriber.kt @@ -37,7 +37,6 @@ import kotlinx.coroutines.flow.onCompletion import kotlinx.coroutines.flow.onEach import kotlinx.coroutines.runBlocking import java.util.concurrent.ConcurrentHashMap -import java.util.concurrent.TimeUnit import io.klogging.logger as coLogger @OptIn(FlowPreview::class) @@ -61,9 +60,6 @@ class PipelineSubscriber( .defaultServiceConfig(grpcServiceConfig) .usePlaintext() // Use TLS .enableRetry() - // these keep alive settings need to match the go counterpart in scheduler/pkg/util/constants.go - .keepAliveTime(60L, TimeUnit.SECONDS) - .keepAliveTimeout(2L, TimeUnit.SECONDS) .build() private val client = ChainerGrpcKt.ChainerCoroutineStub(channel) diff --git 
a/scheduler/data-flow/src/main/resources/local.properties b/scheduler/data-flow/src/main/resources/local.properties index 68b3bd408d..46a7218380 100644 --- a/scheduler/data-flow/src/main/resources/local.properties +++ b/scheduler/data-flow/src/main/resources/local.properties @@ -1,6 +1,5 @@ log.level.app=INFO log.level.kafka=WARN -dataflow.replica.id=seldon-dataflow-engine kafka.bootstrap.servers=localhost:9092 kafka.consumer.prefix= kafka.security.protocol=PLAINTEXT diff --git a/scheduler/data-flow/src/test/kotlin/io/seldon/dataflow/CliTest.kt b/scheduler/data-flow/src/test/kotlin/io/seldon/dataflow/CliTest.kt index 52a97fa4b5..9011ff3d4c 100644 --- a/scheduler/data-flow/src/test/kotlin/io/seldon/dataflow/CliTest.kt +++ b/scheduler/data-flow/src/test/kotlin/io/seldon/dataflow/CliTest.kt @@ -16,15 +16,9 @@ import org.junit.jupiter.params.provider.Arguments import org.junit.jupiter.params.provider.Arguments.arguments import org.junit.jupiter.params.provider.MethodSource import strikt.api.expectCatching -import strikt.api.expectThat -import strikt.assertions.hasLength import strikt.assertions.isEqualTo -import strikt.assertions.isNotEqualTo import strikt.assertions.isSuccess -import strikt.assertions.startsWith -import java.util.UUID import java.util.stream.Stream -import kotlin.test.Test internal class CliTest { @DisplayName("Passing auth mechanism via cli argument") @@ -42,31 +36,6 @@ internal class CliTest { .isEqualTo(expectedMechanism) } - @Test - fun `should handle dataflow replica id`() { - val cliDefault = Cli.configWith(arrayOf()) - val testReplicaId = "dataflow-id-1" - val cli = Cli.configWith(arrayOf("--dataflow-replica-id", testReplicaId)) - - expectThat(cliDefault[Cli.dataflowReplicaId]) { - isNotEqualTo("seldon-dataflow-engine") - } - expectThat(cli[Cli.dataflowReplicaId]) { - isEqualTo(testReplicaId) - } - - // test random Uuid (v4) - val expectedReplicaIdPrefix = "seldon-dataflow-engine-" - val uuidStringLength = 36 - val randomReplicaUuid = Cli.getNewDataflowId(true) - expectThat(randomReplicaUuid) { - startsWith(expectedReplicaIdPrefix) - hasLength(expectedReplicaIdPrefix.length + uuidStringLength) - } - expectCatching { UUID.fromString(randomReplicaUuid.removePrefix(expectedReplicaIdPrefix)) } - .isSuccess() - } - companion object { @JvmStatic private fun saslMechanisms(): Stream { diff --git a/scheduler/pkg/agent/client.go b/scheduler/pkg/agent/client.go index 79a388266d..7698df52ab 100644 --- a/scheduler/pkg/agent/client.go +++ b/scheduler/pkg/agent/client.go @@ -24,6 +24,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/keepalive" "google.golang.org/protobuf/encoding/protojson" "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" @@ -236,7 +237,8 @@ func (c *Client) Start() error { logFailure := func(err error, delay time.Duration) { c.logger.WithError(err).Errorf("Scheduler not ready") } - backOffExp := util.GetClientExponentialBackoff() + backOffExp := backoff.NewExponentialBackOff() + backOffExp.MaxElapsedTime = 0 // Never stop due to large time between calls err := backoff.RetryNotify(c.StartService, backOffExp, logFailure) if err != nil { c.logger.WithError(err).Fatal("Failed to start client") @@ -415,7 +417,11 @@ func (c *Client) getConnection(host string, plainTxtPort int, tlsPort int) (*grp logger.Infof("Connecting (non-blocking) to scheduler at %s:%d", host, port) - kacp := util.GetClientKeepAliveParameters() + kacp := keepalive.ClientParameters{ + Time: 
util.ClientKeapAliveTime, + Timeout: util.ClientKeapAliveTimeout, + PermitWithoutStream: util.ClientKeapAlivePermit, + } opts := []grpc.DialOption{ grpc.WithTransportCredentials(transCreds), @@ -447,7 +453,7 @@ func (c *Client) StartService() error { Shared: true, AvailableMemoryBytes: c.stateManager.GetAvailableMemoryBytesWithOverCommit(), }, - grpc_retry.WithMax(util.MaxGRPCRetriesOnStream), + grpc_retry.WithMax(1), ) // TODO make configurable if err != nil { return err diff --git a/scheduler/pkg/agent/modelserver_controlplane/oip/v2.go b/scheduler/pkg/agent/modelserver_controlplane/oip/v2.go index 39fad22bf0..136de4e171 100644 --- a/scheduler/pkg/agent/modelserver_controlplane/oip/v2.go +++ b/scheduler/pkg/agent/modelserver_controlplane/oip/v2.go @@ -19,6 +19,7 @@ import ( "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/keepalive" "google.golang.org/grpc/status" v2 "github.com/seldonio/seldon-core/apis/go/v2/mlops/v2_dataplane" @@ -50,12 +51,19 @@ func CreateV2GrpcConnection(v2Config V2Config) (*grpc.ClientConn, error) { grpc_retry.WithMax(v2Config.GRPRetryMaxCount), } + kacp := keepalive.ClientParameters{ + Time: util.ClientKeapAliveTime, + Timeout: util.ClientKeapAliveTimeout, + PermitWithoutStream: util.ClientKeapAlivePermit, + } + opts := []grpc.DialOption{ grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(v2Config.GRPCMaxMsgSizeBytes), grpc.MaxCallSendMsgSize(v2Config.GRPCMaxMsgSizeBytes)), grpc.WithStreamInterceptor(grpc_retry.StreamClientInterceptor(retryOpts...)), grpc.WithUnaryInterceptor(grpc_retry.UnaryClientInterceptor(retryOpts...)), grpc.WithStatsHandler(otelgrpc.NewClientHandler()), + grpc.WithKeepaliveParams(kacp), } conn, err := grpc.NewClient(fmt.Sprintf("%s:%d", v2Config.Host, v2Config.GRPCPort), opts...) if err != nil { diff --git a/scheduler/pkg/agent/server.go b/scheduler/pkg/agent/server.go index 6744916ee9..5388b8d552 100644 --- a/scheduler/pkg/agent/server.go +++ b/scheduler/pkg/agent/server.go @@ -112,7 +112,6 @@ type Server struct { certificateStore *seldontls.CertificateStore waiter *modelRelocatedWaiter // waiter for when we want to drain a particular server replica autoscalingServiceEnabled bool - agentMutex sync.Map // to force a serial order per agent (serverName, replicaIdx) } type SchedulerAgent interface { @@ -139,7 +138,6 @@ func NewAgentServer( scheduler: scheduler, waiter: newModelRelocatedWaiter(), autoscalingServiceEnabled: autoscalingServiceEnabled, - agentMutex: sync.Map{}, } hub.RegisterModelEventHandler( @@ -167,16 +165,12 @@ func (s *Server) startServer(port uint, secure bool) error { if err != nil { return err } - - kaep := util.GetServerKeepAliveEnforcementPolicy() - opts := []grpc.ServerOption{} if secure { opts = append(opts, grpc.Creds(s.certificateStore.CreateServerTransportCredentials())) } opts = append(opts, grpc.MaxConcurrentStreams(grpcMaxConcurrentStreams)) opts = append(opts, grpc.StatsHandler(otelgrpc.NewServerHandler())) - opts = append(opts, grpc.KeepaliveEnforcementPolicy(kaep)) grpcServer := grpc.NewServer(opts...) 
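Side note on the keepalive hunks above (agent client, OIP v2 connection, agent server): the shared util helpers are replaced by inline keepalive.ClientParameters on each client, while the matching server-side enforcement policy is dropped. A minimal sketch of the resulting client dial pattern, assuming a placeholder address and function name (the values mirror the constants in this patch):

package sketch

import (
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/keepalive"
)

// Sketch only: dial a placeholder scheduler address with the inline keepalive
// settings used throughout this patch (60s ping interval, 2s ack timeout,
// pings permitted without active streams).
func dialScheduler() (*grpc.ClientConn, error) {
	kacp := keepalive.ClientParameters{
		Time:                60 * time.Second,
		Timeout:             2 * time.Second,
		PermitWithoutStream: true,
	}
	return grpc.NewClient(
		"seldon-scheduler:9005", // placeholder host:port
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithKeepaliveParams(kacp),
	)
}

As general gRPC behaviour (not specific to this patch): servers reject pings that arrive more often than their keepalive enforcement policy allows (five minutes by default, and none without active streams), so clients pinging this aggressively usually rely on a relaxed server-side keepalive.EnforcementPolicy such as the one these hunks remove.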
pb.RegisterAgentServiceServer(grpcServer, s) s.logger.Printf("Agent server running on %d mtls:%v", port, secure) @@ -385,20 +379,12 @@ func (s *Server) ModelScalingTrigger(stream pb.AgentService_ModelScalingTriggerS func (s *Server) Subscribe(request *pb.AgentSubscribeRequest, stream pb.AgentService_SubscribeServer) error { logger := s.logger.WithField("func", "Subscribe") - key := ServerKey{serverName: request.ServerName, replicaIdx: request.ReplicaIdx} - - // this is forcing a serial order per agent (serverName, replicaIdx) - // in general this will make sure that a given agent disconnects fully before another agent is allowed to connect - mu, _ := s.agentMutex.LoadOrStore(key, &sync.Mutex{}) - mu.(*sync.Mutex).Lock() - defer mu.(*sync.Mutex).Unlock() - logger.Infof("Received subscribe request from %s:%d", request.ServerName, request.ReplicaIdx) fin := make(chan bool) s.mutex.Lock() - s.agents[key] = &AgentSubscriber{ + s.agents[ServerKey{serverName: request.ServerName, replicaIdx: request.ReplicaIdx}] = &AgentSubscriber{ finished: fin, stream: stream, } @@ -424,7 +410,7 @@ func (s *Server) Subscribe(request *pb.AgentSubscribeRequest, stream pb.AgentSer case <-ctx.Done(): logger.Infof("Client replica %s:%d has disconnected", request.ServerName, request.ReplicaIdx) s.mutex.Lock() - delete(s.agents, key) + delete(s.agents, ServerKey{serverName: request.ServerName, replicaIdx: request.ReplicaIdx}) s.mutex.Unlock() s.removeServerReplicaImpl(request.GetServerName(), int(request.GetReplicaIdx())) // this is non-blocking beyond rescheduling models on removed server return nil diff --git a/scheduler/pkg/agent/server_test.go b/scheduler/pkg/agent/server_test.go index 75b6adf56b..088c6e6237 100644 --- a/scheduler/pkg/agent/server_test.go +++ b/scheduler/pkg/agent/server_test.go @@ -10,40 +10,21 @@ the Change License after the Change Date as each is defined in accordance with t package agent import ( - "context" "fmt" - "sync" "testing" "time" . 
"github.com/onsi/gomega" log "github.com/sirupsen/logrus" "google.golang.org/grpc" - "google.golang.org/grpc/credentials/insecure" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pbs "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" - testing_utils "github.com/seldonio/seldon-core/scheduler/v2/pkg/internal/testing_utils" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) -type mockScheduler struct { -} - -var _ scheduler.Scheduler = (*mockScheduler)(nil) - -func (s mockScheduler) Schedule(_ string) error { - return nil -} - -func (s mockScheduler) ScheduleFailedModels() ([]string, error) { - return nil, nil -} - type mockStore struct { models map[string]*store.ModelSnapshot } @@ -110,7 +91,7 @@ func (m *mockStore) UpdateModelState(modelKey string, version uint32, serverKey } func (m *mockStore) AddServerReplica(request *pb.AgentSubscribeRequest) error { - return nil + panic("implement me") } func (m *mockStore) ServerNotify(request *pbs.ServerNotify) error { @@ -118,7 +99,7 @@ func (m *mockStore) ServerNotify(request *pbs.ServerNotify) error { } func (m *mockStore) RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) { - return nil, nil + panic("implement me") } func (m *mockStore) DrainServerReplica(serverName string, replicaIdx int) ([]string, error) { @@ -962,122 +943,3 @@ func TestAutoscalingEnabled(t *testing.T) { } } - -func TestSubscribe(t *testing.T) { - log.SetLevel(log.DebugLevel) - g := NewGomegaWithT(t) - - type ag struct { - id uint32 - doClose bool - } - type test struct { - name string - agents []ag - expectedAgentsCount int - expectedAgentsCountAfterClose int - } - tests := []test{ - { - name: "simple", - agents: []ag{ - {1, true}, {2, true}, - }, - expectedAgentsCount: 2, - expectedAgentsCountAfterClose: 0, - }, - { - name: "simple - no close", - agents: []ag{ - {1, true}, {2, false}, - }, - expectedAgentsCount: 2, - expectedAgentsCountAfterClose: 1, - }, - { - name: "duplicates", - agents: []ag{ - {1, true}, {1, false}, - }, - expectedAgentsCount: 1, - expectedAgentsCountAfterClose: 1, - }, - { - name: "duplicates with all close", - agents: []ag{ - {1, true}, {1, true}, {1, true}, - }, - expectedAgentsCount: 1, - expectedAgentsCountAfterClose: 0, - }, - } - - getStream := func(id uint32, context context.Context, port int) *grpc.ClientConn { - conn, _ := grpc.NewClient(fmt.Sprintf(":%d", port), grpc.WithTransportCredentials(insecure.NewCredentials())) - grpcClient := agent.NewAgentServiceClient(conn) - _, _ = grpcClient.Subscribe( - context, - &agent.AgentSubscribeRequest{ - ServerName: "dummy", - ReplicaIdx: id, - ReplicaConfig: &agent.ReplicaConfig{}, - Shared: true, - AvailableMemoryBytes: 0, - }, - ) - return conn - } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - logger := log.New() - eventHub, err := coordinator.NewEventHub(logger) - g.Expect(err).To(BeNil()) - server := NewAgentServer(logger, &mockStore{}, mockScheduler{}, eventHub, false) - port, err := testing_utils.GetFreePortForTest() - if err != nil { - t.Fatal(err) - } - err = server.startServer(uint(port), false) - if err != nil { - t.Fatal(err) - } - time.Sleep(100 * time.Millisecond) - - mu := sync.Mutex{} - streams := make([]*grpc.ClientConn, 0) - for _, a := range test.agents { - go func(id uint32) { - conn := getStream(id, context.Background(), 
port) - mu.Lock() - streams = append(streams, conn) - mu.Unlock() - }(a.id) - } - - maxCount := 10 - count := 0 - for len(server.agents) != test.expectedAgentsCount && count < maxCount { - time.Sleep(100 * time.Millisecond) - count++ - } - g.Expect(len(server.agents)).To(Equal(test.expectedAgentsCount)) - - for idx, s := range streams { - go func(idx int, s *grpc.ClientConn) { - if test.agents[idx].doClose { - s.Close() - } - }(idx, s) - } - - count = 0 - for len(server.agents) != test.expectedAgentsCountAfterClose && count < maxCount { - time.Sleep(100 * time.Millisecond) - count++ - } - g.Expect(len(server.agents)).To(Equal(test.expectedAgentsCountAfterClose)) - - server.StopAgentStreams() - }) - } -} diff --git a/scheduler/pkg/envoy/server/server.go b/scheduler/pkg/envoy/server/server.go index cb9d1bceed..a46eaff52c 100644 --- a/scheduler/pkg/envoy/server/server.go +++ b/scheduler/pkg/envoy/server/server.go @@ -24,8 +24,6 @@ import ( "google.golang.org/grpc" seldontls "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" - - "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) const ( @@ -68,14 +66,12 @@ func (x *XDSServer) StartXDSServer(port uint) error { return err } } - kaep := util.GetServerKeepAliveEnforcementPolicy() secure := x.certificateStore != nil var grpcOptions []grpc.ServerOption if secure { grpcOptions = append(grpcOptions, grpc.Creds(x.certificateStore.CreateServerTransportCredentials())) } grpcOptions = append(grpcOptions, grpc.MaxConcurrentStreams(grpcMaxConcurrentStreams)) - grpcOptions = append(grpcOptions, grpc.KeepaliveEnforcementPolicy(kaep)) grpcServer := grpc.NewServer(grpcOptions...) lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) diff --git a/scheduler/pkg/kafka/dataflow/server.go b/scheduler/pkg/kafka/dataflow/server.go index aeed2ce299..23ab122364 100644 --- a/scheduler/pkg/kafka/dataflow/server.go +++ b/scheduler/pkg/kafka/dataflow/server.go @@ -44,7 +44,6 @@ type ChainerServer struct { pipelineHandler pipeline.PipelineHandler topicNamer *kafka.TopicNamer loadBalancer util.LoadBalancer - chainerMutex sync.Map chainer.UnimplementedChainerServer } @@ -67,7 +66,6 @@ func NewChainerServer(logger log.FieldLogger, eventHub *coordinator.EventHub, pi pipelineHandler: pipelineHandler, topicNamer: topicNamer, loadBalancer: loadBalancer, - chainerMutex: sync.Map{}, } eventHub.RegisterPipelineEventHandler( @@ -84,12 +82,8 @@ func (c *ChainerServer) StartGrpcServer(agentPort uint) error { if err != nil { log.Fatalf("failed to create listener: %v", err) } - - kaep := util.GetServerKeepAliveEnforcementPolicy() - var grpcOptions []grpc.ServerOption grpcOptions = append(grpcOptions, grpc.MaxConcurrentStreams(grpcMaxConcurrentStreams)) - grpcOptions = append(grpcOptions, grpc.KeepaliveEnforcementPolicy(kaep)) grpcServer := grpc.NewServer(grpcOptions...) 
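Both the agent Subscribe hunk above and the dataflow SubscribePipelineUpdates hunk below drop the same per-subscriber serialization guard. For reference, the removed pattern is the sync.Map LoadOrStore mutex-per-key idiom, sketched here with hypothetical names:

package sketch

import "sync"

// Sketch of the guard being removed: one mutex per subscriber key
// (serverName/replicaIdx for agents, subscriber name for dataflow engines),
// so a reconnecting client blocks until the previous stream for the same key
// has fully torn down.
type perKeyMutex struct {
	locks sync.Map // key -> *sync.Mutex
}

func (p *perKeyMutex) withLock(key string, fn func()) {
	mu, _ := p.locks.LoadOrStore(key, &sync.Mutex{})
	m := mu.(*sync.Mutex)
	m.Lock()
	defer m.Unlock()
	fn()
}

Without the guard, a duplicate subscription for the same key simply overwrites the previous entry in the subscriber map, which is the duplicate-connection scenario the deleted TestSubscribe and TestPipelineSubscribe cases covered.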
chainer.RegisterChainerServer(grpcServer, c) c.logger.Printf("Chainer server running on %d", agentPort) @@ -127,25 +121,17 @@ func (c *ChainerServer) PipelineUpdateEvent(ctx context.Context, message *chaine func (c *ChainerServer) SubscribePipelineUpdates(req *chainer.PipelineSubscriptionRequest, stream chainer.Chainer_SubscribePipelineUpdatesServer) error { logger := c.logger.WithField("func", "SubscribePipelineStatus") - - key := req.GetName() - // this is forcing a serial order per dataflow-engine - // in general this will make sure that a given dataflow-engine disconnects fully before another dataflow-engine is allowed to connect - mu, _ := c.chainerMutex.LoadOrStore(key, &sync.Mutex{}) - mu.(*sync.Mutex).Lock() - defer mu.(*sync.Mutex).Unlock() - logger.Infof("Received subscribe request from %s", req.GetName()) fin := make(chan bool) c.mu.Lock() - c.streams[key] = &ChainerSubscription{ - name: key, + c.streams[req.Name] = &ChainerSubscription{ + name: req.Name, stream: stream, fin: fin, } - c.loadBalancer.AddServer(key) + c.loadBalancer.AddServer(req.Name) c.mu.Unlock() // Handle addition of new server @@ -158,13 +144,13 @@ func (c *ChainerServer) SubscribePipelineUpdates(req *chainer.PipelineSubscripti for { select { case <-fin: - logger.Infof("Closing stream for %s", key) + logger.Infof("Closing stream for %s", req.GetName()) return nil case <-ctx.Done(): - logger.Infof("Stream disconnected %s", key) + logger.Infof("Stream disconnected %s", req.GetName()) c.mu.Lock() - c.loadBalancer.RemoveServer(key) - delete(c.streams, key) + c.loadBalancer.RemoveServer(req.Name) + delete(c.streams, req.Name) c.mu.Unlock() // Handle removal of server c.rebalance() diff --git a/scheduler/pkg/kafka/dataflow/server_test.go b/scheduler/pkg/kafka/dataflow/server_test.go index e52780a9a1..d7f0860f4b 100644 --- a/scheduler/pkg/kafka/dataflow/server_test.go +++ b/scheduler/pkg/kafka/dataflow/server_test.go @@ -10,23 +10,19 @@ the Change License after the Change Date as each is defined in accordance with t package dataflow import ( - "context" "fmt" "os" - "sync" "testing" "time" . 
"github.com/onsi/gomega" log "github.com/sirupsen/logrus" "google.golang.org/grpc" - "google.golang.org/grpc/credentials/insecure" "github.com/seldonio/seldon-core/apis/go/v2/mlops/chainer" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" - testing_utils "github.com/seldonio/seldon-core/scheduler/v2/pkg/internal/testing_utils" "github.com/seldonio/seldon-core/scheduler/v2/pkg/kafka" "github.com/seldonio/seldon-core/scheduler/v2/pkg/kafka/config" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" @@ -325,7 +321,7 @@ func TestPipelineRollingUpgradeEvents(t *testing.T) { } // to allow events to propagate - time.Sleep(700 * time.Millisecond) + time.Sleep(500 * time.Millisecond) if test.connection { if test.loadReqV2 != nil { @@ -638,136 +634,6 @@ func TestPipelineRebalance(t *testing.T) { } } -func TestPipelineSubscribe(t *testing.T) { - g := NewGomegaWithT(t) - type ag struct { - id uint32 - doClose bool - } - - type test struct { - name string - agents []ag - expectedAgentsCount int - expectedAgentsCountAfterClose int - } - - tests := []test{ - { - name: "single connection", - agents: []ag{ - {id: 1, doClose: true}, - }, - expectedAgentsCount: 1, - expectedAgentsCountAfterClose: 0, - }, - { - name: "multiple connection - one not closed", - agents: []ag{ - {id: 1, doClose: false}, {id: 2, doClose: true}, - }, - expectedAgentsCount: 2, - expectedAgentsCountAfterClose: 1, - }, - { - name: "multiple connection - not closed", - agents: []ag{ - {id: 1, doClose: false}, {id: 2, doClose: false}, - }, - expectedAgentsCount: 2, - expectedAgentsCountAfterClose: 2, - }, - { - name: "multiple connection - closed", - agents: []ag{ - {id: 1, doClose: true}, {id: 2, doClose: true}, - }, - expectedAgentsCount: 2, - expectedAgentsCountAfterClose: 0, - }, - { - name: "multiple connection - duplicate", - agents: []ag{ - {id: 1, doClose: true}, {id: 1, doClose: true}, {id: 1, doClose: true}, - }, - expectedAgentsCount: 1, - expectedAgentsCountAfterClose: 0, - }, - { - name: "multiple connection - duplicate not closed", - agents: []ag{ - {id: 1, doClose: true}, {id: 1, doClose: false}, {id: 1, doClose: true}, - }, - expectedAgentsCount: 1, - expectedAgentsCountAfterClose: 1, - }, - } - - getStream := func(id uint32, context context.Context, port int) *grpc.ClientConn { - conn, _ := grpc.NewClient(fmt.Sprintf(":%d", port), grpc.WithTransportCredentials(insecure.NewCredentials())) - grpcClient := chainer.NewChainerClient(conn) - _, _ = grpcClient.SubscribePipelineUpdates( - context, - &chainer.PipelineSubscriptionRequest{ - Name: fmt.Sprintf("agent-%d", id), - }, - ) - return conn - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - serverName := "dummy" - s, _ := createTestScheduler(t, serverName) - port, err := testing_utils.GetFreePortForTest() - if err != nil { - t.Fatal(err) - } - go func() { - _ = s.StartGrpcServer(uint(port)) - }() - - time.Sleep(100 * time.Millisecond) - - mu := sync.Mutex{} - streams := make([]*grpc.ClientConn, 0) - for _, a := range test.agents { - go func(id uint32) { - conn := getStream(id, context.Background(), port) - mu.Lock() - streams = append(streams, conn) - mu.Unlock() - }(a.id) - } - - maxCount := 10 - count := 0 - for len(s.streams) != test.expectedAgentsCount && count < maxCount { - time.Sleep(100 * time.Millisecond) - count++ - } - g.Expect(len(s.streams)).To(Equal(test.expectedAgentsCount)) - - for idx, s := range streams { - go func(idx int, s *grpc.ClientConn) { - if 
test.agents[idx].doClose { - s.Close() - } - }(idx, s) - } - - count = 0 - for len(s.streams) != test.expectedAgentsCountAfterClose && count < maxCount { - time.Sleep(100 * time.Millisecond) - count++ - } - g.Expect(len(s.streams)).To(Equal(test.expectedAgentsCountAfterClose)) - - s.StopSendPipelineEvents() - }) - } -} - type stubChainerServer struct { msgs chan *chainer.PipelineUpdateMessage grpc.ServerStream diff --git a/scheduler/pkg/kafka/gateway/client.go b/scheduler/pkg/kafka/gateway/client.go index f2c5c60131..def18eb33f 100644 --- a/scheduler/pkg/kafka/gateway/client.go +++ b/scheduler/pkg/kafka/gateway/client.go @@ -22,6 +22,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/keepalive" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" seldontls "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" @@ -80,7 +81,11 @@ func (kc *KafkaSchedulerClient) ConnectToScheduler(host string, plainTxtPort int port = tlsPort } - kacp := util.GetClientKeepAliveParameters() + kacp := keepalive.ClientParameters{ + Time: util.ClientKeapAliveTime, + Timeout: util.ClientKeapAliveTimeout, + PermitWithoutStream: util.ClientKeapAlivePermit, + } // note: retry is done in the caller opts := []grpc.DialOption{ @@ -118,7 +123,11 @@ func (kc *KafkaSchedulerClient) Start() error { logFailure := func(err error, delay time.Duration) { kc.logger.WithError(err).Errorf("Scheduler not ready") } - backOffExp := util.GetClientExponentialBackoff() + backOffExp := backoff.NewExponentialBackOff() + // Set some reasonable settings for trying to reconnect to scheduler + backOffExp.MaxElapsedTime = 0 // Never stop due to large time between calls + backOffExp.MaxInterval = time.Second * 15 + backOffExp.InitialInterval = time.Second err := backoff.RetryNotify(kc.SubscribeModelEvents, backOffExp, logFailure) if err != nil { kc.logger.WithError(err).Fatal("Failed to start modelgateway client") @@ -132,11 +141,7 @@ func (kc *KafkaSchedulerClient) SubscribeModelEvents() error { logger := kc.logger.WithField("func", "SubscribeModelEvents") grpcClient := scheduler.NewSchedulerClient(kc.conn) logger.Info("Subscribing to model status events") - stream, errSub := grpcClient.SubscribeModelStatus( - context.Background(), - &scheduler.ModelSubscriptionRequest{SubscriberName: SubscriberName}, - grpc_retry.WithMax(util.MaxGRPCRetriesOnStream), - ) + stream, errSub := grpcClient.SubscribeModelStatus(context.Background(), &scheduler.ModelSubscriptionRequest{SubscriberName: SubscriberName}, grpc_retry.WithMax(100)) if errSub != nil { return errSub } @@ -159,16 +164,6 @@ func (kc *KafkaSchedulerClient) SubscribeModelEvents() error { logger.Infof("Received event name %s version %d state %s", event.ModelName, latestVersionStatus.Version, latestVersionStatus.State.State.String()) - // if the model is in a failed state and the consumer exists then we skip the removal - // this is to prevent the consumer from being removed during transient failures of the control plane - // in this way data plane can potentially continue to serve requests - if latestVersionStatus.GetState().GetState() == scheduler.ModelStatus_ScheduleFailed || latestVersionStatus.GetState().GetState() == scheduler.ModelStatus_ModelProgressing { - if kc.consumerManager.Exists(event.ModelName) { - logger.Warnf("Model %s schedule failed / progressing and consumer exists, skipping from removal", event.ModelName) - continue - } - } - // if there are available replicas then 
we add the consumer for the model // note that this will also get triggered if the model is already added but there is a status change (e.g. due to scale up) // and in the case then it is a no-op diff --git a/scheduler/pkg/kafka/gateway/worker.go b/scheduler/pkg/kafka/gateway/worker.go index 6b544c01a1..3852556fdd 100644 --- a/scheduler/pkg/kafka/gateway/worker.go +++ b/scheduler/pkg/kafka/gateway/worker.go @@ -32,6 +32,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/keepalive" "google.golang.org/grpc/metadata" "google.golang.org/protobuf/proto" @@ -123,6 +124,12 @@ func (iw *InferWorker) getGrpcClient(host string, port int) (v2.GRPCInferenceSer creds = insecure.NewCredentials() } + kacp := keepalive.ClientParameters{ + Time: util.ClientKeapAliveTime, + Timeout: util.ClientKeapAliveTimeout, + PermitWithoutStream: util.ClientKeapAlivePermit, + } + opts := []grpc.DialOption{ grpc.WithTransportCredentials(creds), grpc.WithDefaultCallOptions( @@ -135,6 +142,7 @@ func (iw *InferWorker) getGrpcClient(host string, port int) (v2.GRPCInferenceSer grpc.WithUnaryInterceptor( grpc_retry.UnaryClientInterceptor(retryOpts...), ), + grpc.WithKeepaliveParams(kacp), } conn, err := grpc.NewClient(fmt.Sprintf("%s:%d", host, port), opts...) diff --git a/scheduler/pkg/kafka/pipeline/status/client.go b/scheduler/pkg/kafka/pipeline/status/client.go index e9191bc4e1..df77b4d91d 100644 --- a/scheduler/pkg/kafka/pipeline/status/client.go +++ b/scheduler/pkg/kafka/pipeline/status/client.go @@ -22,6 +22,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/keepalive" "google.golang.org/protobuf/encoding/protojson" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" @@ -86,7 +87,11 @@ func (pc *PipelineSchedulerClient) connectToScheduler(host string, plainTxtPort port = tlsPort } - kacp := util.GetClientKeepAliveParameters() + kacp := keepalive.ClientParameters{ + Time: util.ClientKeapAliveTime, + Timeout: util.ClientKeapAliveTimeout, + PermitWithoutStream: util.ClientKeapAlivePermit, + } // note: retry is done in the caller opts := []grpc.DialOption{ @@ -124,7 +129,11 @@ func (pc *PipelineSchedulerClient) Start(host string, plainTxtPort int, tlsPort logFailure := func(err error, delay time.Duration) { logger.WithError(err).Errorf("Scheduler not ready") } - backOffExp := util.GetClientExponentialBackoff() + backOffExp := backoff.NewExponentialBackOff() + // Set some reasonable settings for trying to reconnect to scheduler + backOffExp.MaxElapsedTime = 0 // Never stop due to large time between calls + backOffExp.MaxInterval = time.Second * 15 + backOffExp.InitialInterval = time.Second err = backoff.RetryNotify(pc.SubscribePipelineEvents, backOffExp, logFailure) if err != nil { logger.WithError(err).Fatal("Failed to start pipeline gateway client") @@ -138,11 +147,7 @@ func (pc *PipelineSchedulerClient) SubscribePipelineEvents() error { logger := pc.logger.WithField("func", "SubscribePipelineEvents") grpcClient := scheduler.NewSchedulerClient(pc.conn) logger.Info("Subscribing to pipeline status events") - stream, errSub := grpcClient.SubscribePipelineStatus( - context.Background(), - &scheduler.PipelineSubscriptionRequest{SubscriberName: SubscriberName}, - grpc_retry.WithMax(util.MaxGRPCRetriesOnStream), - ) + stream, errSub := grpcClient.SubscribePipelineStatus(context.Background(), 
&scheduler.PipelineSubscriptionRequest{SubscriberName: SubscriberName}, grpc_retry.WithMax(100)) if errSub != nil { return errSub } diff --git a/scheduler/pkg/server/server.go b/scheduler/pkg/server/server.go index a9a369989c..3f74c40da1 100644 --- a/scheduler/pkg/server/server.go +++ b/scheduler/pkg/server/server.go @@ -32,7 +32,6 @@ import ( "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/experiment" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" "github.com/seldonio/seldon-core/scheduler/v2/pkg/synchroniser" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) const ( @@ -132,16 +131,12 @@ func (s *SchedulerServer) startServer(port uint, secure bool) error { if err != nil { return err } - - kaep := util.GetServerKeepAliveEnforcementPolicy() - opts := []grpc.ServerOption{} if secure { opts = append(opts, grpc.Creds(s.certificateStore.CreateServerTransportCredentials())) } opts = append(opts, grpc.MaxConcurrentStreams(grpcMaxConcurrentStreams)) opts = append(opts, grpc.StatsHandler(otelgrpc.NewServerHandler())) - opts = append(opts, grpc.KeepaliveEnforcementPolicy(kaep)) grpcServer := grpc.NewServer(opts...) pb.RegisterSchedulerServer(grpcServer, s) s.logger.Printf("Scheduler server running on %d mtls:%v", port, secure) diff --git a/scheduler/pkg/store/experiment/db_test.go b/scheduler/pkg/store/experiment/db_test.go index b8085a1e46..44f99abe4a 100644 --- a/scheduler/pkg/store/experiment/db_test.go +++ b/scheduler/pkg/store/experiment/db_test.go @@ -12,6 +12,7 @@ package experiment import ( "fmt" "testing" + "time" "github.com/dgraph-io/badger/v3" "github.com/google/go-cmp/cmp" @@ -104,6 +105,7 @@ func TestSaveWithTTL(t *testing.T) { logger := log.New() db, err := newExperimentDbManager(getExperimentDbFolder(path), logger, 10) g.Expect(err).To(BeNil()) + experiment.DeletedAt = time.Now() err = db.save(experiment) g.Expect(err).To(BeNil()) @@ -115,18 +117,6 @@ func TestSaveWithTTL(t *testing.T) { g.Expect(err).To(BeNil()) g.Expect(item.ExpiresAt()).ToNot(BeZero()) - // check that the resource can be "undeleted" - experiment.Deleted = false - err = db.save(experiment) - g.Expect(err).To(BeNil()) - - err = db.db.View(func(txn *badger.Txn) error { - item, err = txn.Get(([]byte(experiment.Name))) - return err - }) - g.Expect(err).To(BeNil()) - g.Expect(item.ExpiresAt()).To(BeZero()) - err = db.Stop() g.Expect(err).To(BeNil()) } diff --git a/scheduler/pkg/store/experiment/store.go b/scheduler/pkg/store/experiment/store.go index d4ba9ed918..f5cf49de66 100644 --- a/scheduler/pkg/store/experiment/store.go +++ b/scheduler/pkg/store/experiment/store.go @@ -474,6 +474,7 @@ func (es *ExperimentStore) GetExperiments() ([]*Experiment, error) { func (es *ExperimentStore) cleanupDeletedExperiments() { es.mu.Lock() defer es.mu.Unlock() + es.logger.Info("cleaning up deleted experiments") for _, experiment := range es.experiments { if experiment.Deleted { if experiment.DeletedAt.IsZero() { @@ -486,7 +487,6 @@ func (es *ExperimentStore) cleanupDeletedExperiments() { } } else if experiment.DeletedAt.Add(es.db.deletedResourceTTL).Before(time.Now()) { delete(es.experiments, experiment.Name) - es.logger.Info("cleaning up deleted experiment: %s", experiment.Name) } } } diff --git a/scheduler/pkg/store/pipeline/db_test.go b/scheduler/pkg/store/pipeline/db_test.go index 414cf90638..ce3ca86cdb 100644 --- a/scheduler/pkg/store/pipeline/db_test.go +++ b/scheduler/pkg/store/pipeline/db_test.go @@ -67,18 +67,6 @@ func TestSaveWithTTL(t *testing.T) { g.Expect(err).To(BeNil()) 
g.Expect(item.ExpiresAt()).ToNot(BeZero()) - // check that the resource can be "undeleted" - pipeline.Deleted = false - err = db.save(pipeline) - g.Expect(err).To(BeNil()) - - err = db.db.View(func(txn *badger.Txn) error { - item, err = txn.Get(([]byte(pipeline.Name))) - return err - }) - g.Expect(err).To(BeNil()) - g.Expect(item.ExpiresAt()).To(BeZero()) - err = db.Stop() g.Expect(err).To(BeNil()) } diff --git a/scheduler/pkg/store/pipeline/store.go b/scheduler/pkg/store/pipeline/store.go index 36ddc16405..cd08ed553a 100644 --- a/scheduler/pkg/store/pipeline/store.go +++ b/scheduler/pkg/store/pipeline/store.go @@ -448,6 +448,7 @@ func (ps *PipelineStore) handleModelEvents(event coordinator.ModelEventMsg) { func (ps *PipelineStore) cleanupDeletedPipelines() { ps.mu.Lock() defer ps.mu.Unlock() + ps.logger.Info("cleaning up deleted pipelines") for _, pipeline := range ps.pipelines { if pipeline.Deleted { if pipeline.DeletedAt.IsZero() { @@ -460,7 +461,6 @@ func (ps *PipelineStore) cleanupDeletedPipelines() { } } else if pipeline.DeletedAt.Add(ps.db.deletedResourceTTL).Before(time.Now()) { delete(ps.pipelines, pipeline.Name) - ps.logger.Info("cleaning up deleted pipeline: %s", pipeline.Name) } } } diff --git a/scheduler/pkg/util/constants.go b/scheduler/pkg/util/constants.go index ea837db8ce..971c4a860c 100644 --- a/scheduler/pkg/util/constants.go +++ b/scheduler/pkg/util/constants.go @@ -36,14 +36,7 @@ const ( const ( EnvoyUpdateDefaultBatchWait = 250 * time.Millisecond - // note that we keep client and server keepalive times the same - // they need to match counterparts in controller client: operator/scheduler/client.go - // and dataflow-engine: data-flow/src/main/kotlin/io/seldon/dataflow/PipelineSubscriber.kt - gRPCKeepAliveTime = 60 * time.Second - clientKeepAliveTimeout = 2 * time.Second - gRPCKeepAlivePermit = false - MaxGRPCRetriesOnStream = 100 // this is at the grpc library level - backOffExpMaxElapsedTime = 0 // Never stop due to large time between calls - backOffExpMaxInterval = time.Second * 15 - backOffExpInitialInterval = time.Second + ClientKeapAliveTime = 60 * time.Second + ClientKeapAliveTimeout = 2 * time.Second + ClientKeapAlivePermit = true ) diff --git a/scheduler/pkg/util/grpc.go b/scheduler/pkg/util/grpc.go deleted file mode 100644 index 96aed017da..0000000000 --- a/scheduler/pkg/util/grpc.go +++ /dev/null @@ -1,38 +0,0 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed by -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - -package util - -import ( - "github.com/cenkalti/backoff/v4" - "google.golang.org/grpc/keepalive" -) - -func GetClientKeepAliveParameters() keepalive.ClientParameters { - return keepalive.ClientParameters{ - Time: gRPCKeepAliveTime, - Timeout: clientKeepAliveTimeout, - PermitWithoutStream: gRPCKeepAlivePermit, - } -} - -func GetServerKeepAliveEnforcementPolicy() keepalive.EnforcementPolicy { - return keepalive.EnforcementPolicy{ - MinTime: gRPCKeepAliveTime, - PermitWithoutStream: gRPCKeepAlivePermit, - } -} - -func GetClientExponentialBackoff() *backoff.ExponentialBackOff { - backOffExp := backoff.NewExponentialBackOff() - backOffExp.MaxElapsedTime = backOffExpMaxElapsedTime - backOffExp.MaxInterval = backOffExpMaxInterval - backOffExp.InitialInterval = backOffExpInitialInterval - return backOffExp -}
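For completeness: the inline backoff settings that replace the deleted util.GetClientExponentialBackoff helper (final hunk above) follow the usual cenkalti/backoff/v4 retry-forever pattern. A self-contained sketch, with a hypothetical function name:

package sketch

import (
	"time"

	"github.com/cenkalti/backoff/v4"
	log "github.com/sirupsen/logrus"
)

// Sketch of the retry loop used by the modelgateway and pipeline-status
// clients (the agent client sets only MaxElapsedTime): exponential backoff
// capped at 15s, never giving up, logging each failure before the next retry.
func retryForever(connect func() error) error {
	expBackOff := backoff.NewExponentialBackOff()
	expBackOff.MaxElapsedTime = 0 // never stop due to elapsed time
	expBackOff.MaxInterval = 15 * time.Second
	expBackOff.InitialInterval = time.Second

	notify := func(err error, delay time.Duration) {
		log.WithError(err).Warnf("scheduler not ready, retrying in %s", delay)
	}
	return backoff.RetryNotify(connect, expBackOff, notify)
}

Per-RPC retries are handled separately via grpc_retry.WithMax on the individual calls, which this patch inlines (100 on the status subscriptions, 1 on the agent subscribe) instead of reading util.MaxGRPCRetriesOnStream.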