diff --git a/pkg/backend/backend.go b/pkg/backend/backend.go index 260b5a1..fb4a72e 100644 --- a/pkg/backend/backend.go +++ b/pkg/backend/backend.go @@ -28,6 +28,7 @@ type BackendOperations interface { UnassignMemory(context.Context, *ConfigurationSettings, *UnassignMemoryRequest) (*UnassignMemoryResponse, error) GetMemoryById(context.Context, *ConfigurationSettings, *GetMemoryByIdRequest) (*GetMemoryByIdResponse, error) GetBackendInfo(context.Context) *GetBackendInfoResponse + GetBackendStatus(context.Context) *GetBackendStatusResponse } type commonService struct { diff --git a/pkg/backend/httpfish.go b/pkg/backend/httpfish.go index 11dd3e8..02732aa 100644 --- a/pkg/backend/httpfish.go +++ b/pkg/backend/httpfish.go @@ -298,7 +298,7 @@ func (session *Session) queryWithJSON(operation HTTPOperationType, path string, TLSClientConfig: &tls.Config{InsecureSkipVerify: session.insecure}, } - session.client = &http.Client{Transport: tr, Timeout: 10 * time.Second} + session.client = &http.Client{Transport: tr, Timeout: 10 * time.Second} //device power off will present as timeout } httpresponse, err := session.client.Do(request) if err != nil { @@ -481,7 +481,7 @@ func (session *Session) auth() error { func (service *httpfishService) GetRootService(ctx context.Context, settings *ConfigurationSettings, req *GetRootServiceRequest) (*GetRootServiceResponse, error) { session := service.service.session.(*Session) - response := session.query(HTTPOperation.GET, redfish_serviceroot) + response := session.query(HTTPOperation.GET, redfish_serviceroot) //Eval http timeout. 
also combine with CheckSession() if response.err != nil { return nil, fmt.Errorf("failed to get root service: %w", response.err) } @@ -572,13 +572,17 @@ func (service *httpfishService) DeleteSession(ctx context.Context, settings *Con // CloseIdleConnections closes the idle connections that a session client may make use of // session.CloseIdleConnections() delete(activeSessions, session.SessionId) + deletedId := session.SessionId + + service.service.session.(*Session).SessionId = "" + service.service.session.(*Session).RedfishSessionId = "" // Let user know of delete backend failure. if response.err != nil { - return &DeleteSessionResponse{SessionId: session.SessionId, IpAddress: session.ip, Port: int32(session.port), Status: "Failure"}, response.err + return &DeleteSessionResponse{SessionId: deletedId, IpAddress: session.ip, Port: int32(session.port), Status: "Failure"}, response.err } - return &DeleteSessionResponse{SessionId: session.SessionId, IpAddress: session.ip, Port: int32(session.port), Status: "Success"}, nil + return &DeleteSessionResponse{SessionId: deletedId, IpAddress: session.ip, Port: int32(session.port), Status: "Success"}, nil } // This struct holds the detail info of a specific resource block @@ -1829,3 +1833,31 @@ func (service *httpfishService) GetMemory(ctx context.Context, settings *Configu func (service *httpfishService) GetBackendInfo(ctx context.Context) *GetBackendInfoResponse { return &GetBackendInfoResponse{BackendName: "httpfish", Version: "0.1", SessionId: service.service.session.(*Session).SessionId} } + +// GetBackendStatus: Get the status of this backend +func (service *httpfishService) GetBackendStatus(ctx context.Context) *GetBackendStatusResponse { + logger := klog.FromContext(ctx) + logger.V(4).Info("====== GetBackendStatus ======") + + status := GetBackendStatusResponse{} + session := service.service.session.(*Session) + + response := session.query(HTTPOperation.GET, redfish_serviceroot) + status.FoundRootService = 
response.err == nil + + if status.FoundRootService { + response := session.query(HTTPOperation.GET, session.buildPath(SessionServiceKey, session.RedfishSessionId)) + status.FoundSession = response.err == nil + + if status.FoundSession { + status.SessionId = session.SessionId + status.RedfishSessionId = session.RedfishSessionId + } + + logger.V(4).Info("GetBackendStatus", "session id", status.SessionId, "redfish session id", status.RedfishSessionId) + } + + logger.V(4).Info("GetBackendStatus", "found service root", status.FoundRootService, "found service session", status.FoundSession) + + return &status +} diff --git a/pkg/backend/ops.go b/pkg/backend/ops.go index 86e6098..4224504 100644 --- a/pkg/backend/ops.go +++ b/pkg/backend/ops.go @@ -262,6 +262,14 @@ type GetBackendInfoResponse struct { Version string SessionId string } + +type GetBackendStatusResponse struct { + FoundRootService bool + FoundSession bool + SessionId string + RedfishSessionId string +} + type GetRootServiceRequest struct { } diff --git a/pkg/common/common.go b/pkg/common/common.go index cad91de..82392f8 100644 --- a/pkg/common/common.go +++ b/pkg/common/common.go @@ -4,6 +4,7 @@ type ConnectionStatus string const ( ONLINE ConnectionStatus = "online" + FOUND ConnectionStatus = "found" OFFLINE ConnectionStatus = "offline" NOT_APPLICABLE ConnectionStatus = "n\\a" ) diff --git a/pkg/common/parameters.go b/pkg/common/parameters.go index 70ed12b..896f5ed 100644 --- a/pkg/common/parameters.go +++ b/pkg/common/parameters.go @@ -11,8 +11,8 @@ import ( ) const ( - NumUuidCharsForId = 4 // Number of chars to strip from an interally generated uuid (starting from the right) for use in the internally generated ID's for appliance, blade and host - SyncChekTimeoutSeconds = 30.0 // Number of seconds to check session timeout + NumUuidCharsForId = 4 // Number of chars to strip from an internally generated uuid (starting from the right) for use in the internally generated ID's for appliance, blade and host + 
SyncCheckTimeoutSeconds = 15.0 // Number of seconds to check session timeout ) const ( DefaultBackend = "httpfish" // Default backend interface diff --git a/pkg/manager/appliance.go b/pkg/manager/appliance.go index 1877d59..62c8c03 100644 --- a/pkg/manager/appliance.go +++ b/pkg/manager/appliance.go @@ -154,7 +154,7 @@ func (a *Appliance) AddBlade(ctx context.Context, c *openapi.Credentials) (*Blad // Add blade to appliance a.Blades[blade.Id] = blade - // Add host to datastore + // Add blade to datastore applianceDatum, _ := datastore.DStore().GetDataStore().GetApplianceDatumById(a.Id) applianceDatum.AddBladeDatum(c) datastore.DStore().Store() @@ -164,6 +164,82 @@ func (a *Appliance) AddBlade(ctx context.Context, c *openapi.Credentials) (*Blad return blade, nil } +// UpdateBlade: Open a new session with a blade, create the new Blade object and then cache it +func (a *Appliance) UpdateBladeById(ctx context.Context, bladeId string) (*Blade, error) { + logger := klog.FromContext(ctx) + logger.V(4).Info(">>>>>> UpdateBlade: ", "applianceId", a.Id, "bladeId", bladeId) + + // query for blade + blade, ok := a.Blades[bladeId] + if !ok { + newErr := fmt.Errorf("appliance [%s] blade [%s] not found during update by id", bladeId, a.Id) + logger.Error(newErr, "failure: update blade by id") + + return nil, &common.RequestError{StatusCode: common.StatusBladeIdDoesNotExist, Err: newErr} + } + + creds := blade.creds + ops := blade.backendOps + + req := backend.CreateSessionRequest{ + Ip: creds.IpAddress, + Port: creds.Port, + Username: creds.Username, + Password: creds.Password, + Insecure: creds.Insecure, + Protocol: creds.Protocol, + } + + settings := backend.ConfigurationSettings{} + + // Create a new session + response, err := ops.CreateSession(ctx, &settings, &req) + if err != nil || response == nil { + newErr := fmt.Errorf("create session failure at [%s:%d] using interface [%s]: %w", creds.IpAddress, creds.Port, ops.GetBackendInfo(ctx).BackendName, err) + logger.Error(newErr, 
"failure: update blade by id") + return nil, &common.RequestError{StatusCode: common.StatusBladeCreateSessionFailure, Err: newErr} + } + + // Create the new Blade + r := RequestNewBlade{ + BladeId: bladeId, + ApplianceId: a.Id, + Ip: creds.IpAddress, + Status: common.ONLINE, + Port: uint16(creds.Port), + BackendOps: ops, + Creds: creds, + } + + updatedBlade, err := NewBlade(ctx, &r) + if err != nil || updatedBlade == nil { + req := backend.DeleteSessionRequest{} + response, deleErr := ops.DeleteSession(ctx, &settings, &req) + if deleErr != nil || response == nil { + newErr := fmt.Errorf("failed to delete session [%s:%d] after failed blade [%s] object creation: %w", creds.IpAddress, creds.Port, bladeId, err) + logger.Error(newErr, "failure: add blade") + return nil, &common.RequestError{StatusCode: common.StatusBladeDeleteSessionFailure, Err: newErr} + } + + newErr := fmt.Errorf("appliance [%s] new blade object creation failure: %w", a.Id, err) + logger.Error(newErr, "failure: add blade") + return nil, &common.RequestError{StatusCode: common.StatusManagerInitializationFailure, Err: newErr} + } + + // Replace blade in appliance + a.Blades[blade.Id] = updatedBlade + + // Replace blade in datastore + applianceDatum, _ := datastore.DStore().GetDataStore().GetApplianceDatumById(a.Id) + applianceDatum.DeleteBladeDatumById(blade.Id) + applianceDatum.AddBladeDatum(creds) + datastore.DStore().Store() + + logger.V(2).Info("success: update blade", "bladeId", updatedBlade.Id, "applianceId", a.Id) + + return updatedBlade, nil +} + func (a *Appliance) DeleteAllBlades(ctx context.Context) { logger := klog.FromContext(ctx) logger.V(4).Info(">>>>>> DeleteAllBlades: ", "applianceId", a.Id) @@ -180,15 +256,33 @@ func (a *Appliance) DeleteBladeById(ctx context.Context, bladeId string) (*Blade logger := klog.FromContext(ctx) logger.V(4).Info(">>>>>> DeleteBladeById: ", "bladeId", bladeId, "applianceId", a.Id) + blade, err := a.DeleteBladeByIdBackend(ctx, bladeId) + if err != nil || 
blade == nil { + // Currently, backend ALWAYS deletes the blade session from the backend map. Do the same in this (manager) layer + logger.V(2).Info("force complete appliance blade deletion after backend session failure", "bladeId", bladeId, "applianceId", a.Id) + a.DeleteBladeByIdManager(ctx, bladeId) + + return blade, err + } + + a.DeleteBladeByIdManager(ctx, bladeId) + + logger.V(2).Info("success: delete blade by id", "bladeId", blade.Id, "applianceId", a.Id) + + return blade, nil +} + +// DeleteBladeByIdBackend: Delete the blade from backend only +func (a *Appliance) DeleteBladeByIdBackend(ctx context.Context, bladeId string) (*Blade, error) { + logger := klog.FromContext(ctx) + logger.V(4).Info(">>>>>> DeleteBladeBackendById: ", "bladeId", bladeId, "applianceId", a.Id) + // query for blade blade, ok := a.Blades[bladeId] if !ok { logger.V(2).Info("blade not found during delete:", "bladeId", bladeId, "applianceId", a.Id) newErr := fmt.Errorf("blade [%s] not found during delete", bladeId) - logger.V(2).Info("force complete appliance blade deletion after error", "bladeId", blade.Id, "applianceId", a.Id) - a.deleteBlade(bladeId) - return nil, &common.RequestError{StatusCode: common.StatusBladeIdDoesNotExist, Err: newErr} } @@ -202,22 +296,30 @@ func (a *Appliance) DeleteBladeById(ctx context.Context, bladeId string) (*Blade response, err := ops.DeleteSession(ctx, &settings, &req) if err != nil || response == nil { newErr := fmt.Errorf("failed to delete blade [%s] backend [%s] session [%s]: %w", blade.Id, ops.GetBackendInfo(ctx).BackendName, blade.Socket.String(), err) - logger.Error(newErr, "failure: delete blade by id") - - // Currently, backend ALWAYS deletes the blade session from the backend map. 
Do the same in the this (manager) layer - logger.V(2).Info("force complete appliance blade deletion after backend session failure", "bladeId", blade.Id, "applianceId", a.Id) - a.deleteBlade(bladeId) + logger.Error(newErr, "failure: delete blade by id (backend)") return blade, &common.RequestError{StatusCode: common.StatusBladeDeleteSessionFailure, Err: newErr} // Still return the blade for recovery } - a.deleteBlade(bladeId) - - logger.V(2).Info("success: delete blade by id", "bladeId", blade.Id, "applianceId", a.Id) + logger.V(2).Info("success: delete blade by id (backend)", "bladeId", blade.Id, "applianceId", a.Id) return blade, nil } +// DeleteBladeByIdManager: Delete the blade from manager layer (appliance blade map and datastore) +func (a *Appliance) DeleteBladeByIdManager(ctx context.Context, bladeId string) { + logger := klog.FromContext(ctx) + logger.V(4).Info(">>>>>> DeleteBladeByIdManager: ", "bladeId", bladeId, "applianceId", a.Id) + + // delete blade from manager cache + delete(a.Blades, bladeId) + + // delete blade from datastore + applianceDatum, _ := datastore.DStore().GetDataStore().GetApplianceDatumById(a.Id) + applianceDatum.DeleteBladeDatumById(bladeId) + datastore.DStore().Store() +} + func (a *Appliance) GetAllBladeIds() []string { var ids []string @@ -241,25 +343,23 @@ func (a *Appliance) GetBladeById(ctx context.Context, bladeId string) (*Blade, e } // Check for resync - if !blade.CheckSync(ctx) { - logger.V(2).Info("GetBladeById: blade might be out of sync", "bladeId", bladeId) - ok := blade.backendOps.CheckSession(ctx) - if !ok { + if blade.CheckSync(ctx) { + logger.V(2).Info("GetBladeById: blade might be out of sync", "bladeId", bladeId, "applianceId", a.Id) + blade.UpdateConnectionStatusBackend(ctx) + if blade.Status == common.FOUND { // good power, bad session blade, err = a.ResyncBladeById(ctx, bladeId) if err != nil { - newErr := fmt.Errorf("failed to resync host(add): host [%s]: %w", bladeId, err) - logger.Error(newErr, "failure: 
resync host") + newErr := fmt.Errorf("failed to resync blade by id [%s]: %w", bladeId, err) + logger.Error(newErr, "failure: get blade by id") return nil, &common.RequestError{StatusCode: err.(*common.RequestError).StatusCode, Err: newErr} - } else { - logger.V(2).Info("success: auto resync host", "bladeId", bladeId) } + + logger.V(2).Info("success: auto resync blade", "bladeId", bladeId) } else { blade.SetSync(ctx) } } - blade.UpdateConnectionStatusBackend(ctx) - logger.V(2).Info("success: get blade by id", "status", blade.Status, "bladeId", blade.Id, "applianceId", a.Id) return blade, nil @@ -346,7 +446,19 @@ func (a *Appliance) ResyncBladeById(ctx context.Context, bladeId string) (*Blade return nil, &common.RequestError{StatusCode: common.StatusBladeIdDoesNotExist, Err: newErr} } - blade.UpdateConnectionStatusBackend(ctx) + blade, err := a.DeleteBladeByIdBackend(ctx, bladeId) + if err != nil { + logger.Error(err, "resync blade: ignoring delete blade failure") + } + + blade.UpdateConnectionStatusBackend(ctx) // update status here in case of failure during update + + blade, err = a.UpdateBladeById(ctx, blade.Id) + if err != nil { + newErr := fmt.Errorf("failed to resync blade(add): appliance [%s] blade [%s]: %w", a.Id, bladeId, err) + logger.Error(newErr, "failure: resync blade") + return nil, &common.RequestError{StatusCode: err.(*common.RequestError).StatusCode, Err: newErr} + } logger.V(2).Info("success: resync blade", "status", blade.Status, "bladeId", bladeId, "applianceId", a.Id) @@ -356,13 +468,3 @@ func (a *Appliance) ResyncBladeById(ctx context.Context, bladeId string) (*Blade ///////////////////////////////////// //////// Private Functions ////////// ///////////////////////////////////// - -func (a *Appliance) deleteBlade(bladeId string) { - // delete blade from manager cache - delete(a.Blades, bladeId) - - // delete blade from datastore - applianceDatum, _ := datastore.DStore().GetDataStore().GetApplianceDatumById(a.Id) - 
applianceDatum.DeleteBladeDatumById(bladeId) - datastore.DStore().Store() -} diff --git a/pkg/manager/blade.go b/pkg/manager/blade.go index 77d9037..863adc8 100644 --- a/pkg/manager/blade.go +++ b/pkg/manager/blade.go @@ -83,20 +83,19 @@ func NewBlade(ctx context.Context, r *RequestNewBlade) (*Blade, error) { func (b *Blade) SetSync(ctx context.Context) { logger := klog.FromContext(ctx) - logger.V(3).Info(">>>>>> SetSyncFlag(Blade): ", "bladeId", b.Id) + logger.V(4).Info(">>>>>> SetSync: ", "bladeId", b.Id) b.lastSyncTimeStamp = time.Now() } func (b *Blade) CheckSync(ctx context.Context) bool { logger := klog.FromContext(ctx) - logger.V(2).Info(">>>>>> CheckSyncFlag(Blade): ", "bladeId", b.Id) + logger.V(4).Info(">>>>>> CheckSync: ", "bladeId", b.Id) - if time.Since(b.lastSyncTimeStamp).Seconds() > common.SyncChekTimeoutSeconds { - return false - } else { + if time.Since(b.lastSyncTimeStamp).Seconds() > common.SyncCheckTimeoutSeconds { b.SetSync(ctx) // renew the timestamp return true } + return false } type RequestAssignMemory struct { @@ -654,17 +653,20 @@ func (b *Blade) IsOnline(ctx context.Context) bool { return b.Status == common.ONLINE } -// UpdateConnectionStatusBackend - Query the blade root service to verify continued connection and update the object status accordingly. +// UpdateConnectionStatusBackend - Query the blade for backend root and session status and then update the manager's blade status accordingly. 
func (b *Blade) UpdateConnectionStatusBackend(ctx context.Context) { logger := klog.FromContext(ctx) logger.V(4).Info(">>>>>> UpdateConnectionStatusBackend: ", "bladeId", b.Id) - req := backend.GetRootServiceRequest{} - response, err := b.backendOps.GetRootService(ctx, &backend.ConfigurationSettings{}, &req) - if err != nil || response == nil { - b.Status = common.OFFLINE + status := b.backendOps.GetBackendStatus(ctx) + if status.FoundRootService { + if status.FoundSession { + b.Status = common.ONLINE + } else { + b.Status = common.FOUND + } } else { - b.Status = common.ONLINE + b.Status = common.OFFLINE } // Update datastore status @@ -673,7 +675,7 @@ func (b *Blade) UpdateConnectionStatusBackend(ctx context.Context) { bladeDatum.SetConnectionStatus(&b.Status) datastore.DStore().Store() - logger.V(2).Info("update blade status(backend)", "status", b.Status, "bladeId", b.Id) + logger.V(2).Info("success: update blade status(backend)", "status", b.Status, "bladeId", b.Id) } ///////////////////////////////////// diff --git a/pkg/manager/host.go b/pkg/manager/host.go index d2ba403..3eb8b48 100644 --- a/pkg/manager/host.go +++ b/pkg/manager/host.go @@ -87,7 +87,7 @@ func (h *Host) CheckSync(ctx context.Context) bool { logger := klog.FromContext(ctx) logger.V(2).Info(">>>>>> CheckSyncFlag(Host): ", "hostId", h.Id) - if time.Since(h.lastSyncTimeStamp).Seconds() > common.SyncChekTimeoutSeconds { + if time.Since(h.lastSyncTimeStamp).Seconds() > common.SyncCheckTimeoutSeconds { return false } else { h.SetSync(ctx) // renew the timestamp