Skip to content

Commit

Permalink
fix: add auto-resync for blades
Browse files Browse the repository at this point in the history
Needed to combine the new auto-resync with the recently added online/offline status functionality.
New DeleteBladeByIdBackend() gives service the ability to only delete sessions from the backend.  This allows the webui to still see the blade, even if it's powered down.
New UpdateBladeById() gives the service the ability to update (resync) an existing blade that's already visible to the client.
Created a new "found" Status state for devices (which represents "powered" but with no valid redfish session id).
  • Loading branch information
scott-howe-1 committed Oct 25, 2024
1 parent 0e0cbf7 commit 9e40313
Show file tree
Hide file tree
Showing 8 changed files with 198 additions and 52 deletions.
1 change: 1 addition & 0 deletions pkg/backend/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ type BackendOperations interface {
UnassignMemory(context.Context, *ConfigurationSettings, *UnassignMemoryRequest) (*UnassignMemoryResponse, error)
GetMemoryById(context.Context, *ConfigurationSettings, *GetMemoryByIdRequest) (*GetMemoryByIdResponse, error)
GetBackendInfo(context.Context) *GetBackendInfoResponse
GetBackendStatus(context.Context) *GetBackendStatusResponse
}

type commonService struct {
Expand Down
40 changes: 36 additions & 4 deletions pkg/backend/httpfish.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ func (session *Session) queryWithJSON(operation HTTPOperationType, path string,
TLSClientConfig: &tls.Config{InsecureSkipVerify: session.insecure},
}

session.client = &http.Client{Transport: tr, Timeout: 10 * time.Second}
session.client = &http.Client{Transport: tr, Timeout: 10 * time.Second} //device power off will present as timeout
}
httpresponse, err := session.client.Do(request)
if err != nil {
Expand Down Expand Up @@ -481,7 +481,7 @@ func (session *Session) auth() error {
func (service *httpfishService) GetRootService(ctx context.Context, settings *ConfigurationSettings, req *GetRootServiceRequest) (*GetRootServiceResponse, error) {
session := service.service.session.(*Session)

response := session.query(HTTPOperation.GET, redfish_serviceroot)
response := session.query(HTTPOperation.GET, redfish_serviceroot) //Eval http timeout. also combine with CheckSeesion()
if response.err != nil {
return nil, fmt.Errorf("failed to get root service: %w", response.err)
}
Expand Down Expand Up @@ -572,13 +572,17 @@ func (service *httpfishService) DeleteSession(ctx context.Context, settings *Con
// CloseIdleConnections closes the idle connections that a session client may make use of
// session.CloseIdleConnections()
delete(activeSessions, session.SessionId)
deletedId := session.SessionId

service.service.session.(*Session).SessionId = ""
service.service.session.(*Session).RedfishSessionId = ""

// Let user know of delete backend failure.
if response.err != nil {
return &DeleteSessionResponse{SessionId: session.SessionId, IpAddress: session.ip, Port: int32(session.port), Status: "Failure"}, response.err
return &DeleteSessionResponse{SessionId: deletedId, IpAddress: session.ip, Port: int32(session.port), Status: "Failure"}, response.err
}

return &DeleteSessionResponse{SessionId: session.SessionId, IpAddress: session.ip, Port: int32(session.port), Status: "Success"}, nil
return &DeleteSessionResponse{SessionId: deletedId, IpAddress: session.ip, Port: int32(session.port), Status: "Success"}, nil
}

// This struct holds the detail info of a specific resource block
Expand Down Expand Up @@ -1829,3 +1833,31 @@ func (service *httpfishService) GetMemory(ctx context.Context, settings *Configu
func (service *httpfishService) GetBackendInfo(ctx context.Context) *GetBackendInfoResponse {
	// The session is stored behind an interface; this backend always expects its own *Session type.
	session := service.service.session.(*Session)

	// Backend name and version are fixed for this backend; only the session id varies.
	info := GetBackendInfoResponse{
		BackendName: "httpfish",
		Version:     "0.1",
		SessionId:   session.SessionId,
	}

	return &info
}

// GetBackendStatus: Probe the backend and report whether its redfish service root and its
// redfish session can currently be queried.
// The session ids in the response are only filled in when the session query succeeds.
func (service *httpfishService) GetBackendStatus(ctx context.Context) *GetBackendStatusResponse {
	logger := klog.FromContext(ctx)
	logger.V(4).Info("====== GetBackendStatus ======")

	session := service.service.session.(*Session)
	status := &GetBackendStatusResponse{}

	// Any error on the service root query is reported as "root service not found".
	rootResponse := session.query(HTTPOperation.GET, redfish_serviceroot)
	if rootResponse.err == nil {
		status.FoundRootService = true

		// Only look for the session once the service root has answered.
		sessionPath := session.buildPath(SessionServiceKey, session.RedfishSessionId)
		sessionResponse := session.query(HTTPOperation.GET, sessionPath)
		if sessionResponse.err == nil {
			status.FoundSession = true
			status.SessionId = session.SessionId
			status.RedfishSessionId = session.RedfishSessionId
		}

		logger.V(4).Info("GetBackendStatus", "session id", status.SessionId, "redfish session id", status.RedfishSessionId)
	}

	logger.V(4).Info("GetBackendStatus", "found service root", status.FoundRootService, "found service session", status.FoundSession)

	return status
}
8 changes: 8 additions & 0 deletions pkg/backend/ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,14 @@ type GetBackendInfoResponse struct {
Version string
SessionId string
}

// GetBackendStatusResponse: Result of a backend status probe (see BackendOperations.GetBackendStatus).
// Field meanings below describe how the httpfish backend fills them in.
type GetBackendStatusResponse struct {
FoundRootService bool // true when the redfish service root query returned without error
FoundSession bool // true when the query for the current redfish session returned without error
SessionId string // session id; left empty unless FoundSession is true
RedfishSessionId string // redfish session id; left empty unless FoundSession is true
}

// GetRootServiceRequest: Request type for GetRootService. It currently carries no fields.
type GetRootServiceRequest struct {
}

Expand Down
1 change: 1 addition & 0 deletions pkg/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ type ConnectionStatus string

// Connection status values reported for a device.
const (
ONLINE ConnectionStatus = "online"
FOUND ConnectionStatus = "found" // device is powered but has no valid redfish session id
OFFLINE ConnectionStatus = "offline"
NOT_APPLICABLE ConnectionStatus = "n\\a"
)
4 changes: 2 additions & 2 deletions pkg/common/parameters.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ import (
)

const (
NumUuidCharsForId = 4 // Number of chars to strip from an internally generated uuid (starting from the right) for use in the internally generated ID's for appliance, blade and host
SyncChekTimeoutSeconds = 30.0 // Number of seconds to check session timeout
NumUuidCharsForId = 4 // Number of chars to strip from an internally generated uuid (starting from the right) for use in the internally generated ID's for appliance, blade and host
SyncCheckTimeoutSeconds = 15.0 // Number of seconds to check session timeout
)
const (
DefaultBackend = "httpfish" // Default backend interface
Expand Down
168 changes: 135 additions & 33 deletions pkg/manager/appliance.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ func (a *Appliance) AddBlade(ctx context.Context, c *openapi.Credentials) (*Blad
// Add blade to appliance
a.Blades[blade.Id] = blade

// Add host to datastore
// Add blade to datastore
applianceDatum, _ := datastore.DStore().GetDataStore().GetApplianceDatumById(a.Id)
applianceDatum.AddBladeDatum(c)
datastore.DStore().Store()
Expand All @@ -164,6 +164,82 @@ func (a *Appliance) AddBlade(ctx context.Context, c *openapi.Credentials) (*Blad
return blade, nil
}

// UpdateBladeById: Open a new backend session for a blade the appliance already knows about,
// build a fresh Blade object from it and replace the cached copy (appliance blade map and datastore).
// The existing blade's credentials and backend interface are reused for the new session.
//
// Returns the replacement blade on success. On failure returns nil and a *common.RequestError with:
//   - StatusBladeIdDoesNotExist:          bladeId is not in the appliance's blade map
//   - StatusBladeCreateSessionFailure:    the backend session could not be created
//   - StatusBladeDeleteSessionFailure:    blade creation failed AND the new session could not be cleaned up
//   - StatusManagerInitializationFailure: blade creation failed (session cleaned up)
func (a *Appliance) UpdateBladeById(ctx context.Context, bladeId string) (*Blade, error) {
	logger := klog.FromContext(ctx)
	logger.V(4).Info(">>>>>> UpdateBlade: ", "applianceId", a.Id, "bladeId", bladeId)

	// query for blade
	blade, ok := a.Blades[bladeId]
	if !ok {
		// Argument order must follow the format string: appliance id first, then blade id.
		newErr := fmt.Errorf("appliance [%s] blade [%s] not found during update by id", a.Id, bladeId)
		logger.Error(newErr, "failure: update blade by id")

		return nil, &common.RequestError{StatusCode: common.StatusBladeIdDoesNotExist, Err: newErr}
	}

	// Reuse the connection details and backend interface of the blade being replaced
	creds := blade.creds
	ops := blade.backendOps

	req := backend.CreateSessionRequest{
		Ip:       creds.IpAddress,
		Port:     creds.Port,
		Username: creds.Username,
		Password: creds.Password,
		Insecure: creds.Insecure,
		Protocol: creds.Protocol,
	}

	settings := backend.ConfigurationSettings{}

	// Create a new session
	response, err := ops.CreateSession(ctx, &settings, &req)
	if err == nil && response == nil {
		// Give the wrap below a real cause instead of formatting a nil error with %w.
		err = fmt.Errorf("backend returned no create session response")
	}
	if err != nil {
		newErr := fmt.Errorf("create session failure at [%s:%d] using interface [%s]: %w", creds.IpAddress, creds.Port, ops.GetBackendInfo(ctx).BackendName, err)
		logger.Error(newErr, "failure: update blade by id")
		return nil, &common.RequestError{StatusCode: common.StatusBladeCreateSessionFailure, Err: newErr}
	}

	// Create the new Blade
	r := RequestNewBlade{
		BladeId:     bladeId,
		ApplianceId: a.Id,
		Ip:          creds.IpAddress,
		Status:      common.ONLINE,
		Port:        uint16(creds.Port),
		BackendOps:  ops,
		Creds:       creds,
	}

	updatedBlade, err := NewBlade(ctx, &r)
	if err == nil && updatedBlade == nil {
		// Same reasoning as above: never wrap a nil error.
		err = fmt.Errorf("no blade object returned")
	}
	if err != nil {
		// Blade creation failed: try to remove the session that was just created.
		delReq := backend.DeleteSessionRequest{}
		delResp, delErr := ops.DeleteSession(ctx, &settings, &delReq)
		if delErr != nil || delResp == nil {
			// Report both failures: delErr is why cleanup failed, err (wrapped) is why the blade could not be built.
			newErr := fmt.Errorf("failed to delete session [%s:%d] after failed blade [%s] object creation (delete session error: %v): %w", creds.IpAddress, creds.Port, bladeId, delErr, err)
			logger.Error(newErr, "failure: update blade by id")
			return nil, &common.RequestError{StatusCode: common.StatusBladeDeleteSessionFailure, Err: newErr}
		}

		newErr := fmt.Errorf("appliance [%s] new blade object creation failure: %w", a.Id, err)
		logger.Error(newErr, "failure: update blade by id")
		return nil, &common.RequestError{StatusCode: common.StatusManagerInitializationFailure, Err: newErr}
	}

	// Replace blade in appliance
	a.Blades[blade.Id] = updatedBlade

	// Replace blade in datastore
	// NOTE(review): the second return value of GetApplianceDatumById is ignored here, as it is elsewhere in this file - confirm the appliance datum always exists at this point.
	applianceDatum, _ := datastore.DStore().GetDataStore().GetApplianceDatumById(a.Id)
	applianceDatum.DeleteBladeDatumById(blade.Id)
	applianceDatum.AddBladeDatum(creds)
	datastore.DStore().Store()

	logger.V(2).Info("success: update blade", "bladeId", updatedBlade.Id, "applianceId", a.Id)

	return updatedBlade, nil
}

func (a *Appliance) DeleteAllBlades(ctx context.Context) {
logger := klog.FromContext(ctx)
logger.V(4).Info(">>>>>> DeleteAllBlades: ", "applianceId", a.Id)
Expand All @@ -180,15 +256,33 @@ func (a *Appliance) DeleteBladeById(ctx context.Context, bladeId string) (*Blade
logger := klog.FromContext(ctx)
logger.V(4).Info(">>>>>> DeleteBladeById: ", "bladeId", bladeId, "applianceId", a.Id)

// Step 1: delete the blade's backend session.
blade, err := a.DeleteBladeByIdBackend(ctx, bladeId)
if err != nil || blade == nil {
	// Currently, backend ALWAYS deletes the blade session from the backend map. Do the same in the this (manager) layer
	// Log the requested bladeId, not blade.Id: blade is nil when the id is unknown to the appliance,
	// and dereferencing it here would panic.
	logger.V(2).Info("force complete appliance blade deletion after backend session failure", "bladeId", bladeId, "applianceId", a.Id)
	a.DeleteBladeByIdManager(ctx, bladeId)

	// blade may still be non-nil here (backend session failure); it is returned for the caller's use.
	return blade, err
}

// Step 2: remove the blade from the manager layer (appliance blade map and datastore).
a.DeleteBladeByIdManager(ctx, bladeId)

logger.V(2).Info("success: delete blade by id", "bladeId", blade.Id, "applianceId", a.Id)

return blade, nil
}

// DeleteBladeByIdBackend: Delete the blade from backend only
func (a *Appliance) DeleteBladeByIdBackend(ctx context.Context, bladeId string) (*Blade, error) {
logger := klog.FromContext(ctx)
logger.V(4).Info(">>>>>> DeleteBladeBackendById: ", "bladeId", bladeId, "applianceId", a.Id)

// query for blade
blade, ok := a.Blades[bladeId]
if !ok {
logger.V(2).Info("blade not found during delete:", "bladeId", bladeId, "applianceId", a.Id)
newErr := fmt.Errorf("blade [%s] not found during delete", bladeId)

logger.V(2).Info("force complete appliance blade deletion after error", "bladeId", blade.Id, "applianceId", a.Id)
a.deleteBlade(bladeId)

return nil, &common.RequestError{StatusCode: common.StatusBladeIdDoesNotExist, Err: newErr}
}

Expand All @@ -202,22 +296,30 @@ func (a *Appliance) DeleteBladeById(ctx context.Context, bladeId string) (*Blade
response, err := ops.DeleteSession(ctx, &settings, &req)
if err != nil || response == nil {
newErr := fmt.Errorf("failed to delete blade [%s] backend [%s] session [%s]: %w", blade.Id, ops.GetBackendInfo(ctx).BackendName, blade.Socket.String(), err)
logger.Error(newErr, "failure: delete blade by id")

// Currently, backend ALWAYS deletes the blade session from the backend map. Do the same in the this (manager) layer
logger.V(2).Info("force complete appliance blade deletion after backend session failure", "bladeId", blade.Id, "applianceId", a.Id)
a.deleteBlade(bladeId)
logger.Error(newErr, "failure: delete blade by id (backend)")

return blade, &common.RequestError{StatusCode: common.StatusBladeDeleteSessionFailure, Err: newErr} // Still return the blade for recovery
}

a.deleteBlade(bladeId)

logger.V(2).Info("success: delete blade by id", "bladeId", blade.Id, "applianceId", a.Id)
logger.V(2).Info("success: delete blade by id (backend)", "bladeId", blade.Id, "applianceId", a.Id)

return blade, nil
}

// DeleteBladeByIdManager: Remove the blade from the manager layer only, i.e. the appliance's
// blade map and the datastore. No backend operation is invoked from here.
func (a *Appliance) DeleteBladeByIdManager(ctx context.Context, bladeId string) {
	klog.FromContext(ctx).V(4).Info(">>>>>> DeleteBladeByIdManager: ", "bladeId", bladeId, "applianceId", a.Id)

	// Forget the blade in the appliance's in-memory blade map
	delete(a.Blades, bladeId)

	// Remove the blade's record from this appliance's datastore entry, then call Store() on the datastore
	store := datastore.DStore().GetDataStore()
	datum, _ := store.GetApplianceDatumById(a.Id)
	datum.DeleteBladeDatumById(bladeId)
	datastore.DStore().Store()
}

func (a *Appliance) GetAllBladeIds() []string {
var ids []string

Expand All @@ -241,25 +343,23 @@ func (a *Appliance) GetBladeById(ctx context.Context, bladeId string) (*Blade, e
}

// Check for resync
if !blade.CheckSync(ctx) {
logger.V(2).Info("GetBladeById: blade might be out of sync", "bladeId", bladeId)
ok := blade.backendOps.CheckSession(ctx)
if !ok {
if blade.CheckSync(ctx) {
logger.V(2).Info("GetBladeById: blade might be out of sync", "bladeId", bladeId, "applianceId", a.Id)
blade.UpdateConnectionStatusBackend(ctx)
if blade.Status == common.FOUND { // good power, bad session
blade, err = a.ResyncBladeById(ctx, bladeId)
if err != nil {
newErr := fmt.Errorf("failed to resync host(add): host [%s]: %w", bladeId, err)
logger.Error(newErr, "failure: resync host")
newErr := fmt.Errorf("failed to resync blade by id [%s]: %w", bladeId, err)
logger.Error(newErr, "failure: get blade by id")
return nil, &common.RequestError{StatusCode: err.(*common.RequestError).StatusCode, Err: newErr}
} else {
logger.V(2).Info("success: auto resync host", "bladeId", bladeId)
}

logger.V(2).Info("success: auto resync blade", "bladeId", bladeId)
} else {
blade.SetSync(ctx)
}
}

blade.UpdateConnectionStatusBackend(ctx)

logger.V(2).Info("success: get blade by id", "status", blade.Status, "bladeId", blade.Id, "applianceId", a.Id)

return blade, nil
Expand Down Expand Up @@ -346,7 +446,19 @@ func (a *Appliance) ResyncBladeById(ctx context.Context, bladeId string) (*Blade
return nil, &common.RequestError{StatusCode: common.StatusBladeIdDoesNotExist, Err: newErr}
}

blade.UpdateConnectionStatusBackend(ctx)
blade, err := a.DeleteBladeByIdBackend(ctx, bladeId)
if err != nil {
logger.Error(err, "resync blade: ignoring delete blade failure")
}

blade.UpdateConnectionStatusBackend(ctx) // update status here in case of failure during update

blade, err = a.UpdateBladeById(ctx, blade.Id)
if err != nil {
newErr := fmt.Errorf("failed to resync blade(add): appliance [%s] blade [%s]: %w", a.Id, bladeId, err)
logger.Error(newErr, "failure: resync blade")
return nil, &common.RequestError{StatusCode: err.(*common.RequestError).StatusCode, Err: newErr}
}

logger.V(2).Info("success: resync blade", "status", blade.Status, "bladeId", bladeId, "applianceId", a.Id)

Expand All @@ -356,13 +468,3 @@ func (a *Appliance) ResyncBladeById(ctx context.Context, bladeId string) (*Blade
/////////////////////////////////////
//////// Private Functions //////////
/////////////////////////////////////

func (a *Appliance) deleteBlade(bladeId string) {
	// Forget the blade in the appliance's in-memory blade map
	delete(a.Blades, bladeId)

	// Remove the blade's record from this appliance's datastore entry, then call Store() on the datastore
	store := datastore.DStore().GetDataStore()
	datum, _ := store.GetApplianceDatumById(a.Id)
	datum.DeleteBladeDatumById(bladeId)
	datastore.DStore().Store()
}
Loading

0 comments on commit 9e40313

Please sign in to comment.