chore: support leave member by using any pods #7890
```diff
@@ -618,9 +618,11 @@ func (r *componentWorkloadOps) leaveMember4ScaleIn() error {
 	// TODO: Move memberLeave to the ITS controller. Instead of performing a switchover, we can directly scale down the non-leader nodes. This is because the pod ordinal is not guaranteed to be continuous.
 	podsToMemberLeave := make([]*corev1.Pod, 0)
+	desiredPods := make([]*corev1.Pod, 0)
 	for _, pod := range pods {
 		// if the pod not exists in the generated pod names, it should be a member that needs to leave
 		if _, ok := r.desiredCompPodNameSet[pod.Name]; ok {
+			desiredPods = append(desiredPods, pod)
 			continue
 		}
 		podsToMemberLeave = append(podsToMemberLeave, pod)
```
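The hunk above partitions the component's pods into those that remain (`desiredPods`) and those that must leave (`podsToMemberLeave`). A minimal, self-contained sketch of that partitioning step, using plain pod names and a hypothetical helper instead of the controller's inline loop over `[]*corev1.Pod`:

```go
package main

import "fmt"

// partitionPods splits pod names into those present in desired (members
// that stay) and the rest (members that must leave the cluster).
// Hypothetical stand-alone helper; the real controller does this inline.
func partitionPods(pods []string, desired map[string]struct{}) (keep, leave []string) {
	for _, name := range pods {
		if _, ok := desired[name]; ok {
			keep = append(keep, name)
			continue
		}
		leave = append(leave, name)
	}
	return keep, leave
}

func main() {
	desired := map[string]struct{}{"demo-mysql-0": {}, "demo-mysql-1": {}}
	keep, leave := partitionPods([]string{"demo-mysql-0", "demo-mysql-1", "demo-mysql-2"}, desired)
	fmt.Println("keep:", keep, "leave:", leave) // keep: [demo-mysql-0 demo-mysql-1] leave: [demo-mysql-2]
}
```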
```diff
@@ -644,12 +646,18 @@ func (r *componentWorkloadOps) leaveMember4ScaleIn() error {
 			return switchoverErr
 		}
 
-		if err2 := lorryCli.LeaveMember(r.reqCtx.Ctx); err2 != nil {
+		if err2 := lorryCli.LeaveMember(r.reqCtx.Ctx, nil); err2 != nil {
 			// For the purpose of upgrade compatibility, if the version of Lorry is 0.7 and
 			// the version of KB is upgraded to 0.8 or newer, lorry client will return an NotImplemented error,
 			// in this case, here just ignore it.
 			if err2 == lorry.NotImplemented {
 				r.reqCtx.Log.Info("lorry leave member api is not implemented")
+			} else if unableToConnect(err2) {
+				r.reqCtx.Log.Info(fmt.Sprintf("when leaving pod %s by lorry, can not connect lorry on pod %s, try to leave member by other pods", pod.Name, pod.Name))
+				err3 := r.leaveMemberByOtherPods(desiredPods, pod)
+				if err == nil {
+					err = err3
+				}
 			} else if err == nil {
 				err = err2
 			}
```

Review comment on lines +649 to +660: re-org
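One possible shape for the re-organization the reviewer asks for: the NotImplemented/unreachable classification is duplicated between this call site and `leaveMemberByOtherPods` below, so it could be lifted into a single helper. A hypothetical sketch only; the helper name, the outcome type, and the stand-in for `lorry.NotImplemented` are assumptions, not the PR's code:

```go
package main

import (
	"errors"
	"fmt"
	"strings"
)

// Stand-in for lorry.NotImplemented; the real value lives in the lorry package.
var errNotImplemented = errors.New("not implemented")

// Same substring check the PR adds; see the diff above.
func unableToConnect(err error) bool {
	return err != nil && strings.Contains(err.Error(), "i/o timeout")
}

// leaveOutcome tells a call site how to react to a LeaveMember error.
type leaveOutcome int

const (
	leaveOK             leaveOutcome = iota // success, or an old Lorry without the API: ignore
	leaveRetryElsewhere                     // lorry unreachable: fall back to another pod
	leaveFailed                             // genuine failure: surface the error
)

// classifyLeaveErr centralizes the classification that the PR currently
// repeats in leaveMember4ScaleIn and leaveMemberByOtherPods.
func classifyLeaveErr(err error) leaveOutcome {
	switch {
	case err == nil, errors.Is(err, errNotImplemented):
		return leaveOK
	case unableToConnect(err):
		return leaveRetryElsewhere
	default:
		return leaveFailed
	}
}

func main() {
	fmt.Println(classifyLeaveErr(errors.New("dial tcp 10.0.0.5:3501: i/o timeout"))) // 1 (retry elsewhere)
}
```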
```diff
@@ -658,6 +666,50 @@ func (r *componentWorkloadOps) leaveMember4ScaleIn() error {
 	return err // TODO: use requeue-after
 }
 
+func unableToConnect(err error) bool {
+	if err == nil {
+		return false
+	}
+	if strings.Contains(err.Error(), "i/o timeout") {
+		return true
+	}
+	return false
+}
+
+// Try to leave `podToLeave` by pods in `desiredPods`,
+// if any error occurs not due to `unableToConnect` to pods in `desiredPods`, return it immediately.
+func (r *componentWorkloadOps) leaveMemberByOtherPods(desiredPods []*corev1.Pod, podToLeave *corev1.Pod) error {
+	parameters := make(map[string]any)
+	parameters["podName"] = podToLeave.Spec.Hostname
+
+	for _, pod := range desiredPods {
+		lorryCli, err1 := lorry.NewClient(*pod)
+		if err1 != nil {
+			return fmt.Errorf("error when leaveMemberByOtherPods NewClient pod %v: %v", pod.Name, err1)
+		}
+
+		if intctrlutil.IsNil(lorryCli) {
+			// no lorry in the pod
+			continue
+		}
+
+		if err2 := lorryCli.LeaveMember(r.reqCtx.Ctx, parameters); err2 != nil {
+			// For the purpose of upgrade compatibility, if the version of Lorry is 0.7 and
+			// the version of KB is upgraded to 0.8 or newer, lorry client will return an NotImplemented error,
+			// in this case, here just ignore it.
+			if err2 == lorry.NotImplemented {
+				r.reqCtx.Log.Info("lorry leave member api is not implemented")
+			} else if unableToConnect(err2) {
+				r.reqCtx.Log.Info(fmt.Sprintf("leaveMemberByOtherPods: can not connect lorry on pod %s", pod.Name))
+			} else {
+				return fmt.Errorf("error when leaveMemberByOtherPods LeaveMember, try to leave pod %v on pod %v: %v", podToLeave.Name, pod.Name, err2)
+			}
+		}
+		return nil
+	}
+	return fmt.Errorf("leaveMemberByOtherPods: try to leave pod %v by other pods fail", podToLeave.Name)
+}
+
 func (r *componentWorkloadOps) deletePVCs4ScaleIn(itsObj *workloads.InstanceSet) error {
 	graphCli := model.NewGraphClient(r.cli)
 	for _, podName := range r.runningItsPodNames {
```

Review comment (on `lorryCli, err1 := lorry.NewClient(*pod)`): It should follow the API definition, otherwise there will be risks in executing the action on other pods.
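Note that `unableToConnect` matches on the literal substring "i/o timeout", which is brittle: it misses connection-refused errors (a pod whose Lorry container is down) and depends on error-message wording. A hedged alternative using the standard library's structural error checks; a sketch, not the PR's code, and the exact set of errors worth treating as "unreachable" is an assumption:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"net"
	"syscall"
)

// unableToConnect reports whether err looks like "the remote Lorry is
// unreachable" rather than "Lorry answered with a real failure".
func unableToConnect(err error) bool {
	if err == nil {
		return false
	}
	var netErr net.Error
	switch {
	case errors.As(err, &netErr) && netErr.Timeout():
		return true // what the substring check catches, but structurally
	case errors.Is(err, syscall.ECONNREFUSED):
		return true // pod is up but nothing listens on the Lorry port
	case errors.Is(err, context.DeadlineExceeded):
		return true // request-scoped timeout from the caller's context
	}
	return false
}

func main() {
	// 192.0.2.1 is TEST-NET-1; the 1ns timeout below guarantees a dial timeout.
	_, err := net.DialTimeout("tcp", "192.0.2.1:3501", 1)
	fmt.Println(unableToConnect(err)) // true
}
```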
```diff
@@ -86,21 +86,28 @@ func (s *Leave) Do(ctx context.Context, req *operations.OpsRequest) (*operations
 		return nil, err
 	}
 
-	currentMember := cluster.GetMemberWithName(manager.GetCurrentMemberName())
-	if !cluster.HaConfig.IsDeleting(currentMember) {
-		cluster.HaConfig.AddMemberToDelete(currentMember)
+	var memberNameToLeave string
+	if req.Parameters != nil && req.GetString("podName") != "" {
+		memberNameToLeave = req.GetString("podName")
+	} else {
+		memberNameToLeave = manager.GetCurrentMemberName()
+	}
+
+	memberToLeave := cluster.GetMemberWithName(memberNameToLeave)
+	if !cluster.HaConfig.IsDeleting(memberToLeave) {
+		cluster.HaConfig.AddMemberToDelete(memberToLeave)
 		_ = s.dcsStore.UpdateHaConfig()
 	}
 
-	// remove current member from db cluster
-	err = manager.LeaveMemberFromCluster(ctx, cluster, manager.GetCurrentMemberName())
+	// remove member from db cluster, the member may be other pod, depending on if podName is assigned in req.Parameters
+	err = manager.LeaveMemberFromCluster(ctx, cluster, memberNameToLeave)
 
 	if err != nil {
 		s.logger.Error(err, "Leave member from cluster failed")
 		return nil, err
 	}
 
-	if cluster.HaConfig.IsDeleting(currentMember) {
-		cluster.HaConfig.FinishDeleted(currentMember)
+	if cluster.HaConfig.IsDeleting(memberToLeave) {
+		cluster.HaConfig.FinishDeleted(memberToLeave)
 		_ = s.dcsStore.UpdateHaConfig()
 	}
```

Review comment (on `manager.LeaveMemberFromCluster(ctx, cluster, memberNameToLeave)`): This change in the API can be reflected in the DBManager interface.
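A hypothetical reading of that suggestion: make "leave an arbitrary member" explicit in the manager contract itself, rather than leaving it implicit in a `podName` request parameter. The sketch below is an illustration with assumed names, not KubeBlocks' actual `DBManager` definition:

```go
package main

import "context"

// Cluster stands in for the engine-agnostic cluster view that real
// managers receive; sketch only, not the KubeBlocks definition.
type Cluster struct{ /* members, leader, HA config, ... */ }

// DBManager here is a hypothetical slice of the manager contract that
// makes it explicit a leave request may target any named member.
type DBManager interface {
	CurrentMemberName() string
	// LeaveMemberFromCluster removes memberName from the database
	// cluster; memberName may name a peer pod, not just the member
	// this Lorry instance runs beside.
	LeaveMemberFromCluster(ctx context.Context, cluster *Cluster, memberName string) error
}

func main() { var _ DBManager }
```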
Review comment: When a pod is pending, new lorry clients may encounter issues. This situation can also be resolved here.
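One way to address that inside `leaveMemberByOtherPods` would be to skip pods that cannot serve Lorry requests yet, before constructing a client for them. A sketch under the assumption that a pod is unreachable until it is Running with an assigned IP; `corev1` is `k8s.io/api/core/v1`, and the helper name is hypothetical:

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// lorryReachable reports whether building a Lorry client for this pod can
// succeed at all: a Pending pod has no IP yet, so a new client would only
// time out and burn a retry.
func lorryReachable(pod *corev1.Pod) bool {
	return pod.Status.Phase == corev1.PodRunning && pod.Status.PodIP != ""
}

func main() {
	pending := &corev1.Pod{Status: corev1.PodStatus{Phase: corev1.PodPending}}
	fmt.Println(lorryReachable(pending)) // false: skip it in leaveMemberByOtherPods
}
```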