CSPL-2344 Upgrade Strategy for SHC #1197

Closed
wants to merge 20 commits into from
2 changes: 1 addition & 1 deletion .env
@@ -5,5 +5,5 @@ AWSCLI_URL=https://awscli.amazonaws.com/awscli-exe-linux-x86_64-2.8.6.zip
KUBECTL_VERSION=v1.25.3
AZ_CLI_VERSION=2.30.0
EKSCTL_VERSION=v0.143.0
EKS_CLUSTER_K8_VERSION=1.22
EKS_CLUSTER_K8_VERSION=1.26
SPLUNK_ENTERPRISE_RELEASE_IMAGE=docker.io/splunk/splunk:9.0.5
4 changes: 2 additions & 2 deletions .github/workflows/build-test-push-workflow.yml
@@ -248,7 +248,7 @@ jobs:
kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml
- name: install k8s dashboard
run: |
kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.5/aio/deploy/recommended.yaml
kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.5/aio/deploy/recommended.yaml
- name: Setup Kustomize
run: |
sudo snap install kustomize
@@ -274,7 +274,7 @@ jobs:
run: |
make cleanup
make clean
- name: Cleanup up EKS cluster
- name: Cleanup up EKS cluster FIXME
if: ${{ always() }}
run: |
make cluster-down
2 changes: 1 addition & 1 deletion pkg/splunk/enterprise/clustermanager_test.go
@@ -867,7 +867,7 @@ func TestAppFrameworkApplyClusterManagerShouldNotFail(t *testing.T) {
}
}

func TestApplyCLusterManagerDeletion(t *testing.T) {
func TestApplyClusterManagerDeletion(t *testing.T) {
ctx := context.TODO()
cm := enterpriseApi.ClusterManager{
ObjectMeta: metav1.ObjectMeta{
9 changes: 9 additions & 0 deletions pkg/splunk/enterprise/monitoringconsole.go
@@ -149,6 +149,15 @@ func ApplyMonitoringConsole(ctx context.Context, client splcommon.ControllerClie
if cr.Status.Phase == enterpriseApi.PhaseReady {
finalResult := handleAppFrameworkActivity(ctx, client, cr, &cr.Status.AppContext, &cr.Spec.AppFrameworkConfig)
result = *finalResult

// TODO: CSPL-2434 Search Head Clusters restarting with change in annotation
// Fix the change-annotation logic or find an alternative (e.g. a state machine); right now the search head deployer
// starts terminating with this, and a few replicas do not come up properly after that

// err = changeSearchHeadAnnotations(ctx, client, cr)
// if err != nil {
// return result, err
// }
}
// RequeueAfter if greater than 0, tells the Controller to requeue the reconcile key after the Duration.
// Implies that Requeue is true, there is no need to set Requeue to true at the same time as RequeueAfter.
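
For context on the disabled call above, the following is a heavily hedged sketch -- not from this PR, and not the existing changeSearchHeadAnnotations implementation -- of the kind of change-detection guard the TODO hints at: write the annotation only when its value actually differs, so a reconcile with an unchanged image does not push an update that can cascade into a deployer restart. The annotation key and helper name are assumptions; the snippet presumes it sits alongside the code above in package enterprise, where splcommon and enterpriseApi are already imported.

// setImageAnnotationIfChanged is a hypothetical guard: it updates an image
// annotation on a SearchHeadCluster only when the value would actually change,
// skipping no-op writes that could otherwise trigger a rollout.
func setImageAnnotationIfChanged(ctx context.Context, c splcommon.ControllerClient, shc *enterpriseApi.SearchHeadCluster, image string) error {
	const key = "splunk/image" // assumed annotation key, for illustration only
	annotations := shc.GetAnnotations()
	if annotations == nil {
		annotations = map[string]string{}
	}
	if annotations[key] == image {
		return nil // value unchanged; skip the update entirely
	}
	annotations[key] = image
	shc.SetAnnotations(annotations)
	return c.Update(ctx, shc)
}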
76 changes: 74 additions & 2 deletions pkg/splunk/enterprise/searchheadcluster.go
@@ -32,6 +32,7 @@ import (
splutil "github.com/splunk/splunk-operator/pkg/splunk/util"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/remotecommand"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -160,6 +161,11 @@ func ApplySearchHeadCluster(ctx context.Context, client splcommon.ControllerClie
return result, err
}

continueReconcile, err := isSearchHeadReadyForUpgrade(ctx, client, cr)
if err != nil || !continueReconcile {
return result, err
}

deployerManager := splctrl.DefaultStatefulSetPodManager{}
phase, err := deployerManager.Update(ctx, client, statefulSet, 1)
if err != nil {
@@ -179,7 +185,7 @@ func ApplySearchHeadCluster(ctx context.Context, client splcommon.ControllerClie
return result, err
}

mgr := newSerachHeadClusterPodManager(client, scopedLog, cr, namespaceScopedSecret, splclient.NewSplunkClient)
mgr := newSearchHeadClusterPodManager(client, scopedLog, cr, namespaceScopedSecret, splclient.NewSplunkClient)
phase, err = mgr.Update(ctx, client, statefulSet, cr.Spec.Replicas)
if err != nil {
return result, err
@@ -247,7 +253,7 @@ type searchHeadClusterPodManager struct {
}

// newSearchHeadClusterPodManager creates a pod manager; defined as a variable so unit tests can mock it
var newSerachHeadClusterPodManager = func(client splcommon.ControllerClient, log logr.Logger, cr *enterpriseApi.SearchHeadCluster, secret *corev1.Secret, newSplunkClient NewSplunkClientFunc) searchHeadClusterPodManager {
var newSearchHeadClusterPodManager = func(client splcommon.ControllerClient, log logr.Logger, cr *enterpriseApi.SearchHeadCluster, secret *corev1.Secret, newSplunkClient NewSplunkClientFunc) searchHeadClusterPodManager {
return searchHeadClusterPodManager{
log: log,
cr: cr,
@@ -667,3 +673,69 @@ func getSearchHeadClusterList(ctx context.Context, c splcommon.ControllerClient,

return objectList, nil
}

// isSearchHeadReadyForUpgrade checks whether the SearchHeadCluster can proceed when a version upgrade is in progress.
// It is a no-op otherwise and returns (bool, error) accordingly.
func isSearchHeadReadyForUpgrade(ctx context.Context, c splcommon.ControllerClient, cr *enterpriseApi.SearchHeadCluster) (bool, error) {
reqLogger := log.FromContext(ctx)
scopedLog := reqLogger.WithName("isSearchHeadReadyForUpgrade").WithValues("name", cr.GetName(), "namespace", cr.GetNamespace())
eventPublisher, _ := newK8EventPublisher(c, cr)

// check if a MonitoringConsole is attached to the instance
monitoringConsoleRef := cr.Spec.MonitoringConsoleRef
if monitoringConsoleRef.Name == "" {
return true, nil
}

namespacedName := types.NamespacedName{
Namespace: cr.GetNamespace(),
Name: GetSplunkStatefulsetName(SplunkSearchHead, cr.GetName()),
}

// check if the stateful set is created at this instance
statefulSet := &appsv1.StatefulSet{}
err := c.Get(ctx, namespacedName, statefulSet)
if err != nil {
if k8serrors.IsNotFound(err) {
return true, nil
}
eventPublisher.Warning(ctx, "isSearchHeadReadyForUpgrade", fmt.Sprintf("Could not find the Search Head stateful set. Reason %v", err))
scopedLog.Error(err, "Unable to get Stateful Set")
return false, err
}

namespacedName = types.NamespacedName{Namespace: cr.GetNamespace(), Name: monitoringConsoleRef.Name}
monitoringConsole := &enterpriseApi.MonitoringConsole{}

// get the monitoring console referred in search head cluster
err = c.Get(ctx, namespacedName, monitoringConsole)
if err != nil {
if k8serrors.IsNotFound(err) {
return true, nil
}
eventPublisher.Warning(ctx, "isSearchHeadReadyForUpgrade", fmt.Sprintf("Could not find the Monitoring Console. Reason %v", err))
scopedLog.Error(err, "Unable to get Monitoring Console")
return false, err
}

mcImage, err := getCurrentImage(ctx, c, monitoringConsole, SplunkMonitoringConsole)
if err != nil {
eventPublisher.Warning(ctx, "isSearchHeadReadyForUpgrade", fmt.Sprintf("Could not get the Monitoring Console Image. Reason %v", err))
scopedLog.Error(err, "Unable to get Monitoring Console current image")
return false, err
}

shcImage, err := getCurrentImage(ctx, c, cr, SplunkSearchHead)
if err != nil {
eventPublisher.Warning(ctx, "isSearchHeadReadyForUpgrade", fmt.Sprintf("Could not get the Search Head Image. Reason %v", err))
scopedLog.Error(err, "Unable to get Search Head current image")
return false, err
}

// check if an image upgrade is happening and whether the SearchHeadCluster is ready for the upgrade
if (cr.Spec.Image != shcImage) && (monitoringConsole.Status.Phase != enterpriseApi.PhaseReady || mcImage != cr.Spec.Image) {
return false, nil
}

return true, nil
}
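
To make the gate easier to follow, here is a minimal self-contained sketch that distills the final condition in isSearchHeadReadyForUpgrade: a Search Head Cluster image change is only allowed once the referenced Monitoring Console is Ready and already runs the target image. The helper name, phase constant, and image strings are illustrative only, not part of the PR.

package main

import "fmt"

// phaseReady stands in for enterpriseApi.PhaseReady (assumed value for this sketch).
const phaseReady = "Ready"

// shcUpgradeAllowed mirrors the gate: allow the upgrade when no image change is
// requested, or when the Monitoring Console is Ready and already on the new image.
func shcUpgradeAllowed(desiredImage, currentSHCImage, mcImage, mcPhase string) bool {
	upgradeRequested := desiredImage != currentSHCImage
	mcUpgraded := mcPhase == phaseReady && mcImage == desiredImage
	return !upgradeRequested || mcUpgraded
}

func main() {
	// Monitoring Console still on the old image: hold the SHC upgrade back.
	fmt.Println(shcUpgradeAllowed("splunk/splunk:9.0.5", "splunk/splunk:9.0.0", "splunk/splunk:9.0.0", phaseReady)) // false
	// Monitoring Console already upgraded and Ready: let the SHC proceed.
	fmt.Println(shcUpgradeAllowed("splunk/splunk:9.0.5", "splunk/splunk:9.0.0", "splunk/splunk:9.0.5", phaseReady)) // true
}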
90 changes: 89 additions & 1 deletion pkg/splunk/enterprise/searchheadcluster_test.go
@@ -1436,7 +1436,7 @@ func TestSearchHeadClusterWithReadyState(t *testing.T) {
}

// mock new search pod manager
newSerachHeadClusterPodManager = func(client splcommon.ControllerClient, log logr.Logger, cr *enterpriseApi.SearchHeadCluster, secret *corev1.Secret, newSplunkClient NewSplunkClientFunc) searchHeadClusterPodManager {
newSearchHeadClusterPodManager = func(client splcommon.ControllerClient, log logr.Logger, cr *enterpriseApi.SearchHeadCluster, secret *corev1.Secret, newSplunkClient NewSplunkClientFunc) searchHeadClusterPodManager {
return searchHeadClusterPodManager{
log: log,
cr: cr,
@@ -1873,3 +1873,91 @@ func TestSearchHeadClusterWithReadyState(t *testing.T) {
debug.PrintStack()
}
}

func TestIsSearchHeadReadyForUpgrade(t *testing.T) {
ctx := context.TODO()

builder := fake.NewClientBuilder()
client := builder.Build()
utilruntime.Must(enterpriseApi.AddToScheme(clientgoscheme.Scheme))

// Create Monitoring Console
mc := enterpriseApi.MonitoringConsole{
ObjectMeta: metav1.ObjectMeta{
Name: "test",
Namespace: "test",
},
Spec: enterpriseApi.MonitoringConsoleSpec{
CommonSplunkSpec: enterpriseApi.CommonSplunkSpec{
Spec: enterpriseApi.Spec{
ImagePullPolicy: "Always",
Image: "splunk/splunk:latest",
},
Volumes: []corev1.Volume{},
},
},
}

err := client.Create(ctx, &mc)
_, err = ApplyMonitoringConsole(ctx, client, &mc)
if err != nil {
t.Errorf("applyMonitoringConsole should not have returned error; err=%v", err)
}
mc.Status.Phase = enterpriseApi.PhaseReady
err = client.Status().Update(ctx, &mc)
if err != nil {
t.Errorf("Unexpected status update %v", err)
debug.PrintStack()
}

// Create Search Head Cluster
shc := enterpriseApi.SearchHeadCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "test",
Namespace: "test",
},
Spec: enterpriseApi.SearchHeadClusterSpec{
CommonSplunkSpec: enterpriseApi.CommonSplunkSpec{
Spec: enterpriseApi.Spec{
ImagePullPolicy: "Always",
Image: "splunk/splunk:latest",
},
Volumes: []corev1.Volume{},
MonitoringConsoleRef: corev1.ObjectReference{
Name: "test",
},
},
Replicas: int32(3),
},
}

err = client.Create(ctx, &shc)
_, err = ApplySearchHeadCluster(ctx, client, &shc)
if err != nil {
t.Errorf("applySearchHeadCluster should not have returned error; err=%v", err)
}

mc.Spec.Image = "splunk2"
shc.Spec.Image = "splunk2"
_, err = ApplyMonitoringConsole(ctx, client, &mc)

searchHeadCluster := &enterpriseApi.SearchHeadCluster{}
namespacedName := types.NamespacedName{
Name: shc.Name,
Namespace: shc.Namespace,
}
err = client.Get(ctx, namespacedName, searchHeadCluster)
if err != nil {
t.Errorf("Get Search Head Cluster should not have returned error=%v", err)
}

check, err := isSearchHeadReadyForUpgrade(ctx, client, searchHeadCluster)

if err != nil {
t.Errorf("Unexpected upgradeScenario error %v", err)
}

if !check {
t.Errorf("isSearchHeadReadyForUpgrade: SHC should be ready for upgrade")
}
}
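
The test above only covers the positive path, where the Monitoring Console and the Search Head Cluster move to the new image together. Below is a hedged continuation sketch that could be appended inside TestIsSearchHeadReadyForUpgrade; it assumes getCurrentImage reports the image recorded on the corresponding StatefulSet, so bumping only the fetched SHC spec should make the gate hold the upgrade back.

	// Continuation sketch (hypothetical): only the SHC spec moves to a newer image
	// while the Monitoring Console stays behind, so the gate should report "not ready".
	searchHeadCluster.Spec.Image = "splunk3"

	check, err = isSearchHeadReadyForUpgrade(ctx, client, searchHeadCluster)
	if err != nil {
		t.Errorf("Unexpected isSearchHeadReadyForUpgrade error %v", err)
	}
	if check {
		t.Errorf("isSearchHeadReadyForUpgrade: SHC should not be ready for upgrade while the Monitoring Console image lags behind")
	}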
36 changes: 34 additions & 2 deletions test/deploy-eks-cluster.sh
@@ -16,8 +16,8 @@ if [[ -z "${ECR_REPOSITORY}" ]]; then
fi

if [[ -z "${EKS_CLUSTER_K8_VERSION}" ]]; then
echo "EKS_CLUSTER_K8_VERSION not set. Changing to 1.22"
export EKS_CLUSTER_K8_VERSION="1.22"
echo "EKS_CLUSTER_K8_VERSION not set. Changing to 1.26"
export EKS_CLUSTER_K8_VERSION="1.26"
fi

function deleteCluster() {
@@ -35,6 +35,8 @@ function deleteCluster() {
echo "Unable to delete cluster - ${TEST_CLUSTER_NAME}"
return 1
fi
rolename=$(echo ${TEST_CLUSTER_NAME} | awk -F- '{print "EBS_" $(NF-1) "_" $(NF)}')
aws iam delete-role --role-name ${rolename}

return 0
}
@@ -54,6 +56,36 @@ function createCluster() {
echo "Unable to create cluster - ${TEST_CLUSTER_NAME}"
return 1
fi
eksctl utils associate-iam-oidc-provider --cluster=${TEST_CLUSTER_NAME} --approve
oidc_id=$(aws eks describe-cluster --name ${TEST_CLUSTER_NAME} --query "cluster.identity.oidc.issuer" --output text | cut -d '/' -f 5)
account_id=$(aws sts get-caller-identity --query "Account" --output text)
oidc_provider=$(aws eks describe-cluster --name ${TEST_CLUSTER_NAME} --region "us-west-2" --query "cluster.identity.oidc.issuer" --output text | sed -e "s/^https:\/\///")
namespace=kube-system
service_account=ebs-csi-controller-sa
kubectl create serviceaccount ${service_account} --namespace ${namespace}
echo "{
\"Version\": \"2012-10-17\",
\"Statement\": [
{
\"Effect\": \"Allow\",
\"Principal\": {
\"Federated\": \"arn:aws:iam::$account_id:oidc-provider/$oidc_provider\"
},
\"Action\": \"sts:AssumeRoleWithWebIdentity\",
\"Condition\": {
\"StringEquals\": {
\"$oidc_provider:aud\": \"sts.amazonaws.com\",
\"$oidc_provider:sub\": \"system:serviceaccount:$namespace:$service_account\"
}
}
}
]
}" >aws-ebs-csi-driver-trust-policy.json
rolename=$(echo ${TEST_CLUSTER_NAME} | awk -F- '{print "EBS_" $(NF-1) "_" $(NF)}')
aws iam create-role --role-name ${rolename} --assume-role-policy-document file://aws-ebs-csi-driver-trust-policy.json --description "irsa role for ${TEST_CLUSTER_NAME}"
aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy --role-name ${rolename}
kubectl annotate serviceaccount -n $namespace $service_account eks.amazonaws.com/role-arn=arn:aws:iam::$account_id:role/${rolename}
eksctl create addon --name aws-ebs-csi-driver --cluster ${TEST_CLUSTER_NAME} --service-account-role-arn arn:aws:iam::$account_id:role/${rolename} --force
else
echo "Retrieving kubeconfig for ${TEST_CLUSTER_NAME}"
# Cluster exists but kubeconfig may not
2 changes: 1 addition & 1 deletion test/env.sh
@@ -11,7 +11,7 @@
: "${ECR_REGISTRY:=}"
: "${VPC_PUBLIC_SUBNET_STRING:=}"
: "${VPC_PRIVATE_SUBNET_STRING:=}"
: "${EKS_CLUSTER_K8_VERSION:=1.22}"
: "${EKS_CLUSTER_K8_VERSION:=1.26}"
# Below env variables required to run license master test cases
: "${ENTERPRISE_LICENSE_S3_PATH:=}"
: "${TEST_S3_BUCKET:=}"