diff --git a/.github/actions/cloud-slack-e2e/action.yaml b/.github/actions/cloud-slack-e2e/action.yaml
index 16b499f55..c6e37eb70 100644
--- a/.github/actions/cloud-slack-e2e/action.yaml
+++ b/.github/actions/cloud-slack-e2e/action.yaml
@@ -110,6 +110,8 @@ runs:
     - name: Dump cluster state
       if: ${{ failure() }}
       uses: ./.github/actions/dump-cluster
+      with:
+        name: cloud-slack-e2e

     - name: Detect failed jobs
       if: ${{ failure() }}
diff --git a/.github/actions/dump-cluster/action.yaml b/.github/actions/dump-cluster/action.yaml
index 1c8144da9..bbee69a90 100644
--- a/.github/actions/dump-cluster/action.yaml
+++ b/.github/actions/dump-cluster/action.yaml
@@ -1,6 +1,11 @@
 name: Dump cluster state
 description: "Creates an artifacts with cluster dump"

+inputs:
+  name:
+    description: "Cluster name"
+    required: true
+
 runs:
   using: "composite"
   steps:
@@ -15,8 +20,8 @@ runs:
         echo "Dumping cluster info into ${LOGS_DIR}"
         kubectl cluster-info dump --all-namespaces --output-directory="${LOGS_DIR}"
     - name: Upload artifacts
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
       with:
-        name: cluster_dump_${{github.sha}}
+        name: cluster_dump_${{github.sha}}_${{ inputs.name }}
        path: "output"
        retention-days: 5 # Default 90 days
diff --git a/.github/workflows/branch-build.yml b/.github/workflows/branch-build.yml
index 29f1ee18f..bb9f705c7 100644
--- a/.github/workflows/branch-build.yml
+++ b/.github/workflows/branch-build.yml
@@ -194,6 +194,12 @@ jobs:
           KUBECONFIG=$(k3d kubeconfig write ${{ matrix.integration }}-test-cluster) \
           make test-integration-${{ matrix.integration }}

+      - name: Dump cluster state
+        if: ${{ failure() }}
+        uses: ./.github/actions/dump-cluster
+        with:
+          name: ${{ matrix.integration }}
+
       - name: Slack Notification
         uses: rtCamp/action-slack-notify@v2
         if: ${{ failure() }}
@@ -263,7 +269,7 @@ jobs:
           KUBECONFIG=$(k3d kubeconfig write cli-migration-e2e-cluster) make test-cli-migration-e2e

       - name: Upload artifacts
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         if: ${{ always() }}
         with:
           name: screenshots_dump_${{github.sha}}
@@ -273,6 +279,8 @@
       - name: Dump cluster state
         if: ${{ failure() }}
         uses: ./.github/actions/dump-cluster
+        with:
+          name: cli-migration-e2e

       - name: Slack Notification
         uses: rtCamp/action-slack-notify@v2
diff --git a/.github/workflows/pr-build.yaml b/.github/workflows/pr-build.yaml
index 63e7f0c00..638c46671 100644
--- a/.github/workflows/pr-build.yaml
+++ b/.github/workflows/pr-build.yaml
@@ -69,7 +69,7 @@ jobs:
           make save-images

       - name: Upload artifact
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: botkube-${{github.sha}}
           path: ${{ env.IMAGE_SAVE_LOAD_DIR }}
@@ -91,7 +91,7 @@ jobs:
           persist-credentials: false

       - name: Download artifact
-        uses: actions/download-artifact@v2
+        uses: actions/download-artifact@v4
         with:
           name: botkube-${{github.sha}}
           path: ${{ env.IMAGE_SAVE_LOAD_DIR }}
@@ -304,3 +304,9 @@ jobs:
         run: |
           KUBECONFIG=$(k3d kubeconfig write ${{ matrix.integration }}-test-cluster) \
           make test-integration-${{ matrix.integration }}
+
+      - name: Dump cluster state
+        if: ${{ failure() }}
+        uses: ./.github/actions/dump-cluster
+        with:
+          name: ${{ matrix.integration }}
diff --git a/test/cloud-slack-dev-e2e/cloud_slack_dev_e2e_test.go b/test/cloud-slack-dev-e2e/cloud_slack_dev_e2e_test.go
index ca7e1fe41..d9395ab22 100644
--- a/test/cloud-slack-dev-e2e/cloud_slack_dev_e2e_test.go
+++ b/test/cloud-slack-dev-e2e/cloud_slack_dev_e2e_test.go
@@ -11,6 +11,7 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"sync/atomic"
 	"testing"
 	"time"

@@ -88,9 +89,11 @@ func TestCloudSlackE2E(t *testing.T) {
 	require.NoError(t, err)
 	cfg.Slack.Tester.CloudBasedTestEnabled = false // override property used only in the Cloud Slack E2E tests
+	cfg.Slack.Tester.RecentMessagesLimit = 4 // this is used effectively only for the Botkube restarts. There are two of them in a short time window, so it shouldn't be higher than 5.

 	authHeaderValue := ""
-	helmChartUninstalled := false
+	var botkubeDeploymentUninstalled atomic.Bool
+	botkubeDeploymentUninstalled.Store(true) // not yet installed
 	gqlEndpoint := fmt.Sprintf("%s/%s", cfg.BotkubeCloud.APIBaseURL, cfg.BotkubeCloud.APIGraphQLEndpoint)

 	if cfg.ScreenshotsDir != "" {
@@ -236,7 +239,7 @@ func TestCloudSlackE2E(t *testing.T) {
 		go router.Run()
 		defer router.MustStop()

-		t.Log("Ensuring proper organizaton is selected")
+		t.Log("Ensuring proper organization is selected")
 		botkubePage.MustWaitOpen()
 		screenshotIfShould(t, cfg, botkubePage)
 		botkubePage.MustElement("a.logo-link")
@@ -299,6 +302,7 @@ func TestCloudSlackE2E(t *testing.T) {
 			if !cfg.Slack.DisconnectWorkspaceAfterTests {
 				return
 			}
+			t.Log("Disconnecting Slack workspace...")
 			gqlCli.MustDeleteSlackWorkspace(t, cfg.BotkubeCloud.TeamOrganizationID, slackWorkspace.ID)
 		})

@@ -319,18 +323,8 @@ func TestCloudSlackE2E(t *testing.T) {
 		t.Log("Creating deployment...")
 		deployment := gqlCli.MustCreateBasicDeploymentWithCloudSlack(t, channel.Name(), slackWorkspace.TeamID, channel.Name())
 		t.Cleanup(func() {
-			// We have a glitch on backend side and the logic below is a workaround for that.
-			// Tl;dr uninstalling Helm chart reports "DISCONNECTED" status, and deplyment deletion reports "DELETED" status.
-			// If we do these two things too quickly, we'll run into resource version mismatch in repository logic.
-			// Read more here: https://github.com/kubeshop/botkube-cloud/pull/486#issuecomment-1604333794
-
-			for !helmChartUninstalled {
-				t.Log("Waiting for Helm chart uninstallation, in order to proceed with deleting the first deployment...")
-				time.Sleep(1 * time.Second)
-			}
-
-			t.Log("Helm chart uninstalled. Waiting a bit...")
-			time.Sleep(3 * time.Second) // ugly, but at least we will be pretty sure we won't run into the resource version mismatch
+			err := helmx.WaitForUninstallation(context.Background(), t, &botkubeDeploymentUninstalled)
+			assert.NoError(t, err)

 			t.Log("Deleting first deployment...")
 			gqlCli.MustDeleteDeployment(t, graphql.ID(deployment.ID))
@@ -343,6 +337,7 @@ func TestCloudSlackE2E(t *testing.T) {
 			gqlCli.MustDeleteDeployment(t, graphql.ID(deployment2.ID))
 		})

+		botkubeDeploymentUninstalled.Store(false) // about to be installed
 		params := helmx.InstallChartParams{
 			RepoURL:  "https://storage.googleapis.com/botkube-latest-main-charts",
 			RepoName: "botkube",
@@ -353,9 +348,14 @@ func TestCloudSlackE2E(t *testing.T) {
 		}
 		helmInstallCallback := helmx.InstallChart(t, params)
 		t.Cleanup(func() {
-			t.Log("Uninstalling Helm chart...")
-			helmInstallCallback(t)
-			helmChartUninstalled = true
+			if t.Failed() {
+				t.Log("Tests failed, keeping the Botkube instance installed for debugging purposes.")
+			} else {
+				t.Log("Uninstalling Helm chart...")
+				helmInstallCallback(t)
+			}
+
+			botkubeDeploymentUninstalled.Store(true)
 		})

 		t.Log("Waiting for help message...")
@@ -472,9 +472,9 @@ func TestCloudSlackE2E(t *testing.T) {
 	t.Run("Botkube Deployment -> Cloud sync", func(t *testing.T) {
 		t.Log("Disabling notification...")
 		tester.PostMessageToBot(t, channel.Identifier(), "disable notifications")
+		t.Log("Waiting for config reload message...")
 		expectedReloadMsg := fmt.Sprintf(":arrows_counterclockwise: Configuration reload requested for cluster '%s'. Hold on a sec...", deployment.Name)
-
 		err = tester.OnChannel().WaitForMessagePostedRecentlyEqual(tester.BotUserID(), channel.ID(), expectedReloadMsg)
 		require.NoError(t, err)

diff --git a/test/e2e/bots_test.go b/test/e2e/bots_test.go
index 5a68473b7..39140179f 100644
--- a/test/e2e/bots_test.go
+++ b/test/e2e/bots_test.go
@@ -10,10 +10,13 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"sync/atomic"
 	"testing"
 	"time"
 	"unicode"

+	"botkube.io/botube/test/helmx"
+
 	"botkube.io/botube/test/botkubex"
 	"botkube.io/botube/test/commplatform"
 	"botkube.io/botube/test/diff"
@@ -201,7 +204,9 @@ func runBotTest(t *testing.T,
 	deployEnvSecondaryChannelIDName,
 	deployEnvRbacChannelIDName string,
 ) {
-	botkubeDeploymentUninstalled := false
+	var botkubeDeploymentUninstalled atomic.Bool
+	botkubeDeploymentUninstalled.Store(true) // not yet installed
+
 	t.Logf("Creating API client with provided token for %s...", driverType)
 	botDriver, err := newBotDriver(appCfg, driverType)
 	require.NoError(t, err)
@@ -259,22 +264,14 @@ func runBotTest(t *testing.T,
 		gqlCli.MustCreateAlias(t, alias[0], alias[1], alias[2], deployment.ID)
 	}
 	t.Cleanup(func() {
-		// We have a glitch on backend side and the logic below is a workaround for that.
-		// Tl;dr uninstalling Helm chart reports "DISCONNECTED" status, and deployment deletion reports "DELETED" status.
-		// If we do these two things too quickly, we'll run into resource version mismatch in repository logic.
-		// Read more here: https://github.com/kubeshop/botkube-cloud/pull/486#issuecomment-1604333794
-		for !botkubeDeploymentUninstalled {
-			t.Log("Waiting for Helm chart uninstallation, in order to proceed with deleting Botkube Cloud instance...")
-			time.Sleep(1 * time.Second)
-		}
-
-		t.Log("Helm chart uninstalled. Waiting a bit...")
-		time.Sleep(3 * time.Second) // ugly, but at least we will be pretty sure we won't run into the resource version mismatch
+		err := helmx.WaitForUninstallation(context.Background(), t, &botkubeDeploymentUninstalled)
+		assert.NoError(t, err)

 		t.Log("Deleting Botkube Cloud instance...")
 		gqlCli.MustDeleteDeployment(t, graphql.ID(deployment.ID))
 	})

+	botkubeDeploymentUninstalled.Store(false) // about to be installed
 	err = botkubex.Install(t, botkubex.InstallParams{
 		BinaryPath:        appCfg.ConfigProvider.BotkubeCliBinaryPath,
 		HelmRepoDirectory: appCfg.ConfigProvider.HelmRepoDirectory,
@@ -289,9 +286,14 @@ func runBotTest(t *testing.T,
 	})
 	require.NoError(t, err)
 	t.Cleanup(func() {
-		t.Log("Uninstalling Helm chart...")
-		botkubex.Uninstall(t, appCfg.ConfigProvider.BotkubeCliBinaryPath)
-		botkubeDeploymentUninstalled = true
+		if t.Failed() {
+			t.Log("Tests failed, keeping the Botkube instance installed for debugging purposes.")
+		} else {
+			t.Log("Uninstalling Helm chart...")
+			botkubex.Uninstall(t, appCfg.ConfigProvider.BotkubeCliBinaryPath)
+		}
+
+		botkubeDeploymentUninstalled.Store(true)
 	})
 }

@@ -838,10 +840,13 @@ func runBotTest(t *testing.T,
 		expectedMessage = fmt.Sprintf("%s\n%s", cmdHeader(command), expectedBody)

 		botDriver.PostMessageToBot(t, botDriver.SecondChannel().Identifier(), command)
-
 		err = botDriver.WaitForMessagePosted(botDriver.BotUserID(), botDriver.SecondChannel().ID(), limitMessages(), botDriver.AssertEquals(expectedMessage))
 		require.NoError(t, err)

+		if botDriver.Type().IsCloud() {
+			waitForRestart(t, botDriver, botDriver.BotUserID(), botDriver.FirstChannel().ID(), appCfg.ClusterName)
+		}
+
 		cfgMapCli := k8sCli.CoreV1().ConfigMaps(appCfg.Deployment.Namespace)

 		t.Log("Creating ConfigMap...")
@@ -947,12 +952,12 @@ func runBotTest(t *testing.T,
 		expectedMessage = fmt.Sprintf("%s\n%s", cmdHeader(command), expectedBody)

 		botDriver.PostMessageToBot(t, botDriver.FirstChannel().Identifier(), command)
+		err = botDriver.WaitForMessagePosted(botDriver.BotUserID(), botDriver.FirstChannel().ID(), limitMessages(), botDriver.AssertEquals(expectedMessage))
+		assert.NoError(t, err)
 		if botDriver.Type().IsCloud() {
 			waitForRestart(t, botDriver, botDriver.BotUserID(), botDriver.FirstChannel().ID(), appCfg.ClusterName)
 		}
-		err = botDriver.WaitForMessagePosted(botDriver.BotUserID(), botDriver.FirstChannel().ID(), limitMessages(), botDriver.AssertEquals(expectedMessage))
-		assert.NoError(t, err)

 		t.Log("Getting notifier status from second channel...")
 		command = "status notifications"
@@ -1078,8 +1083,7 @@ func runBotTest(t *testing.T,

 		limitMessagesNo := 2
 		if botDriver.Type().IsCloud() {
-			// There are 2 config reload requested after second cm update
-			limitMessagesNo = 2 * limitLastMessageAfterCloudReload
+			limitMessagesNo = limitLastMessageAfterCloudReload
 		}
 		err = botDriver.OnChannel().WaitForMessagePostedWithAttachment(botDriver.BotUserID(), botDriver.SecondChannel().ID(), limitMessagesNo, secondCMUpdate)
 		require.NoError(t, err)
@@ -1682,16 +1686,11 @@ func crashConfigMapSourcePlugin(t *testing.T, cfgMapCli corev1.ConfigMapInterfac
 }

 func waitForRestart(t *testing.T, tester commplatform.BotDriver, userID, channel, clusterName string) {
-	t.Log("Waiting for restart...")
+	t.Logf("Waiting for restart (timestamp: %s)...", time.Now().Format(time.DateTime))

 	originalTimeout := tester.Timeout()
-	tester.SetTimeout(90 * time.Second)
-	if tester.Type() == commplatform.TeamsBot {
-		tester.SetTimeout(120 * time.Second)
-	}
-
-	// 2, since time to time latest message becomes upgrade message right after begin message
+	tester.SetTimeout(120 * time.Second)

 	expMsg := fmt.Sprintf("My watch begins for cluster '%s'! :crossed_swords:", clusterName)
-
 	assertFn := tester.AssertEquals(expMsg)
 	if tester.Type() == commplatform.TeamsBot {
 		// Teams sends JSON (Adaptive Card), so we cannot do equal assertion
 		expMsg = fmt.Sprintf("My watch begins for cluster '%s'!", clusterName)
@@ -1700,14 +1699,16 @@ func waitForRestart(t *testing.T, tester commplatform.BotDriver, userID, channel
 		}
 	}

+	// 2, since from time to time latest message becomes upgrade message right after begin message
 	err := tester.OnChannel().WaitForMessagePosted(userID, channel, 2, assertFn)
-	if err != nil && tester.Type() == commplatform.TeamsBot {
-		// TODO(https://github.com/kubeshop/botkube-cloud/issues/854): for some reason, Teams restarts are not deterministic and sometimes it doesn't happen
-		// We should add fetching Agent logs to see why it happens.
-		t.Logf("⚠️ Teams communication platform didn't restart on time: %v", err)
-	} else {
-		assert.NoError(t, err)
-	}
+	assert.NoError(t, err)
+
+	t.Logf("Detected a successful restart (timestamp: %s).", time.Now().Format(time.DateTime))
+
+	t.Logf("Waiting a bit longer just to make sure Botkube connects to the Cloud Router...")
+	// Yes, it's ugly but "My watch begins..." doesn't really mean the Slack/Teams gRPC connection has been established.
+	// So we wait a bit longer to avoid a race condition.
+	time.Sleep(3 * time.Second)

 	tester.SetTimeout(originalTimeout)
 }
diff --git a/test/helmx/helm_helpers.go b/test/helmx/helm_helpers.go
index 9b6c59802..828764e58 100644
--- a/test/helmx/helm_helpers.go
+++ b/test/helmx/helm_helpers.go
@@ -1,12 +1,17 @@
 package helmx

 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"os/exec"
 	"regexp"
 	"strings"
+	"sync/atomic"
 	"testing"
+	"time"
+
+	"k8s.io/apimachinery/pkg/util/wait"

 	"github.com/stretchr/testify/require"
 )
@@ -108,3 +113,35 @@ func redactAPIKey(in []string) []string {
 	}
 	return dst
 }
+
+const (
+	uninstallPollInterval = 1 * time.Second
+	uninstallTimeout      = 30 * time.Second
+)
+
+// WaitForUninstallation waits until a Helm chart is uninstalled, based on the atomic value.
+// It's a workaround for the Helm chart uninstallation issue.
+// We have a glitch on backend side and the logic below is a workaround for that.
+// Tl;dr uninstalling Helm chart reports "DISCONNECTED" status, and deployment deletion reports "DELETED" status.
+// If we do these two things too quickly, we'll run into resource version mismatch in repository logic.
+// Read more here: https://github.com/kubeshop/botkube-cloud/pull/486#issuecomment-1604333794
+func WaitForUninstallation(ctx context.Context, t *testing.T, alreadyUninstalled *atomic.Bool) error {
+	t.Helper()
+	t.Log("Waiting for Helm chart uninstallation, in order to proceed with deleting Botkube Cloud instance...")
+	err := wait.PollUntilContextTimeout(ctx, uninstallPollInterval, uninstallTimeout, false, func(ctx context.Context) (done bool, err error) {
+		return alreadyUninstalled.Load(), nil
+	})
+	waitInterrupted := wait.Interrupted(err)
+	if err != nil && !waitInterrupted {
+		return err
+	}
+
+	if waitInterrupted {
+		t.Log("Waiting for Helm chart uninstallation timed out. Proceeding with deleting other resources...")
+		return nil
+	}
+
+	t.Log("Waiting a bit more...")
+	time.Sleep(3 * time.Second) // ugly, but at least we will be pretty sure we won't run into the resource version mismatch
+	return nil
+}
diff --git a/test/helmx/redact_test.go b/test/helmx/redact_test.go
index 14701cfbe..34064d8d1 100644
--- a/test/helmx/redact_test.go
+++ b/test/helmx/redact_test.go
@@ -7,7 +7,6 @@ import (
 )

 func TestRedactAPIKey(t *testing.T) {
-
 	tests := []struct {
 		name  string
 		input []string
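
For reference, below is a minimal sketch of how the new helmx.WaitForUninstallation helper and the atomic.Bool flag coordinate the two LIFO t.Cleanup callbacks used in bots_test.go and cloud_slack_dev_e2e_test.go. The import path, the function signature, and the flag handshake are taken from the diff above; the test name and the placeholder install/uninstall/delete steps are hypothetical and only illustrate the pattern, they are not part of this PR.

package example_test

import (
	"context"
	"sync/atomic"
	"testing"

	"botkube.io/botube/test/helmx"
)

// TestCleanupOrdering is a hypothetical sketch of the cleanup-ordering pattern.
func TestCleanupOrdering(t *testing.T) {
	var botkubeDeploymentUninstalled atomic.Bool
	botkubeDeploymentUninstalled.Store(true) // not yet installed

	// Registered first, so it runs last (cleanups are LIFO): wait until the chart is gone
	// (plus the grace period inside WaitForUninstallation), then delete the cloud resources.
	t.Cleanup(func() {
		if err := helmx.WaitForUninstallation(context.Background(), t, &botkubeDeploymentUninstalled); err != nil {
			t.Errorf("waiting for Helm chart uninstallation: %v", err)
		}
		// ... delete the Botkube Cloud deployment here (placeholder) ...
	})

	botkubeDeploymentUninstalled.Store(false) // about to be installed
	// ... install the Helm chart here (placeholder) ...

	// Registered last, so it runs first: uninstall the chart unless the test failed
	// (keeping the instance for debugging), then signal completion through the flag.
	t.Cleanup(func() {
		if !t.Failed() {
			// ... uninstall the Helm chart here (placeholder) ...
		}
		botkubeDeploymentUninstalled.Store(true)
	})
}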