Improve resilience of E2E and integration tests #1406

Merged: 8 commits, Mar 11, 2024
2 changes: 2 additions & 0 deletions .github/actions/cloud-slack-e2e/action.yaml
@@ -110,6 +110,8 @@ runs:
- name: Dump cluster state
if: ${{ failure() }}
uses: ./.github/actions/dump-cluster
with:
name: cloud-slack-e2e

- name: Detect failed jobs
if: ${{ failure() }}
9 changes: 7 additions & 2 deletions .github/actions/dump-cluster/action.yaml
@@ -1,6 +1,11 @@
name: Dump cluster state
description: "Creates an artifacts with cluster dump"

inputs:
name:
description: "Cluster name"
required: true

runs:
using: "composite"
steps:
@@ -15,8 +20,8 @@ runs:
echo "Dumping cluster info into ${LOGS_DIR}"
kubectl cluster-info dump --all-namespaces --output-directory="${LOGS_DIR}"
- name: Upload artifacts
- uses: actions/upload-artifact@v2
+ uses: actions/upload-artifact@v4
with:
- name: cluster_dump_${{github.sha}}
+ name: cluster_dump_${{github.sha}}_${{ inputs.name }}
path: "output"
retention-days: 5 # Default 90 days
10 changes: 9 additions & 1 deletion .github/workflows/branch-build.yml
@@ -194,6 +194,12 @@ jobs:
KUBECONFIG=$(k3d kubeconfig write ${{ matrix.integration }}-test-cluster) \
make test-integration-${{ matrix.integration }}

- name: Dump cluster state
if: ${{ failure() }}
uses: ./.github/actions/dump-cluster
with:
name: ${{ matrix.integration }}

- name: Slack Notification
uses: rtCamp/action-slack-notify@v2
if: ${{ failure() }}
@@ -263,7 +269,7 @@ jobs:
KUBECONFIG=$(k3d kubeconfig write cli-migration-e2e-cluster) make test-cli-migration-e2e

- name: Upload artifacts
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
if: ${{ always() }}
with:
name: screenshots_dump_${{github.sha}}
@@ -273,6 +279,8 @@
- name: Dump cluster state
if: ${{ failure() }}
uses: ./.github/actions/dump-cluster
with:
name: cli-migration-e2e

- name: Slack Notification
uses: rtCamp/action-slack-notify@v2
10 changes: 8 additions & 2 deletions .github/workflows/pr-build.yaml
@@ -69,7 +69,7 @@ jobs:
make save-images

- name: Upload artifact
- uses: actions/upload-artifact@v2
+ uses: actions/upload-artifact@v4
with:
name: botkube-${{github.sha}}
path: ${{ env.IMAGE_SAVE_LOAD_DIR }}
@@ -91,7 +91,7 @@
persist-credentials: false

- name: Download artifact
- uses: actions/download-artifact@v2
+ uses: actions/download-artifact@v4
with:
name: botkube-${{github.sha}}
path: ${{ env.IMAGE_SAVE_LOAD_DIR }}
@@ -304,3 +304,9 @@ jobs:
run: |
KUBECONFIG=$(k3d kubeconfig write ${{ matrix.integration }}-test-cluster) \
make test-integration-${{ matrix.integration }}

- name: Dump cluster state
if: ${{ failure() }}
uses: ./.github/actions/dump-cluster
with:
name: ${{ matrix.integration }}
36 changes: 18 additions & 18 deletions test/cloud-slack-dev-e2e/cloud_slack_dev_e2e_test.go
pkosiec marked this conversation as resolved.
@@ -11,6 +11,7 @@ import (
"os"
"path/filepath"
"strings"
"sync/atomic"
"testing"
"time"

@@ -88,9 +89,11 @@ func TestCloudSlackE2E(t *testing.T) {
require.NoError(t, err)

cfg.Slack.Tester.CloudBasedTestEnabled = false // override property used only in the Cloud Slack E2E tests
cfg.Slack.Tester.RecentMessagesLimit = 4 // this is used effectively only for the Botkube restarts. There are two of them in a short time window, so it shouldn't be higher than 5.

authHeaderValue := ""
- helmChartUninstalled := false
+ var botkubeDeploymentUninstalled atomic.Bool
+ botkubeDeploymentUninstalled.Store(true) // not yet installed
gqlEndpoint := fmt.Sprintf("%s/%s", cfg.BotkubeCloud.APIBaseURL, cfg.BotkubeCloud.APIGraphQLEndpoint)

if cfg.ScreenshotsDir != "" {
@@ -236,7 +239,7 @@ func TestCloudSlackE2E(t *testing.T) {
go router.Run()
defer router.MustStop()

t.Log("Ensuring proper organizaton is selected")
t.Log("Ensuring proper organization is selected")
botkubePage.MustWaitOpen()
screenshotIfShould(t, cfg, botkubePage)
botkubePage.MustElement("a.logo-link")
@@ -299,6 +302,7 @@ func TestCloudSlackE2E(t *testing.T) {
if !cfg.Slack.DisconnectWorkspaceAfterTests {
return
}
t.Log("Disconnecting Slack workspace...")
gqlCli.MustDeleteSlackWorkspace(t, cfg.BotkubeCloud.TeamOrganizationID, slackWorkspace.ID)
})

@@ -319,18 +323,8 @@ func TestCloudSlackE2E(t *testing.T) {
t.Log("Creating deployment...")
deployment := gqlCli.MustCreateBasicDeploymentWithCloudSlack(t, channel.Name(), slackWorkspace.TeamID, channel.Name())
t.Cleanup(func() {
- // We have a glitch on backend side and the logic below is a workaround for that.
- // Tl;dr uninstalling Helm chart reports "DISCONNECTED" status, and deplyment deletion reports "DELETED" status.
- // If we do these two things too quickly, we'll run into resource version mismatch in repository logic.
- // Read more here: https://github.com/kubeshop/botkube-cloud/pull/486#issuecomment-1604333794
-
- for !helmChartUninstalled {
- t.Log("Waiting for Helm chart uninstallation, in order to proceed with deleting the first deployment...")
- time.Sleep(1 * time.Second)
- }
-
- t.Log("Helm chart uninstalled. Waiting a bit...")
- time.Sleep(3 * time.Second) // ugly, but at least we will be pretty sure we won't run into the resource version mismatch
+ err := helmx.WaitForUninstallation(context.Background(), t, &botkubeDeploymentUninstalled)
+ assert.NoError(t, err)

t.Log("Deleting first deployment...")
gqlCli.MustDeleteDeployment(t, graphql.ID(deployment.ID))
@@ -343,6 +337,7 @@ func TestCloudSlackE2E(t *testing.T) {
gqlCli.MustDeleteDeployment(t, graphql.ID(deployment2.ID))
})

botkubeDeploymentUninstalled.Store(false) // about to be installed
params := helmx.InstallChartParams{
RepoURL: "https://storage.googleapis.com/botkube-latest-main-charts",
RepoName: "botkube",
@@ -353,9 +348,14 @@
}
helmInstallCallback := helmx.InstallChart(t, params)
t.Cleanup(func() {
t.Log("Uninstalling Helm chart...")
helmInstallCallback(t)
helmChartUninstalled = true
if t.Failed() {
t.Log("Tests failed, keeping the Botkube instance installed for debugging purposes.")
} else {
t.Log("Uninstalling Helm chart...")
helmInstallCallback(t)
}

botkubeDeploymentUninstalled.Store(true)
})

t.Log("Waiting for help message...")
@@ -472,9 +472,9 @@ func TestCloudSlackE2E(t *testing.T) {
t.Run("Botkube Deployment -> Cloud sync", func(t *testing.T) {
t.Log("Disabling notification...")
tester.PostMessageToBot(t, channel.Identifier(), "disable notifications")

t.Log("Waiting for config reload message...")
expectedReloadMsg := fmt.Sprintf(":arrows_counterclockwise: Configuration reload requested for cluster '%s'. Hold on a sec...", deployment.Name)

err = tester.OnChannel().WaitForMessagePostedRecentlyEqual(tester.BotUserID(), channel.ID(), expectedReloadMsg)
require.NoError(t, err)

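Both test suites now keep the Botkube instance installed when the run fails, so the cluster can be inspected afterwards. The sketch below condenses that cleanup pattern into a single helper; keepOnFailure is a hypothetical name (the PR inlines this logic in each t.Cleanup), shown here only to make the control flow explicit:

package e2e

import (
	"sync/atomic"
	"testing"
)

// keepOnFailure returns a cleanup that skips uninstallation when the test has
// failed, leaving the environment up for debugging. Hypothetical helper; both
// test files in this PR inline the same logic directly in t.Cleanup.
func keepOnFailure(t *testing.T, uninstall func(t *testing.T), uninstalled *atomic.Bool) func() {
	return func() {
		if t.Failed() {
			t.Log("Tests failed, keeping the Botkube instance installed for debugging purposes.")
		} else {
			t.Log("Uninstalling Helm chart...")
			uninstall(t)
		}
		// Flip the flag either way so the deployment-deletion cleanup
		// (registered earlier, hence running later) does not poll until timeout.
		uninstalled.Store(true)
	}
}

Wired in, this would read t.Cleanup(keepOnFailure(t, helmInstallCallback, &botkubeDeploymentUninstalled)).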
69 changes: 35 additions & 34 deletions test/e2e/bots_test.go
@@ -10,10 +10,13 @@ import (
"regexp"
"strconv"
"strings"
"sync/atomic"
"testing"
"time"
"unicode"

"botkube.io/botube/test/helmx"

"botkube.io/botube/test/botkubex"
"botkube.io/botube/test/commplatform"
"botkube.io/botube/test/diff"
@@ -201,7 +204,9 @@ func runBotTest(t *testing.T,
deployEnvSecondaryChannelIDName,
deployEnvRbacChannelIDName string,
) {
- botkubeDeploymentUninstalled := false
+ var botkubeDeploymentUninstalled atomic.Bool
+ botkubeDeploymentUninstalled.Store(true) // not yet installed

t.Logf("Creating API client with provided token for %s...", driverType)
botDriver, err := newBotDriver(appCfg, driverType)
require.NoError(t, err)
@@ -259,22 +264,14 @@
gqlCli.MustCreateAlias(t, alias[0], alias[1], alias[2], deployment.ID)
}
t.Cleanup(func() {
- // We have a glitch on backend side and the logic below is a workaround for that.
- // Tl;dr uninstalling Helm chart reports "DISCONNECTED" status, and deployment deletion reports "DELETED" status.
- // If we do these two things too quickly, we'll run into resource version mismatch in repository logic.
- // Read more here: https://github.com/kubeshop/botkube-cloud/pull/486#issuecomment-1604333794
- for !botkubeDeploymentUninstalled {
- t.Log("Waiting for Helm chart uninstallation, in order to proceed with deleting Botkube Cloud instance...")
- time.Sleep(1 * time.Second)
- }
-
- t.Log("Helm chart uninstalled. Waiting a bit...")
- time.Sleep(3 * time.Second) // ugly, but at least we will be pretty sure we won't run into the resource version mismatch
+ err := helmx.WaitForUninstallation(context.Background(), t, &botkubeDeploymentUninstalled)
+ assert.NoError(t, err)

t.Log("Deleting Botkube Cloud instance...")
gqlCli.MustDeleteDeployment(t, graphql.ID(deployment.ID))
})

botkubeDeploymentUninstalled.Store(false) // about to be installed
err = botkubex.Install(t, botkubex.InstallParams{
BinaryPath: appCfg.ConfigProvider.BotkubeCliBinaryPath,
HelmRepoDirectory: appCfg.ConfigProvider.HelmRepoDirectory,
@@ -289,9 +286,14 @@
})
require.NoError(t, err)
t.Cleanup(func() {
t.Log("Uninstalling Helm chart...")
botkubex.Uninstall(t, appCfg.ConfigProvider.BotkubeCliBinaryPath)
botkubeDeploymentUninstalled = true
if t.Failed() {
t.Log("Tests failed, keeping the Botkube instance installed for debugging purposes.")
} else {
t.Log("Uninstalling Helm chart...")
botkubex.Uninstall(t, appCfg.ConfigProvider.BotkubeCliBinaryPath)
}

botkubeDeploymentUninstalled.Store(true)
})
}

@@ -838,10 +840,13 @@ func runBotTest(t *testing.T,
expectedMessage = fmt.Sprintf("%s\n%s", cmdHeader(command), expectedBody)

botDriver.PostMessageToBot(t, botDriver.SecondChannel().Identifier(), command)

err = botDriver.WaitForMessagePosted(botDriver.BotUserID(), botDriver.SecondChannel().ID(), limitMessages(), botDriver.AssertEquals(expectedMessage))
require.NoError(t, err)

if botDriver.Type().IsCloud() {
waitForRestart(t, botDriver, botDriver.BotUserID(), botDriver.FirstChannel().ID(), appCfg.ClusterName)
}

cfgMapCli := k8sCli.CoreV1().ConfigMaps(appCfg.Deployment.Namespace)

t.Log("Creating ConfigMap...")
@@ -947,12 +952,12 @@ func runBotTest(t *testing.T,
expectedMessage = fmt.Sprintf("%s\n%s", cmdHeader(command), expectedBody)

botDriver.PostMessageToBot(t, botDriver.FirstChannel().Identifier(), command)
- err = botDriver.WaitForMessagePosted(botDriver.BotUserID(), botDriver.FirstChannel().ID(), limitMessages(), botDriver.AssertEquals(expectedMessage))
- assert.NoError(t, err)

if botDriver.Type().IsCloud() {
waitForRestart(t, botDriver, botDriver.BotUserID(), botDriver.FirstChannel().ID(), appCfg.ClusterName)
}
+ err = botDriver.WaitForMessagePosted(botDriver.BotUserID(), botDriver.FirstChannel().ID(), limitMessages(), botDriver.AssertEquals(expectedMessage))
+ assert.NoError(t, err)

t.Log("Getting notifier status from second channel...")
command = "status notifications"
@@ -1078,8 +1083,7 @@ func runBotTest(t *testing.T,

limitMessagesNo := 2
if botDriver.Type().IsCloud() {
- // There are 2 config reload requested after second cm update
- limitMessagesNo = 2 * limitLastMessageAfterCloudReload
+ limitMessagesNo = limitLastMessageAfterCloudReload
}
err = botDriver.OnChannel().WaitForMessagePostedWithAttachment(botDriver.BotUserID(), botDriver.SecondChannel().ID(), limitMessagesNo, secondCMUpdate)
require.NoError(t, err)
@@ -1682,16 +1686,11 @@ crashConfigMapSourcePlugin(t *testing.T, cfgMapCli corev1.ConfigMapInterfac
}

func waitForRestart(t *testing.T, tester commplatform.BotDriver, userID, channel, clusterName string) {
t.Log("Waiting for restart...")
t.Logf("Waiting for restart (timestamp: %s)...", time.Now().Format(time.DateTime))

originalTimeout := tester.Timeout()
- tester.SetTimeout(90 * time.Second)
- if tester.Type() == commplatform.TeamsBot {
- tester.SetTimeout(120 * time.Second)
- }
- // 2, since time to time latest message becomes upgrade message right after begin message
+ tester.SetTimeout(120 * time.Second)
expMsg := fmt.Sprintf("My watch begins for cluster '%s'! :crossed_swords:", clusterName)

assertFn := tester.AssertEquals(expMsg)
if tester.Type() == commplatform.TeamsBot { // Teams sends JSON (Adaptive Card), so we cannot do equal assertion
expMsg = fmt.Sprintf("My watch begins for cluster '%s'!", clusterName)
@@ -1700,14 +1699,16 @@ func waitForRestart(t *testing.T, tester commplatform.BotDriver, userID, channel
}
}

// 2, since from time to time latest message becomes upgrade message right after begin message
err := tester.OnChannel().WaitForMessagePosted(userID, channel, 2, assertFn)
if err != nil && tester.Type() == commplatform.TeamsBot {
// TODO(https://github.com/kubeshop/botkube-cloud/issues/854): for some reason, Teams restarts are not deterministic and sometimes it doesn't happen
// We should add fetching Agent logs to see why it happens.
t.Logf("⚠️ Teams communication platform didn't restart on time: %v", err)
} else {
assert.NoError(t, err)
}
assert.NoError(t, err)

t.Logf("Detected a successful restart (timestamp: %s).", time.Now().Format(time.DateTime))

t.Logf("Waiting a bit longer just to make sure Botkube connects to the Cloud Router...")
// Yes, it's ugly but "My watch begins..." doesn't really mean the Slack/Teams gRPC connection has been established.
// So we wait a bit longer to avoid a race condition.
time.Sleep(3 * time.Second)

tester.SetTimeout(originalTimeout)
}
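Note the ordering change in the hunks above: for cloud drivers, the test now waits for the Agent restart before asserting the command response, rather than after. A condensed sketch of the call-site pattern, assembled from the surrounding diff (not standalone; it assumes the test scope of runBotTest):

// A config-changing command makes the cloud deployment restart the Agent.
// Wait for the "My watch begins..." message first, otherwise the response
// assertion races with the restart and reads a stale message window.
botDriver.PostMessageToBot(t, botDriver.FirstChannel().Identifier(), command)
if botDriver.Type().IsCloud() {
	waitForRestart(t, botDriver, botDriver.BotUserID(), botDriver.FirstChannel().ID(), appCfg.ClusterName)
}
err = botDriver.WaitForMessagePosted(botDriver.BotUserID(), botDriver.FirstChannel().ID(), limitMessages(), botDriver.AssertEquals(expectedMessage))
assert.NoError(t, err)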
37 changes: 37 additions & 0 deletions test/helmx/helm_helpers.go
@@ -1,12 +1,17 @@
package helmx

import (
"context"
"encoding/json"
"fmt"
"os/exec"
"regexp"
"strings"
"sync/atomic"
"testing"
"time"

"k8s.io/apimachinery/pkg/util/wait"

"github.com/stretchr/testify/require"
)
@@ -108,3 +113,35 @@ func redactAPIKey(in []string) []string {
}
return dst
}

const (
uninstallPollInterval = 1 * time.Second
uninstallTimeout = 30 * time.Second
)

// WaitForUninstallation waits until a Helm chart is uninstalled, based on the atomic value.
// It's a workaround for the Helm chart uninstallation issue.
// We have a glitch on backend side and the logic below is a workaround for that.
// Tl;dr uninstalling Helm chart reports "DISCONNECTED" status, and deployment deletion reports "DELETED" status.
// If we do these two things too quickly, we'll run into resource version mismatch in repository logic.
// Read more here: https://github.com/kubeshop/botkube-cloud/pull/486#issuecomment-1604333794
func WaitForUninstallation(ctx context.Context, t *testing.T, alreadyUninstalled *atomic.Bool) error {
t.Helper()
t.Log("Waiting for Helm chart uninstallation, in order to proceed with deleting Botkube Cloud instance...")
err := wait.PollUntilContextTimeout(ctx, uninstallPollInterval, uninstallTimeout, false, func(ctx context.Context) (done bool, err error) {
return alreadyUninstalled.Load(), nil
})
waitInterrupted := wait.Interrupted(err)
if err != nil && !waitInterrupted {
return err
}

if waitInterrupted {
t.Log("Waiting for Helm chart uninstallation timed out. Proceeding with deleting other resources...")
return nil
}

t.Log("Waiting a bit more...")
time.Sleep(3 * time.Second) // ugly, but at least we will be pretty sure we won't run into the resource version mismatch
return nil
}
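WaitForUninstallation leans on the fact that t.Cleanup callbacks run in LIFO order: the cleanup that deletes the Botkube Cloud instance is registered before the chart is installed, so it runs after the uninstall cleanup and polls the flag that cleanup sets. A minimal sketch of the wiring, assuming the surrounding test scope shown earlier in this PR:

var botkubeDeploymentUninstalled atomic.Bool
botkubeDeploymentUninstalled.Store(true) // not yet installed

// Registered FIRST, so it runs LAST: block until the chart is gone before
// deleting the deployment, avoiding the backend resource version mismatch.
t.Cleanup(func() {
	err := helmx.WaitForUninstallation(context.Background(), t, &botkubeDeploymentUninstalled)
	assert.NoError(t, err)
	gqlCli.MustDeleteDeployment(t, graphql.ID(deployment.ID))
})

botkubeDeploymentUninstalled.Store(false) // about to be installed
helmInstallCallback := helmx.InstallChart(t, params)

// Registered SECOND, so it runs FIRST: uninstall (unless the test failed)
// and flip the flag the earlier cleanup is polling.
t.Cleanup(func() {
	if !t.Failed() {
		helmInstallCallback(t)
	}
	botkubeDeploymentUninstalled.Store(true)
})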