100 Nodes Scale Test (scale-100) #113
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: 100 Nodes Scale Test (scale-100)

on:
  # Minute 39 of hour 0, Monday through Friday (GitHub evaluates cron
  # schedules in UTC).
  schedule:
    - cron: '39 0 * * 1-5'

  # On-demand runs against a pull request. Scheduled runs carry none of
  # these inputs, which is why the jobs fall back to github.sha.
  workflow_dispatch:
    inputs:
      PR-number:
        description: "Pull request number."
        required: true
      context-ref:
        description: "Context in which the workflow runs. If PR is from a fork, will be the PR target branch (general case). If PR is NOT from a fork, will be the PR branch itself (this allows committers to test changes to workflows directly from PRs)."
        required: true
      SHA:
        description: "SHA under test (head of the PR branch)."
        required: true
      extra-args:
        description: "[JSON object] Arbitrary arguments passed from the trigger comment via regex capture group. Parse with 'fromJson(inputs.extra-args).argName' in workflow."
        required: false
        default: '{}'

  # For testing uncomment following lines:
  # push:
  #   branches:
  #     - your_branch_name
# Token scopes granted to every job in this workflow.
permissions:
  # To be able to access the repository with actions/checkout
  contents: read
  # To be able to request the JWT from GitHub's OIDC provider
  id-token: write
  # To allow retrieving information from the PR API
  pull-requests: read
  # To be able to set commit status
  statuses: write
concurrency:
  # The group name is built from three parts:
  # - Workflow name
  # - Event type
  # - A unique identifier depending on event type:
  #   - schedule: SHA
  #   - workflow_dispatch: PR number
  #
  # Each kind of run therefore gets its own group, and a re-run of the same
  # kind cancels the run that is still in progress.
  group: |
    ${{ github.workflow }}
    ${{ github.event_name }}
    ${{
      (github.event_name == 'schedule' && github.sha) ||
      (github.event_name == 'workflow_dispatch' && github.event.inputs.PR-number)
    }}
  cancel-in-progress: true
env:
  # renovate: datasource=golang-version depName=go
  go_version: 1.23.1
  # test_name and cluster_name are joined and suffixed with ".k8s.local" when
  # the job builds the full cluster name: kops has stricter DNS naming
  # requirements, and ending the name in k8s.local makes kops happy.
  test_name: scale-100
  cluster_name: ${{ github.run_id }}-${{ github.run_attempt }}
  # renovate: datasource=docker depName=google/cloud-sdk
  gcloud_version: 494.0.0
jobs:
  # Prints the workflow_dispatch inputs to the job log; skipped for scheduled
  # runs, which have no inputs.
  echo-inputs:
    if: ${{ github.event_name == 'workflow_dispatch' }}
    name: Echo Workflow Dispatch Inputs
    runs-on: ubuntu-24.04
    steps:
      - name: Echo Workflow Dispatch Inputs
        env:
          # The inputs are handed to the script through the environment
          # instead of being expanded inline: an inline expansion is pasted
          # into the script text before the shell parses it, so an input
          # containing a single quote would terminate the quoting and run
          # as shell code.
          DISPATCH_INPUTS: ${{ toJSON(inputs) }}
        run: |
          printf '%s\n' "${DISPATCH_INPUTS}"
# Marks the commit under test as soon as the workflow starts.
commit-status-start:
  name: Commit Status Start
  runs-on: ubuntu-latest
  steps:
    - name: Set initial commit status
      uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
      with:
        # Dispatch runs target the SHA input; otherwise fall back to the
        # commit the workflow itself runs on.
        # NOTE(review): no `status` is passed here, so the action's default
        # applies - presumably "pending"; confirm against the action README.
        sha: ${{ inputs.SHA || github.sha }}
# Main job: provisions a cluster, installs Cilium, runs ClusterLoader2 and
# exports the results.
install-and-scaletest:
  name: Install and Scale Test
  runs-on: ubuntu-latest
  timeout-minutes: 150
  steps:
    # Checks out the context ref (or the triggering commit when no input is
    # given), so the local actions used below come from that ref.
    - name: Checkout context ref (trusted)
      uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
      with:
        ref: ${{ inputs.context-ref || github.sha }}
        persist-credentials: false

    # Local composite action from the checkout above.
    # NOTE(review): presumably the source of env.QUAY_ORGANIZATION_DEV,
    # env.CILIUM_CLI_* and env.GCP_PERF_RESULTS_BUCKET used later - confirm.
    - name: Set Environment Variables
      uses: ./.github/actions/set-env-variables
# Computes the values shared by later steps and publishes them as step
# outputs: the SHA under test, the full cluster name, and the default
# argument string for `cilium install`.
- name: Set up job variables
  id: vars
  env:
    # Passed through the environment rather than expanded inline, so the
    # content of the dispatch input is never parsed as shell syntax.
    DISPATCH_SHA: ${{ inputs.SHA }}
  run: |
    if [ "${{ github.event_name }}" = "workflow_dispatch" ] ; then
      SHA="${DISPATCH_SHA}"
    else
      SHA="${{ github.sha }}"
    fi

    # kops has stricter DNS naming requirements; ending the cluster name
    # in k8s.local makes kops happy.
    CLUSTER_NAME="${{ env.test_name }}-${{ env.cluster_name }}.k8s.local"

    CILIUM_INSTALL_DEFAULTS="--chart-directory=install/kubernetes/cilium \
      --set pprof.enabled=true \
      --helm-set=prometheus.enabled=true \
      --helm-set=cluster.name=${{ env.cluster_name }} \
      --helm-set=k8sServiceHost=api.internal.${CLUSTER_NAME} \
      --helm-set=k8sServicePort=443 \
      --helm-set=kubeProxyReplacement=true \
      --helm-set=operator.replicas=1 \
      --wait=false"

    # only add SHA to the image tags if it was set
    if [ -n "${SHA}" ]; then
      echo "sha=${SHA}" >> "$GITHUB_OUTPUT"
      CILIUM_INSTALL_DEFAULTS+=" --helm-set=image.repository=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/cilium-ci \
        --helm-set=image.useDigest=false \
        --helm-set=image.tag=${SHA} \
        --helm-set=operator.image.repository=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/operator \
        --helm-set=operator.image.suffix=-ci \
        --helm-set=operator.image.tag=${SHA} \
        --helm-set=operator.image.useDigest=false \
        --helm-set=clustermesh.apiserver.image.repository=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/clustermesh-apiserver-ci \
        --helm-set=clustermesh.apiserver.image.tag=${SHA} \
        --helm-set=clustermesh.apiserver.image.useDigest=false \
        --helm-set=hubble.relay.image.repository=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/hubble-relay-ci \
        --helm-set=hubble.relay.image.tag=${SHA} \
        --helm-set=hubble.relay.image.useDigest=false"
    fi

    echo "SHA=${SHA}" >> "$GITHUB_OUTPUT"
    # Deliberately unquoted: word splitting collapses the indentation left
    # between the flags, so the output is one single-spaced line.
    echo cilium_install_defaults=${CILIUM_INSTALL_DEFAULTS} >> "$GITHUB_OUTPUT"
    echo "CLUSTER_NAME=${CLUSTER_NAME}" >> "$GITHUB_OUTPUT"
# Blocks until each CI image tagged with the SHA under test can be inspected
# on quay.io, retrying every 45 seconds; the step timeout bounds the wait.
- name: Wait for images to be available
  shell: bash
  timeout-minutes: 30
  run: |
    for img in cilium-ci operator-generic-ci hubble-relay-ci ; do
      while ! docker manifest inspect quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/$img:${{ steps.vars.outputs.SHA }} &> /dev/null; do
        sleep 45s
      done
    done
# Go toolchain at the version pinned in the workflow-level env.
- name: Install Go
  uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2
  with:
    go-version: ${{ env.go_version }}

- name: Install Kops
  uses: cilium/scale-tests-action/install-kops@d3ecfd83003f3e9c98ba125ca14933401d44918f # main

# Authenticates to GCP using the workload identity provider and service
# account stored in repository secrets, writing a credentials file and
# exporting the related environment variables for later steps.
- name: Setup gcloud credentials
  uses: google-github-actions/auth@62cf5bd3e4211a0a0b51f2c6d6a37129d828611d # v2.1.5
  with:
    workload_identity_provider: ${{ secrets.GCP_PERF_WORKLOAD_IDENTITY_PROVIDER }}
    service_account: ${{ secrets.GCP_PERF_SA }}
    create_credentials_file: true
    export_environment_variables: true

# gcloud CLI at the version pinned in the workflow-level env.
- name: Setup gcloud CLI
  uses: google-github-actions/setup-gcloud@f0990588f1e5b5af6827153b93673613abdc6ec7 # v2.1.1
  with:
    project_id: ${{ secrets.GCP_PERF_PROJECT_ID }}
    version: ${{ env.gcloud_version }}
# Fetches only the clusterloader2 directory of kubernetes/perf-tests, at a
# pinned commit, into ./perf-tests.
- name: Clone ClusterLoader2
  uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
  with:
    repository: kubernetes/perf-tests
    # Avoid using renovate to update this dependency because: (1)
    # perf-tests does not tag or release, so renovate will pull
    # all updates to the default branch and (2) continually
    # updating CL2 may impact the stability of the scale test
    # results.
    ref: 6eb52ac89d5de15a0ad13cfeb2b2026e57ce4f64
    persist-credentials: false
    sparse-checkout: clusterloader2
    path: perf-tests

# Stages this repository's custom CL2 modules inside the CL2 tree, writes the
# override file that enables them, and builds the clusterloader binary.
- name: Setup CL2
  run: |
    # CL2 needs ssh access to control plane nodes
    gcloud compute config-ssh

    # Copy the custom configs to the folder where CL2 expects them.
    cp -r .github/actions/cl2-modules ./perf-tests/clusterloader2/testing/custom
    cd ./perf-tests/clusterloader2

    # CL2 hardcodes module paths to live in ./testing/load, even
    # if the path given is relative.
    cp ../../.github/actions/cl2-modules/cilium-agent-pprofs.yaml ./testing/load/
    cp ../../.github/actions/cl2-modules/cilium-metrics.yaml ./testing/load/

    # Override file passed to the main CL2 run via --testoverrides.
    echo \
      '{"CL2_ADDITIONAL_MEASUREMENT_MODULES": ["./cilium-agent-pprofs.yaml", "./cilium-metrics.yaml"]}' \
      > modules.yaml

    go build ./cmd/clusterloader.go
# Creates the initial cluster: one control-plane machine and one node. The
# 100 workload nodes are added by a later step. The step id is referenced by
# the cleanup step's condition.
- name: Deploy cluster
  id: deploy-cluster
  timeout-minutes: 30
  uses: cilium/scale-tests-action/create-cluster@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
    control_plane_size: n1-standard-8
    control_plane_count: 1
    node_size: e2-standard-8
    node_count: 1
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
    project_id: ${{ secrets.GCP_PERF_PROJECT_ID }}
    # Cilium is installed with kubeProxyReplacement=true.
    kube_proxy_enabled: false

- name: Setup firewall rules
  uses: cilium/scale-tests-action/setup-firewall@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
# Installs the cilium CLI; the image tag follows the same SHA selection as
# the rest of the workflow (dispatch input, else the triggering commit).
- name: Install Cilium CLI
  uses: cilium/cilium-cli@c39ea5e50210fde2ccfe54d07122c48fd680ac8d # v0.16.18
  with:
    skip-build: ${{ env.CILIUM_CLI_SKIP_BUILD }}
    image-repo: ${{ env.CILIUM_CLI_IMAGE_REPO }}
    image-tag: ${{ inputs.SHA || github.sha }}

# Records tool versions in the job log.
- name: Display version info of installed tools
  run: |
    echo "--- go ---"
    go version
    echo "--- cilium-cli ---"
    cilium version --client
    echo "--- kops ---"
    ./kops version
    echo "--- gcloud ---"
    gcloud version

# First prints the Helm values the install would use, then performs the
# actual install with the same arguments.
- name: Install Cilium
  run: |
    cilium install --dry-run-helm-values ${{ steps.vars.outputs.cilium_install_defaults }}
    cilium install ${{ steps.vars.outputs.cilium_install_defaults }}
- name: Wait for cluster to be ready
  timeout-minutes: 20
  uses: cilium/scale-tests-action/validate-cluster@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
    interval: 10s

# Brings up the Prometheus stack through CL2 and leaves it running
# (--tear-down-prometheus-server=false). Output is captured in cl2-setup.txt.
- name: Run CL2 to setup prometheus
  shell: bash
  working-directory: ./perf-tests/clusterloader2
  timeout-minutes: 10
  env:
    CL2_PROMETHEUS_PVC_ENABLED: "false"
    CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: "true"
    CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: "true"
    CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 2.0
  run: |
    # Don't run any tasks at this point, just setup the monitoring stack
    ./clusterloader \
      -v=2 \
      --testconfig=./testing/custom/common/setup.yaml \
      --testoverrides=./testing/prometheus/not-scrape-kube-proxy.yaml \
      --provider=gce \
      --enable-exec-service=false \
      --enable-prometheus-server \
      --tear-down-prometheus-server=false \
      --kubeconfig=$HOME/.kube/config \
      2>&1 | tee cl2-setup.txt
# Adds the 100 e2-medium nodes in a separate "workloads" instance group.
- name: Create Instance Group for workload deployments
  timeout-minutes: 30
  uses: cilium/scale-tests-action/create-instance-group@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
    node_size: e2-medium
    node_count: 100
    ig_name: workloads
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}

# Validate again now that the workload nodes have been requested.
- name: Wait for cluster to be ready
  timeout-minutes: 20
  uses: cilium/scale-tests-action/validate-cluster@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
    interval: 10s

# Second firewall pass, this time with the native-routing firewall rule
# creation switched off.
- name: Setup firewall rules
  uses: cilium/scale-tests-action/setup-firewall@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
    create_native_routing_firewall: 'false'

- name: Wait for Cilium status to be ready
  run: |
    cilium status --wait
# The scale test itself: runs the CL2 load config against 100 nodes with the
# overrides listed below. Reports go to ./report and console output is
# captured in cl2-output.txt. The step id is referenced by the sysdump and
# export steps' conditions.
- name: Run CL2
  id: run-cl2
  shell: bash
  working-directory: ./perf-tests/clusterloader2
  timeout-minutes: 40
  env:
    CL2_ENABLE_PVS: "false"
    CL2_ENABLE_NETWORKPOLICIES: "true"
    CL2_ALLOWED_SLOW_API_CALLS: 1
    CL2_SCHEDULER_THROUGHPUT_THRESHOLD: 0
    CL2_PROMETHEUS_PVC_ENABLED: "false"
    CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: "true"
    CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: "true"
    CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 2.0
  run: |
    ./clusterloader \
      -v=2 \
      --testconfig=./testing/load/config.yaml \
      --provider=gce \
      --enable-prometheus-server \
      --tear-down-prometheus-server=false \
      --nodes=100 \
      --report-dir=./report \
      --experimental-prometheus-snapshot-to-report-dir=true \
      --kubeconfig=$HOME/.kube/config \
      --testoverrides=./testing/overrides/load_throughput.yaml \
      --testoverrides=./testing/experiments/use_simple_latency_query.yaml \
      --testoverrides=./testing/prometheus/not-scrape-kube-proxy.yaml \
      --testoverrides=./modules.yaml \
      2>&1 | tee cl2-output.txt
# Collected whenever the CL2 run was actually attempted (even if it failed),
# but not when it was skipped or cancelled.
- name: Get sysdump
  if: ${{ always() && steps.run-cl2.outcome != 'skipped' && steps.run-cl2.outcome != 'cancelled' }}
  run: |
    cilium status
    cilium sysdump --output-filename cilium-sysdump-final
    # Make the archive readable before the export step picks it up.
    sudo chmod +r cilium-sysdump-final.zip

# Tears the cluster down regardless of earlier failures, as long as the
# deploy step was not skipped.
- name: Cleanup cluster
  if: ${{ always() && steps.deploy-cluster.outcome != 'skipped' }}
  uses: cilium/scale-tests-action/cleanup-cluster@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}

# Same condition as the sysdump step.
# NOTE(review): GCP_PERF_RESULTS_BUCKET is not defined in this workflow's env
# block; presumably set by the set-env-variables action - confirm.
- name: Export results and sysdump to GS bucket
  if: ${{ always() && steps.run-cl2.outcome != 'skipped' && steps.run-cl2.outcome != 'cancelled' }}
  uses: cilium/scale-tests-action/export-results@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    test_name: ${{ env.test_name }}
    results_bucket: ${{ env.GCP_PERF_RESULTS_BUCKET }}
    artifacts: ./perf-tests/clusterloader2/report/*
    other_files: cilium-sysdump-final.zip ./perf-tests/clusterloader2/cl2-output.txt
# Publishes the outcome of the scale-test job as the commit status. Runs even
# when that job failed or was cancelled.
commit-status-final:
  name: Commit Status Final
  if: ${{ always() }}
  needs: install-and-scaletest
  runs-on: ubuntu-latest
  steps:
    - name: Set final commit status
      uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
      with:
        # Same SHA selection as the initial status: dispatch input, else the
        # commit the workflow runs on.
        sha: ${{ inputs.SHA || github.sha }}
        # The job result is forwarded as-is.
        status: ${{ needs.install-and-scaletest.result }}