diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index bf3f2192e4a..00000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,476 +0,0 @@ -version: 2.1 - -orbs: - go: circleci/go@1.7.3 - -commands: - install-protobuf: - description: Install protobuf (protoc command and standard libs) - parameters: - prefix: - default: /usr/local - type: string - version: - default: 3.17.3 - type: string - steps: - - run: - command: | - archive=protoc-<>-linux-x86_64 - curl -O -L https://github.com/protocolbuffers/protobuf/releases/download/v<>/$archive.zip - sudo unzip -d '/usr/local' $archive.zip 'bin/*' 'include/*' - sudo chmod +x /usr/local/bin/protoc - rm -rf $archive.zip - name: 'Install protobuf' - deploy-executor: - parameters: - worker-id: - type: string - default: "1" - steps: - - run: - name: "Deploy to worker << parameters.worker-id >>" - command: | - echo $KUBECONFIG_WORKER_<< parameters.worker-id >> | base64 -d > kubeconfig_worker_<< parameters.worker-id >>_decoded.yaml - export KUBECONFIG=./kubeconfig_worker_<< parameters.worker-id >>_decoded.yaml - if timeout 15 helm list; then - helm upgrade --install armada-executor --namespace=armada ./deployment/executor/ -f ./executor_config.yaml \ - --set applicationConfig.application.clusterId="worker-pool-<< parameters.worker-id >>" \ - --set image.repository=gresearchdev/armada-executor-dev \ - --set image.tag="${CIRCLE_SHA1}" - - helm upgrade --install executor-cluster-monitoring --namespace=common ./deployment/executor-cluster-monitoring --set interval=5s - else - echo Unable to connect to worker << parameters.worker-id >> - fi - - install-helm-client: - description: | - Install the Helm v3 client - Download from: https://get.helm.sh - steps: - - run: - name: Install and init the helm client - command: | - if which helm > /dev/null; then - echo "Helm is already installed.. 
skipping install" - else - HELM_VERSION="3.2.4" - HELM_PLATFORM="linux-amd64" - - # Create local bin directory for later convenience - mkdir -p /home/circleci/bin/ - echo 'export PATH=/home/circleci/bin/:$PATH' >> $BASH_ENV - source $BASH_ENV - - # Get helm - curl -LSs https://get.helm.sh/helm-v$HELM_VERSION-$HELM_PLATFORM.tar.gz | tar xvz --strip-components=1 -C /home/circleci/bin $HELM_PLATFORM/helm - fi - - checkout-charts: - description: | - Checkout G-Research charts repo so we can update them later - Checkout from: https://github.com/G-Research/charts - steps: - - run: - name: Set .ssh directory and GIT_SSH_COMMAND - command: | - echo 'SSH_CONFIG_DIR="/home/circleci/.ssh"' >> $BASH_ENV - echo 'GIT_SSH_COMMAND="ssh -o UserKnownHostsFile=$SSH_CONFIG_DIR/known_hosts"' >> $BASH_ENV - source $BASH_ENV - - - run: - name: Create known_hosts file for github.com - command: | - mkdir -p $SSH_CONFIG_DIR - echo 'github.com ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==' >> $SSH_CONFIG_DIR/known_hosts - - - run: - name: Clone the G-Research common charts repo - command: | - eval "$(ssh-agent -s)" - echo -e "$ARMADA_CHART_UPDATE_KEY" | ssh-add - > /dev/null - git clone -q git@github.com:G-Research/charts.git - - update-charts: - description: | - Update Armada charts in the common G-Research charts repo - steps: - - run: - name: Update version, appVersion and image tag to match current $RELEASE_TAG - command: | - RELEASE_TAG=${CIRCLE_TAG} - echo release version is $RELEASE_TAG - find . 
\( -name "Chart.yaml" -o -name "values.yaml" \) -exec sed -i s/0.0.0-latest/$RELEASE_TAG/ {} + - - - run: - name: Build new packages and index.yaml - command: | - helm package deployment/armada/ -d charts/armada/ - helm package deployment/executor -d charts/armada/ - helm package deployment/executor-cluster-monitoring/ -d charts/armada/ - helm package deployment/lookout/ -d charts/armada/ - helm package deployment/lookout-v2/ -d charts/armada/ - helm package deployment/lookout-migration/ -d charts/armada/ - helm package deployment/lookout-migration-v2/ -d charts/armada/ - helm package deployment/binoculars/ -d charts/armada/ - helm package deployment/jobservice/ -d charts/armada/ - helm package deployment/lookout-ingester/ -d charts/armada/ - helm package deployment/lookout-ingester-v2/ -d charts/armada/ - helm package deployment/event-ingester/ -d charts/armada/ - helm package deployment/scheduler/ -d charts/armada/ - helm package deployment/scheduler-migration/ -d charts/armada/ - helm repo index charts/ - - - run: - name: Commit and push updated charts - command: | - RELEASE_TAG=${CIRCLE_TAG} - cd charts - git checkout -b circlci-armada_$RELEASE_TAG - git add ./armada - git -c user.name='GR OSS' -c user.email=github@gr-oss.io commit -qam "Pushing new helm charts at version $RELEASE_TAG" - eval "$(ssh-agent -s)" - echo -e "$ARMADA_CHART_UPDATE_KEY" | ssh-add - > /dev/null - git push -q origin HEAD - -jobs: - build_and_integration_tests: - machine: - docker_layer_caching: true - image: ubuntu-2204:2022.07.1 - resource_class: large - # resource_class: xlarge - environment: - GO111MODULE: "on" - GOPATH: "/home/circleci/go" - GOCACHE: "/home/circleci/go/cache" - working_directory: ~/go/src/github.com/armadaproject/armada - steps: - - checkout - - install-protobuf - - go/install: - version: '1.20.2' - - restore_cache: # restore dependencies - keys: - - go-mod-v3-{{ checksum "go.sum" }} - - run: - name: Download dependencies - command: | - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" - chmod +x kubectl - mv kubectl /home/circleci/bin/ - make download # no-op if we restored from cache - - - run: - name: Build - command: | - make build-ci - make build-docker-full-bundle - - - run: - name: e2e test environment setup - command: make tests-e2e-setup - - - run: - name: Enable integration tests - command: echo 'export INTEGRATION_ENABLED="true"' >> $BASH_ENV - - - run: - name: e2e tests - command: make tests-e2e-no-setup - - - run: - name: e2e test environment teardown - command: make tests-e2e-teardown - - - run: - name: Prepare reports - command: make junit-report - when: always - - - store_test_results: - path: test_reports/junit.xml - - - run: - name: Push Image - command: | - - if [ -z "${DOCKERHUB_USER}" ] - then - echo "Do not push image inside fork." 
- exit 0 - fi - - TAG=${CIRCLE_SHA1} - - if [ ${CIRCLE_BRANCH} != master ] - then - TAG=branch-$(echo -n $CIRCLE_BRANCH | sed 's|/|-|g')-${CIRCLE_SHA1} - fi - - echo ${DOCKERHUB_PASS} | docker login -u ${DOCKERHUB_USER} --password-stdin - - docker tag armada gresearchdev/armada-server-dev:${TAG} - docker push gresearchdev/armada-server-dev:${TAG} - - docker tag armada-executor gresearchdev/armada-executor-dev:${TAG} - docker push gresearchdev/armada-executor-dev:${TAG} - - docker tag armadactl gresearchdev/armada-armadactl-dev:${TAG} - docker push gresearchdev/armada-armadactl-dev:${TAG} - - docker tag testsuite gresearchdev/armada-testsuite-dev:${TAG} - docker push gresearchdev/armada-testsuite-dev:${TAG} - - docker tag armada-load-tester gresearchdev/armada-load-tester-dev:${TAG} - docker push gresearchdev/armada-load-tester-dev:${TAG} - - docker tag armada-fakeexecutor gresearchdev/armada-fakeexecutor-dev:${TAG} - docker push gresearchdev/armada-fakeexecutor-dev:${TAG} - - docker tag armada-lookout gresearchdev/armada-lookout-dev:${TAG} - docker push gresearchdev/armada-lookout-dev:${TAG} - - docker tag armada-lookout-v2 gresearchdev/armada-lookout-v2-dev:${TAG} - docker push gresearchdev/armada-lookout-v2-dev:${TAG} - - docker tag armada-lookout-ingester gresearchdev/armada-lookout-ingester-dev:${TAG} - docker push gresearchdev/armada-lookout-ingester-dev:${TAG} - - docker tag armada-lookout-ingester-v2 gresearchdev/armada-lookout-ingester-v2-dev:${TAG} - docker push gresearchdev/armada-lookout-ingester-v2-dev:${TAG} - - docker tag armada-event-ingester gresearchdev/armada-event-ingester-dev:${TAG} - docker push gresearchdev/armada-event-ingester-dev:${TAG} - - docker tag armada-binoculars gresearchdev/armada-binoculars-dev:${TAG} - docker push gresearchdev/armada-binoculars-dev:${TAG} - - docker tag armadactl gresearchdev/armada-armadactl-dev:${TAG} - docker push gresearchdev/armada-armadactl-dev:${TAG} - - docker tag armada-jobservice gresearchdev/armada-jobservice-dev:${TAG} - docker push gresearchdev/armada-jobservice-dev:${TAG} - - docker tag armada-scheduler gresearchdev/armada-scheduler-dev:${TAG} - docker push gresearchdev/armada-scheduler-dev:${TAG} - - docker tag armada-scheduler-ingester gresearchdev/armada-scheduler-ingester-dev:${TAG} - docker push gresearchdev/armada-scheduler-ingester-dev:${TAG} - - docker tag armada-full-bundle gresearchdev/armada-full-bundle-dev:${TAG} - docker push gresearchdev/armada-full-bundle-dev:${TAG} - - - release-armadactl: - machine: - docker_layer_caching: true - image: ubuntu-2204:2023.02.1 - resource_class: large - # resource_class: xlarge - environment: - GO111MODULE: "on" - GOPATH: "/home/circleci/go" - GOCACHE: "/home/circleci/go/cache" - working_directory: ~/go/src/github.com/armadaproject/armada - steps: - - checkout - - install-protobuf - - - restore_cache: # restore dependencies - keys: - - go-mod-v3-{{ checksum "go.sum" }} - - - run: - name: Download dependencies - command: | - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" - chmod +x kubectl - sudo mv kubectl /usr/local/bin/ - make download # no-op if we restored from cache - - - run: - name: Build armadactl release artifacts - command: make build-armadactl-release RELEASE_VERSION=${CIRCLE_TAG} - - - store_artifacts: - path: dist - - - run: - name: Upload artifacts to Github release - command: | - for artifact in ./dist/*; do - ./scripts/upload-github-release-asset.sh ${GITHUB_TOKEN} ${CIRCLE_TAG} $artifact - done - 
- ./scripts/add-checksum-summary.sh ${GITHUB_TOKEN} ${CIRCLE_TAG} - - release-docker-images: - machine: - docker_layer_caching: true - image: ubuntu-2204:2022.07.1 - resource_class: medium - environment: - GO111MODULE: "on" - GOPATH: "/home/circleci/go" - GOCACHE: "/home/circleci/go/cache" - working_directory: ~/go/src/github.com/armadaproject/armada - steps: - - checkout - - run: - name: Publish images - command: | - TAG=${CIRCLE_SHA1} - RELEASE_TAG=${CIRCLE_TAG} - - echo ${DOCKERHUB_PASS} | docker login -u ${DOCKERHUB_USER} --password-stdin - - docker pull gresearchdev/armada-server-dev:${TAG} - docker tag gresearchdev/armada-server-dev:${TAG} gresearchdev/armada-server:${RELEASE_TAG} - docker push gresearchdev/armada-server:${RELEASE_TAG} - - docker pull gresearchdev/armada-executor-dev:${TAG} - docker tag gresearchdev/armada-executor-dev:${TAG} gresearchdev/armada-executor:${RELEASE_TAG} - docker push gresearchdev/armada-executor:${RELEASE_TAG} - - docker pull gresearchdev/armada-armadactl-dev:${TAG} - docker tag gresearchdev/armada-armadactl-dev:${TAG} gresearchdev/armada-armadactl:${RELEASE_TAG} - docker push gresearchdev/armada-armadactl:${RELEASE_TAG} - - docker pull gresearchdev/armada-testsuite-dev:${TAG} - docker tag gresearchdev/armada-testsuite-dev:${TAG} gresearchdev/armada-testsuite:${RELEASE_TAG} - docker push gresearchdev/armada-testsuite:${RELEASE_TAG} - - docker pull gresearchdev/armada-lookout-dev:${TAG} - docker tag gresearchdev/armada-lookout-dev:${TAG} gresearchdev/armada-lookout:${RELEASE_TAG} - docker push gresearchdev/armada-lookout:${RELEASE_TAG} - - docker pull gresearchdev/armada-lookout-v2-dev:${TAG} - docker tag gresearchdev/armada-lookout-v2-dev:${TAG} gresearchdev/armada-lookout-v2:${RELEASE_TAG} - docker push gresearchdev/armada-lookout-v2:${RELEASE_TAG} - - docker pull gresearchdev/armada-binoculars-dev:${TAG} - docker tag gresearchdev/armada-binoculars-dev:${TAG} gresearchdev/armada-binoculars:${RELEASE_TAG} - docker push gresearchdev/armada-binoculars:${RELEASE_TAG} - - docker pull gresearchdev/armada-jobservice-dev:${TAG} - docker tag gresearchdev/armada-jobservice-dev:${TAG} gresearchdev/armada-jobservice:${RELEASE_TAG} - docker push gresearchdev/armada-jobservice:${RELEASE_TAG} - - docker pull gresearchdev/armada-lookout-ingester-dev:${TAG} - docker tag gresearchdev/armada-lookout-ingester-dev:${TAG} gresearchdev/armada-lookout-ingester:${RELEASE_TAG} - docker push gresearchdev/armada-lookout-ingester:${RELEASE_TAG} - - docker pull gresearchdev/armada-lookout-ingester-v2-dev:${TAG} - docker tag gresearchdev/armada-lookout-ingester-v2-dev:${TAG} gresearchdev/armada-lookout-ingester-v2:${RELEASE_TAG} - docker push gresearchdev/armada-lookout-ingester-v2:${RELEASE_TAG} - - docker pull gresearchdev/armada-event-ingester-dev:${TAG} - docker tag gresearchdev/armada-event-ingester-dev:${TAG} gresearchdev/armada-event-ingester:${RELEASE_TAG} - docker push gresearchdev/armada-event-ingester:${RELEASE_TAG} - - docker pull gresearchdev/armada-scheduler-dev:${TAG} - docker tag gresearchdev/armada-scheduler-dev:${TAG} gresearchdev/armada-scheduler:${RELEASE_TAG} - docker push gresearchdev/armada-scheduler:${RELEASE_TAG} - - docker pull gresearchdev/armada-scheduler-ingester-dev:${TAG} - docker tag gresearchdev/armada-scheduler-ingester-dev:${TAG} gresearchdev/armada-scheduler-ingester:${RELEASE_TAG} - docker push gresearchdev/armada-scheduler-ingester:${RELEASE_TAG} - - docker pull gresearchdev/armada-full-bundle-dev:${TAG} - docker tag 
gresearchdev/armada-full-bundle-dev:${TAG} gresearchdev/armada-full-bundle:${RELEASE_TAG} - docker push gresearchdev/armada-full-bundle:${RELEASE_TAG} - - release-charts: - machine: - docker_layer_caching: true - image: ubuntu-2204:2022.07.1 - resource_class: medium - environment: - GO111MODULE: "on" - GOPATH: "/home/circleci/go" - GOCACHE: "/home/circleci/go/cache" - working_directory: ~/go/src/github.com/armadaproject/armada - steps: - - checkout - - - install-helm-client - - - checkout-charts - - - update-charts - - release-dotnet-client: - machine: - docker_layer_caching: true - image: ubuntu-2204:2023.02.1 - resource_class: large - # resource_class: xlarge - environment: - GO111MODULE: "on" - GOPATH: "/home/circleci/go" - GOCACHE: "/home/circleci/go/cache" - working_directory: ~/go/src/github.com/armadaproject/armada - steps: - - checkout - - install-protobuf - - - restore_cache: # restore dependencies - keys: - - go-mod-v3-{{ checksum "go.sum" }} - - - run: - name: Download dependencies - command: | - export GOPATH="/home/circleci/go" - export GOCACHE="/home/circleci/go/cache" - export PATH=$PATH:$GOPATH/bin:/usr/local/go/bin - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" - chmod +x kubectl - sudo mv kubectl /usr/local/bin/ - make download # no-op if we restored from cache - - - run: - name: Push dotnet clients to nuget - command: | - export GOPATH="/home/circleci/go" - export GOCACHE="/home/circleci/go/cache" - export PATH=$PATH:$GOPATH/bin:/usr/local/go/bin - go mod download - go run github.com/magefile/mage@v1.14.0 -v BootstrapTools - go run github.com/magefile/mage@v1.14.0 -v proto - RELEASE_TAG=${CIRCLE_TAG#"v"} make push-nuget - - - store_artifacts: - path: bin/client/DotNet - -workflows: - version: 2 - build_and_deploy: - jobs: - - build_and_integration_tests - - release-armadactl: - filters: - tags: - only: /v[0-9]+\.[0-9]+\.[0-9]+/ - branches: - ignore: /.*/ - - release-docker-images: - filters: - tags: - only: /v[0-9]+\.[0-9]+\.[0-9]+/ - branches: - ignore: /.*/ - - release-charts: - filters: - tags: - only: /v[0-9]+\.[0-9]+\.[0-9]+/ - branches: - ignore: /.*/ - - release-dotnet-client: - filters: - tags: - only: /v[0-9]+\.[0-9]+\.[0-9]+/ - branches: - ignore: /.*/ diff --git a/.github/workflows/airflow-operator-release-to-pypi.yml b/.github/workflows/airflow-operator-release-to-pypi.yml index 5b04d3acc0f..03883b9d62a 100644 --- a/.github/workflows/airflow-operator-release-to-pypi.yml +++ b/.github/workflows/airflow-operator-release-to-pypi.yml @@ -13,11 +13,11 @@ jobs: - uses: actions/checkout@v3.3.0 - uses: ./.github/workflows/go-setup - name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v2 with: - version: '3.17.3' + version: '23.3' repo-token: ${{ secrets.GITHUB_TOKEN }} - - run: make airflow-operator + - run: go run github.com/magefile/mage@v1.14.0 -v airflowOperator - uses: ./.github/workflows/python-tests with: python-version: '3.8' diff --git a/.github/workflows/airflow-operator.yml b/.github/workflows/airflow-operator.yml index b1dce1e24f3..39850d57941 100644 --- a/.github/workflows/airflow-operator.yml +++ b/.github/workflows/airflow-operator.yml @@ -14,7 +14,6 @@ on: - 'docs/python_airflow_operator.md' - 'docs/python_armada_client.md' - 'internal/jobservice/*' - - 'makefile' - 'pkg/api/*.proto' - 'pkg/api/jobservice/*.proto' - 'scripts/build-airflow-operator.sh' @@ -34,7 +33,6 @@ on: - 'docs/python_airflow_operator.md' - 'docs/python_armada_client.md' - 
'internal/jobservice/*' - - 'makefile' - 'pkg/api/*.proto' - 'pkg/api/jobservice/*.proto' - 'scripts/build-airflow-operator.sh' @@ -58,11 +56,11 @@ jobs: - uses: actions/checkout@v3.3.0 - uses: ./.github/workflows/go-setup - name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v2 with: - version: '3.17.3' + version: '23.3' repo-token: ${{ secrets.GITHUB_TOKEN }} - - run: make airflow-operator + - run: go run github.com/magefile/mage@v1.14.0 -v airflowOperator - uses: ./.github/workflows/python-tests with: python-version: ${{ matrix.python }} @@ -102,9 +100,9 @@ jobs: mkdir -p .kube/external go run github.com/magefile/mage@v1.14.0 -v localdev minimal - name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v2 with: - version: '3.17.3' + version: '23.3' repo-token: ${{ secrets.GITHUB_TOKEN }} - - run: make tests-e2e-airflow + - run: go run github.com/magefile/mage@v1.14.0 -v teste2eAirflow diff --git a/.github/workflows/autoupdate.yml b/.github/workflows/autoupdate.yml new file mode 100644 index 00000000000..de45e651c8e --- /dev/null +++ b/.github/workflows/autoupdate.yml @@ -0,0 +1,22 @@ +name: autoupdate +on: + # This will trigger on all pushes to all branches. + push: {} + # Alternatively, you can only trigger if commits are pushed to certain branches, e.g.: + # push: + # branches: + # - master + # - unstable +jobs: + autoupdate: + name: autoupdate + runs-on: ubuntu-22.04 + steps: + - uses: docker://chinthakagodawita/autoupdate-action:v1 + env: + GITHUB_TOKEN: '${{ secrets.GITHUB_TOKEN }}' + PR_LABELS: "auto-update" + MERGE_MSG: "Branch was auto-updated." + RETRY_COUNT: "5" + RETRY_SLEEP: "300" + MERGE_CONFLICT_ACTION: "fail" diff --git a/.github/workflows/build-release-images.yml b/.github/workflows/build-release-images.yml deleted file mode 100644 index eaa1befa9e6..00000000000 --- a/.github/workflows/build-release-images.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: Build Release Images - -on: - push: - branches: - - master - -jobs: - test-and-build-images: - if: github.repository_owner == 'armadaproject' - runs-on: ubuntu-22.04 - # runs-on: buildjet-4vcpu-ubuntu-2204 - strategy: - fail-fast: true - matrix: - go: [ '1.20' ] - steps: - - uses: actions/checkout@v3.3.0 - - uses: ./.github/workflows/go-setup - - run: make build-ci - - run: make tests-e2e-setup - - run: make tests-e2e-no-setup - env: - INTEGRATION_ENABLED: true - - run: make junit-report - - name: Upload junit report - uses: actions/upload-artifact@v3.1.1 - with: - name: junit.xml - path: test_reports/junit.xml - if-no-files-found: error - - - name: Save docker images to artifact - run: | - mkdir -p docker-images - docker save armada | gzip > docker-images/armada.tar.gz - docker save armada-executor | gzip > docker-images/armada-executor.tar.gz - docker save armadactl | gzip > docker-images/armadactl.tar.gz - docker save testsuite | gzip > docker-images/testsuite.tar.gz - docker save armada-lookout | gzip > docker-images/armada-lookout.tar.gz - docker save armada-lookout-ingester | gzip > docker-images/armada-lookout-ingester.tar.gz - docker save armada-lookout-v2 | gzip > docker-images/armada-lookout-v2.tar.gz - docker save armada-lookout-ingester-v2 | gzip > docker-images/armada-lookout-ingester-v2.tar.gz - docker save armada-event-ingester | gzip > docker-images/armada-event-ingester.tar.gz - docker save armada-scheduler | gzip > docker-images/armada-scheduler.tar.gz - docker save armada-scheduler-ingester | gzip > docker-images/armada-scheduler-ingester.tar.gz - docker 
save armada-binoculars | gzip > docker-images/armada-binoculars.tar.gz - docker save armada-jobservice | gzip > docker-images/armada-jobservice.tar.gz - - tar -czf docker-images.tar.gz docker-images/* - - - name: Upload docker image tarball to artifacts - uses: actions/upload-artifact@v3.1.1 - with: - name: docker-images - path: docker-images.tar.gz - if-no-files-found: error diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 576405791ec..6d185cb7bba 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,12 +35,18 @@ jobs: # Virtual job that can be configured as a required check before a PR can be merged. all-required-checks-done: name: All required checks done + if: ${{ always() }} needs: - lint - - codeql - test runs-on: ubuntu-22.04 steps: - - run: | - echo "All required checks done" - + - uses: actions/github-script@v6 + with: + script: | + const results = ${{ toJSON(needs.*.result) }}; + if (results.every(res => res === 'success')) { + core.info('All required checks succeeded'); + } else { + core.setFailed('Some required checks failed'); + } diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 09fd5eeae32..a9a16deb668 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -41,13 +41,13 @@ jobs: uses: actions/checkout@v3.3.0 # The ArmadaProject.Io.Client needs the generated proto files - name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v2 with: - version: '3.17.3' + version: '23.3' repo-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/go-setup - - name: Make Proto - run: make proto + - name: Mage Proto + run: go run github.com/magefile/mage@v1.14.0 -v proto # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL diff --git a/.github/workflows/go-setup/action.yml b/.github/workflows/go-setup/action.yml index de35127b92d..7e964bc5123 100644 --- a/.github/workflows/go-setup/action.yml +++ b/.github/workflows/go-setup/action.yml @@ -9,4 +9,5 @@ runs: go-version: ${{ matrix.go }} - name: Setup dependencies shell: bash - run: make download + run: go run github.com/magefile/mage@v1.14.0 download + diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 07ff1099616..ff2ff99f513 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -26,14 +26,29 @@ jobs: - name: Check TypeScript formatting run: | - yarn install --frozen-lockfile && yarn run fmt || exit 1 + yarn install --frozen-lockfile && yarn run fmt || true exit $(git status -s -uno | wc -l) working-directory: ./internal/lookout/ui + continue-on-error: true + + - name: Generating TypeScript lint results as summary + working-directory: ./internal/lookout/ui + if: ${{ always() }} + run: | + yarn run lint > lint_results.txt || true + lint_results=$(cat lint_results.txt) + echo -e "## 🪧 Typescript Lint Results\n" >> $GITHUB_STEP_SUMMARY + if [[ $lint_results =~ "problem" ]]; then + echo -e "### List of Lint Issues \n" >> $GITHUB_STEP_SUMMARY + echo -e "${lint_results}" >> $GITHUB_STEP_SUMMARY + else + echo -e "### No Lint issues found.\n" >> $GITHUB_STEP_SUMMARY + fi + continue-on-error: true go-lint: name: Lint Go runs-on: ubuntu-22.04 - steps: - name: Checkout uses: actions/checkout@v3.3.0 @@ -51,3 +66,17 @@ jobs: version: v1.52.2 only-new-issues: true args: --timeout=10m --issues-exit-code=1 --sort-results ./... 
+ + - name: Generating Golang lint results as summary + if: ${{ always() }} + run: | + golangci-lint run > lint_results.txt || true + lint_results=$(cat lint_results.txt) + echo -e "## 🪧 Go Lint Results\n" >> $GITHUB_STEP_SUMMARY + if [ -z "$lint_results" ]; then + echo -e "### No Lint Issues found.\n" >> $GITHUB_STEP_SUMMARY + else + echo -e "### List of Lint Issues \n" >> $GITHUB_STEP_SUMMARY + echo -e "${lint_results}" >> $GITHUB_STEP_SUMMARY + fi + continue-on-error: true diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 6a09667f565..8f89ed5d5ca 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -32,6 +32,7 @@ jobs: cp master/docs/*.md gh-pages/ rm gh-pages/docs-readme.md cp -r master/docs/quickstart gh-pages/ + cp master/docs/quickstart/index.md gh-pages/quickstart.md cp -r master/docs/developer gh-pages/ cp -r master/docs/design gh-pages/ cp master/CODE_OF_CONDUCT.md master/CONTRIBUTING.md gh-pages/_pages/ diff --git a/.github/workflows/python-client-release-to-pypi.yml b/.github/workflows/python-client-release-to-pypi.yml index dea92e7e2ca..fe5c265ca10 100644 --- a/.github/workflows/python-client-release-to-pypi.yml +++ b/.github/workflows/python-client-release-to-pypi.yml @@ -13,9 +13,9 @@ jobs: - uses: actions/checkout@v3.3.0 - uses: ./.github/workflows/go-setup - name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v2 with: - version: '3.17.3' + version: '23.3' repo-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/python-tests with: diff --git a/.github/workflows/python-client.yml b/.github/workflows/python-client.yml index ed71d2eee3e..f8044d3c236 100644 --- a/.github/workflows/python-client.yml +++ b/.github/workflows/python-client.yml @@ -83,8 +83,8 @@ jobs: mkdir -p .kube/external go run github.com/magefile/mage@v1.14.0 -v localdev minimal - name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v2 with: - version: '3.17.3' + version: '23.3' repo-token: ${{ secrets.GITHUB_TOKEN }} - - run: make tests-e2e-python + - run: go run github.com/magefile/mage@v1.14.0 -v teste2epython diff --git a/.github/workflows/python-tests/action.yml b/.github/workflows/python-tests/action.yml index d9f68202c43..645f386bf98 100644 --- a/.github/workflows/python-tests/action.yml +++ b/.github/workflows/python-tests/action.yml @@ -5,36 +5,31 @@ inputs: path: description: "Path to python package root relative to repo root" required: true - type: string python-version: description: "Version of python to setup and run tests against" required: true - type: string tox-env: description: "Tox environment to use for running the tests" required: true - type: string github-token: description: "Token for authenticated github requests" required: true - type: string - runs: using: "composite" steps: - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: ${{ inputs.python-version }} # Tox to run tests; build to build the wheel after tests pass - run: pip install tox==3.27.1 build twine shell: bash - name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v2 with: - version: '3.17.3' + version: '23.3' repo-token: ${{ inputs.github-token }} # Generate the proto files for python, required for later steps - - run: make python + - run: go run github.com/magefile/mage@v1.14.0 -v buildPython shell: bash - name: Run tox format environment run: tox -e format diff --git a/.github/workflows/slack-alerts.yml 
b/.github/workflows/slack-alerts.yml index 3575873bcd3..7c799ce5470 100644 --- a/.github/workflows/slack-alerts.yml +++ b/.github/workflows/slack-alerts.yml @@ -2,38 +2,18 @@ name: Slack CI Alerts on: workflow_run: - workflows: [CI, Python Airflow Operator, Build Release Images] + workflows: [CI, Python Airflow Operator, Build Release Images, Release Armada components, Release Armada components - RC] types: [completed] jobs: - on_push_failure: + on-failure: runs-on: ubuntu-latest - if: github.event.workflow_run.conclusion == 'failure' && github.event.workflow_run.event == 'push' && github.event.workflow_run.head_branch == 'master' - steps: - - name: "Master Notification" - uses: Mo-Fatah/ci-alerts@v1 - env: - webhook: ${{ secrets.SLACK_WEBHOOK }} - event: push - commit: ${{ github.sha }} - commit_url: https://github.com/armadaproject/armada/commit/${{ github.sha }} - author: ${{ github.actor }} - workflow_name: ${{ github.event.workflow_run.name }} - workflow_url: ${{ github.event.workflow_run.html_url}} - - on_pull_request_failure: - runs-on: ubuntu-latest - if: github.event.workflow_run.conclusion == 'failure' && github.event.workflow_run.event == 'pull_request' + if: github.event.workflow_run.conclusion == 'failure' steps: - uses: actions/checkout@v3.3.0 - - name: "Pull Request Notification" - uses: Mo-Fatah/ci-alerts@v1 + - name: "Send Notification" + uses: Mo-Fatah/ci-alerts@v2 env: webhook: ${{ secrets.SLACK_WEBHOOK }} - event: pr - commit: ${{ github.sha }} - commit_url: https://github.com/armadaproject/armada/commit/${{ github.sha }} - author: ${{ github.actor }} - workflow_name: ${{ github.event.workflow_run.name }} - workflow_url: ${{ github.event.workflow_run.html_url}} - users_path: ${{github.workspace}}/.github/gh-to-slackid + github_context: ${{ toJSON(github) }} + users_path: ${{github.workspace}}/.github/gh-to-slackid \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fa4acb3f8fc..4b0ea22a381 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -46,15 +46,6 @@ jobs: path: ./internal/lookout/ui/junit.xml if-no-files-found: error - - name: Send Coverage Report to Codecov - if: always() - uses: codecov/codecov-action@v3 - with: - file: ./internal/lookout/ui/coverage/cobertura-coverage.xml - flags: unittests - name: codecov-armada-ts-unit-tests - verbose: true - go-unit-tests: name: Golang Unit Tests runs-on: ubuntu-22.04 @@ -70,11 +61,11 @@ jobs: - name: Setup dependencies shell: bash - run: make download + run: go run github.com/magefile/mage@v1.14.0 -v download - name: Unit Tests id: unit_test - run: make tests + run: go run github.com/magefile/mage@v1.14.0 -v tests - name: Publish JUnit Report uses: mikepenz/action-junit-report@v3 @@ -94,15 +85,6 @@ jobs: path: test-reports/ if-no-files-found: error - - name: Send Coverage Report to Codecov - if: always() - uses: codecov/codecov-action@v3 - with: - file: ./test-reports/coverage.out - flags: unittests - name: codecov-armada-go-unit-tests - verbose: true - go-integration-tests: name: Golang Integration Tests runs-on: ubuntu-22.04 @@ -133,7 +115,7 @@ jobs: - name: Setup dependencies shell: bash - run: make download + run: go run github.com/magefile/mage@v1.14.0 -v download - name: Setup and Run Integration Tests run: | @@ -207,9 +189,9 @@ jobs: uses: actions/checkout@v3.3.0 - name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v2 with: - version: '3.17.3' + version: '23.3' repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Set 
up Go @@ -219,14 +201,14 @@ jobs: - name: Setup dependencies shell: bash - run: make download + run: go run github.com/magefile/mage@v1.14.0 -v download # TODO(JayF): Consider moving this into its own job, that runs under a larger set of circumstances # since it's possible for this to fail without any go changes being made. - name: Validate no changes in generated proto files run: | - make proto - make dotnet + go run github.com/magefile/mage@v1.14.0 -v proto + go run github.com/magefile/mage@v1.14.0 -v dotnet changed=$(git status -s -uno | wc -l) diff --git a/.gitignore b/.gitignore index 60f429425d7..d3f2315c6b8 100644 --- a/.gitignore +++ b/.gitignore @@ -91,13 +91,13 @@ typings.py third_party/airflow/docs/build/ third_party/airflow/build/ build/ssl/certs/ca-certificates.crt -localdev/volumes/go/pkg/ +developer/volumes/go/pkg/ .coverage coverage.xml # Yarn -localdev/yarn.lock -localdev/node_modules/ +developer/yarn.lock +developer/node_modules/ # Vim .*.sw* diff --git a/.goreleaser.yml b/.goreleaser.yml index 0ca2581001b..d66bb74201b 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -46,6 +46,33 @@ builds: - linux goarch: - amd64 + - env: [CGO_ENABLED=0] + id: armadaloadtester + binary: armada-load-tester + main: ./cmd/armada-load-tester/main.go + mod_timestamp: '{{ .CommitTimestamp }}' + goos: + - linux + goarch: + - amd64 + - env: [CGO_ENABLED=0] + id: pulsartest + binary: pulsartest + main: ./cmd/pulsartest/main.go + mod_timestamp: '{{ .CommitTimestamp }}' + goos: + - linux + goarch: + - amd64 + - env: [CGO_ENABLED=0] + id: testsuite + binary: testsuite + main: ./cmd/testsuite/main.go + mod_timestamp: '{{ .CommitTimestamp }}' + goos: + - linux + goarch: + - amd64 - env: [CGO_ENABLED=0] id: binoculars binary: binoculars @@ -160,11 +187,14 @@ archives: - armadactl allow_different_binary_count: true name_template: 'armadactl_{{ replace .Version "-" "_" }}_{{ .Os }}_{{ .Arch }}' + format: tar.gz format_overrides: - goos: windows format: zip files: - LICENSE + - README.md + - MAINTAINERS.md # macOS Universal Binaries-* universal_binaries: @@ -196,8 +226,7 @@ dockers: - --label=org.opencontainers.image.version={{ .Version }} - --label=org.opencontainers.image.created={{ time "2006-01-02T15:04:05Z07:00" }} - --label=org.opencontainers.image.revision={{ .FullCommit }} - - --label=org.opencontainers.image.base.name=alpine:3.17.0 - - --label=org.opencontainers.image.base.digest=c0d488a800e4 + - --label=org.opencontainers.image.base.name=alpine:3.18.3 - --label=org.opencontainers.image.licenses=Apache-2.0 - --label=org.opencontainers.image.vendor=G-Research ids: @@ -326,6 +355,30 @@ dockers: - config/executor/config.yaml dockerfile: ./build_goreleaser/fakeexecutor/Dockerfile + - id: armadaloadtester + use: buildx + goos: linux + goarch: amd64 + image_templates: + - "{{ .Env.DOCKER_REPO }}armada-loadtester:latest" + - "{{ .Env.DOCKER_REPO }}armada-loadtester:{{ .Version }}" + build_flag_templates: *BUILD_FLAG_TEMPLATES + ids: + - armadaloadtester + dockerfile: ./build_goreleaser/loadtester/Dockerfile + + - id: testsuite + use: buildx + goos: linux + goarch: amd64 + image_templates: + - "{{ .Env.DOCKER_REPO }}armada-testsuite:latest" + - "{{ .Env.DOCKER_REPO }}armada-testsuite:{{ .Version }}" + build_flag_templates: *BUILD_FLAG_TEMPLATES + ids: + - testsuite + dockerfile: ./build_goreleaser/testsuite/Dockerfile + - id: lookoutingester use: buildx goos: linux diff --git a/.mergify.yml b/.mergify.yml index 43178bbed9a..1f49c428ff2 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -9,6 +9,6 @@ 
pull_request_rules: - "#approved-reviews-by>=2" - and: - "#approved-reviews-by>=1" - - "author~=^(JamesMurkin|severinson|d80tb7|carlocamurri|kannon92|dejanzele|Sharpz7|ClifHouck|robertdavidsmith|theAntiYeti|richscott|suprjinx|zuqq)" + - "author~=^(JamesMurkin|severinson|d80tb7|carlocamurri|dejanzele|Sharpz7|ClifHouck|robertdavidsmith|theAntiYeti|richscott|suprjinx|zuqq)" title: Two are checks required. diff --git a/build/airflow-operator/Dockerfile b/build/airflow-operator/Dockerfile index 7a6ab2f8b6c..ff086bf72be 100644 --- a/build/airflow-operator/Dockerfile +++ b/build/airflow-operator/Dockerfile @@ -1,4 +1,4 @@ -ARG PYTHON_VERSION=3.8.10 +ARG PYTHON_VERSION=3.8.15 FROM --platform=x86_64 python:${PYTHON_VERSION}-buster @@ -6,15 +6,7 @@ RUN mkdir /proto COPY third_party/airflow/pyproject.toml /code/pyproject.toml -# Note that --use-feature=in-tree-build is needed until pip 21.3 -# (https://github.com/libAtoms/QUIP/issues/345) -# -# To make sure that the workdir is not changed (This might break something in the future), -# We pip install from /code -# -# Then to install the `test` optional dependencies, we pip install from /code[test] -# See https://stackoverflow.com/questions/46775346/what-do-square-brackets-mean-in-pip-install -RUN pip install "/code[test]" --use-feature=in-tree-build +RUN pip install "/code[test]" # Creating folders, and files for a project: COPY third_party/airflow /code diff --git a/build/armada-load-tester/Dockerfile b/build/armada-load-tester/Dockerfile index d75b78b21f5..09b8b4aeac9 100644 --- a/build/armada-load-tester/Dockerfile +++ b/build/armada-load-tester/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/armada/Dockerfile b/build/armada/Dockerfile index ce43648ac10..6614890e768 100644 --- a/build/armada/Dockerfile +++ b/build/armada/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/armadactl/Dockerfile b/build/armadactl/Dockerfile index f202a42d3f9..1fb97defb9e 100644 --- a/build/armadactl/Dockerfile +++ b/build/armadactl/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/binoculars/Dockerfile b/build/binoculars/Dockerfile index a3bea576184..640fd53b986 100644 --- a/build/binoculars/Dockerfile +++ b/build/binoculars/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/eventingester/Dockerfile b/build/eventingester/Dockerfile index ea5a76c78ea..ea77f3c9ca0 100644 --- a/build/eventingester/Dockerfile +++ b/build/eventingester/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/executor/Dockerfile b/build/executor/Dockerfile index 6442098eb71..9a139fffbed 100644 --- a/build/executor/Dockerfile +++ b/build/executor/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/fakeexecutor/Dockerfile b/build/fakeexecutor/Dockerfile index 29a3ebf1dda..8f822b59581 100644 --- a/build/fakeexecutor/Dockerfile +++ b/build/fakeexecutor/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff 
--git a/build/jobservice/Dockerfile b/build/jobservice/Dockerfile index 3de235591ca..1f5bc9a9af2 100644 --- a/build/jobservice/Dockerfile +++ b/build/jobservice/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/lookout/Dockerfile b/build/lookout/Dockerfile index 57be2ed9bef..6cedd276f5d 100644 --- a/build/lookout/Dockerfile +++ b/build/lookout/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/lookoutingester/Dockerfile b/build/lookoutingester/Dockerfile index 32fb929ba89..18e053fd613 100644 --- a/build/lookoutingester/Dockerfile +++ b/build/lookoutingester/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/lookoutingesterv2/Dockerfile b/build/lookoutingesterv2/Dockerfile index 642f7d2823d..f8128d0cc9a 100644 --- a/build/lookoutingesterv2/Dockerfile +++ b/build/lookoutingesterv2/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/lookoutv2/Dockerfile b/build/lookoutv2/Dockerfile index bedc92f7fd4..4c789ab2ffd 100644 --- a/build/lookoutv2/Dockerfile +++ b/build/lookoutv2/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/python-client/Dockerfile b/build/python-client/Dockerfile index 9156c13fc27..ca182957059 100644 --- a/build/python-client/Dockerfile +++ b/build/python-client/Dockerfile @@ -1,4 +1,4 @@ -ARG PYTHON_VERSION=3.8.10 +ARG PYTHON_VERSION=3.8.15 FROM --platform=x86_64 python:${PYTHON_VERSION}-buster @@ -6,15 +6,7 @@ RUN mkdir /proto COPY client/python/pyproject.toml /code/pyproject.toml -# Note that --use-feature=in-tree-build is needed until pip 21.3 -# (https://github.com/libAtoms/QUIP/issues/345) -# -# To make sure that the workdir is not changed (This might break something in the future), -# We pip install from /code -# -# Then to install the `test` optional dependencies, we pip install from /code[test] -# See https://stackoverflow.com/questions/46775346/what-do-square-brackets-mean-in-pip-install -RUN pip install "/code[test]" --use-feature=in-tree-build +RUN pip install "/code[test]" # Creating folders, and files for a project: COPY client/python /code diff --git a/build/scheduler/Dockerfile b/build/scheduler/Dockerfile index c745bc1eed6..b9cab04aebc 100644 --- a/build/scheduler/Dockerfile +++ b/build/scheduler/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/scheduleringester/Dockerfile b/build/scheduleringester/Dockerfile index 4eb2ef88d91..810c76e9a01 100644 --- a/build/scheduleringester/Dockerfile +++ b/build/scheduleringester/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build/testsuite/Dockerfile b/build/testsuite/Dockerfile index 62688591494..b3a69121166 100644 --- a/build/testsuite/Dockerfile +++ b/build/testsuite/Dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.10 +FROM alpine:3.18.3 RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada diff --git a/build_goreleaser/armadactl/Dockerfile b/build_goreleaser/armadactl/Dockerfile index 
e45b233ac5f..b286a5ae77d 100644 --- a/build_goreleaser/armadactl/Dockerfile +++ b/build_goreleaser/armadactl/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=armadactl diff --git a/build_goreleaser/binoculars/Dockerfile b/build_goreleaser/binoculars/Dockerfile index cd29a88d2d2..a64955d0003 100644 --- a/build_goreleaser/binoculars/Dockerfile +++ b/build_goreleaser/binoculars/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=binoculars LABEL org.opencontainers.image.description="binoculars" diff --git a/build_goreleaser/bundles/armada/Dockerfile b/build_goreleaser/bundles/armada/Dockerfile index 6c2261b3a41..133a11e853f 100644 --- a/build_goreleaser/bundles/armada/Dockerfile +++ b/build_goreleaser/bundles/armada/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=armada LABEL org.opencontainers.image.description="Armada Bundle" diff --git a/build_goreleaser/bundles/full/Dockerfile b/build_goreleaser/bundles/full/Dockerfile index d69f67370c3..924474c0a6d 100644 --- a/build_goreleaser/bundles/full/Dockerfile +++ b/build_goreleaser/bundles/full/Dockerfile @@ -1,6 +1,6 @@ ARG NODE_BUILD_IMAGE=node:16.14-buster ARG OPENAPI_BUILD_IMAGE=openapitools/openapi-generator-cli:v5.4.0 -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${OPENAPI_BUILD_IMAGE} AS OPENAPI LABEL org.opencontainers.image.title=armada-full-bundle diff --git a/build_goreleaser/bundles/lookout/Dockerfile b/build_goreleaser/bundles/lookout/Dockerfile index e1c0d540133..e6620f8f6f5 100644 --- a/build_goreleaser/bundles/lookout/Dockerfile +++ b/build_goreleaser/bundles/lookout/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=armada-lookout-bundle diff --git a/build_goreleaser/eventingester/Dockerfile b/build_goreleaser/eventingester/Dockerfile index 9fa9b2fe95d..db665ce0fb0 100644 --- a/build_goreleaser/eventingester/Dockerfile +++ b/build_goreleaser/eventingester/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=eventingester diff --git a/build_goreleaser/executor/Dockerfile b/build_goreleaser/executor/Dockerfile index 7ab16df768b..36d7ceeb679 100644 --- a/build_goreleaser/executor/Dockerfile +++ b/build_goreleaser/executor/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=executor diff --git a/build_goreleaser/fakeexecutor/Dockerfile b/build_goreleaser/fakeexecutor/Dockerfile index 6b3e178eb7b..d7fa88edb17 100644 --- a/build_goreleaser/fakeexecutor/Dockerfile +++ b/build_goreleaser/fakeexecutor/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=fakeexecutor LABEL org.opencontainers.image.description="Fake Executor" diff --git a/build_goreleaser/jobservice/Dockerfile b/build_goreleaser/jobservice/Dockerfile index 998f99be4ad..9da0241774b 100644 --- a/build_goreleaser/jobservice/Dockerfile +++ b/build_goreleaser/jobservice/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=jobservice diff 
--git a/build_goreleaser/loadtester/Dockerfile b/build_goreleaser/loadtester/Dockerfile new file mode 100644 index 00000000000..e716e2fa7e1 --- /dev/null +++ b/build_goreleaser/loadtester/Dockerfile @@ -0,0 +1,15 @@ +ARG BASE_IMAGE=alpine:3.18.3 + +FROM ${BASE_IMAGE} +LABEL org.opencontainers.image.title=loadtester +LABEL org.opencontainers.image.description="Load Tester" +LABEL org.opencontainers.image.url=https://hub.docker.com/r/gresearch/loadtester + +RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada +USER armada + +COPY armada-load-tester /app/ + +WORKDIR /app + +ENTRYPOINT ["./armada-load-tester"] diff --git a/build_goreleaser/lookout/Dockerfile b/build_goreleaser/lookout/Dockerfile index a82178a5764..6431402eb74 100644 --- a/build_goreleaser/lookout/Dockerfile +++ b/build_goreleaser/lookout/Dockerfile @@ -1,6 +1,6 @@ ARG NODE_BUILD_IMAGE=node:16.14-buster ARG OPENAPI_BUILD_IMAGE=openapitools/openapi-generator-cli:v5.4.0 -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${OPENAPI_BUILD_IMAGE} AS OPENAPI diff --git a/build_goreleaser/lookoutingester/Dockerfile b/build_goreleaser/lookoutingester/Dockerfile index d5f35b9c348..36132a7e67a 100644 --- a/build_goreleaser/lookoutingester/Dockerfile +++ b/build_goreleaser/lookoutingester/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=lookoutingester diff --git a/build_goreleaser/lookoutingesterv2/Dockerfile b/build_goreleaser/lookoutingesterv2/Dockerfile index 3b6c6fab2d7..be74008b091 100644 --- a/build_goreleaser/lookoutingesterv2/Dockerfile +++ b/build_goreleaser/lookoutingesterv2/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=lookoutingesterv2 diff --git a/build_goreleaser/lookoutv2/Dockerfile b/build_goreleaser/lookoutv2/Dockerfile index 874105c809f..e4dff417504 100644 --- a/build_goreleaser/lookoutv2/Dockerfile +++ b/build_goreleaser/lookoutv2/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} RUN addgroup -S -g 2000 armada && adduser -S -u 1000 armada -G armada LABEL org.opencontainers.image.title=lookoutv2 diff --git a/build_goreleaser/scheduler/Dockerfile b/build_goreleaser/scheduler/Dockerfile index 77498d4a2d1..6922cd3be2e 100644 --- a/build_goreleaser/scheduler/Dockerfile +++ b/build_goreleaser/scheduler/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=scheduler diff --git a/build_goreleaser/scheduleringester/Dockerfile b/build_goreleaser/scheduleringester/Dockerfile index 10c8012ee85..40a58a9e5b7 100644 --- a/build_goreleaser/scheduleringester/Dockerfile +++ b/build_goreleaser/scheduleringester/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=scheduleringester diff --git a/build_goreleaser/server/Dockerfile b/build_goreleaser/server/Dockerfile index e5ac4518213..9568aa50aad 100644 --- a/build_goreleaser/server/Dockerfile +++ b/build_goreleaser/server/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=armada-server diff --git a/build_goreleaser/testsuite/Dockerfile b/build_goreleaser/testsuite/Dockerfile index e090c5d0536..514c37566a8 100644 --- 
a/build_goreleaser/testsuite/Dockerfile +++ b/build_goreleaser/testsuite/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=alpine:3.17.0 +ARG BASE_IMAGE=alpine:3.18.3 FROM ${BASE_IMAGE} LABEL org.opencontainers.image.title=testsuite LABEL org.opencontainers.image.description="Test Suite" diff --git a/client/python/pyproject.toml b/client/python/pyproject.toml index ce74f88a5ef..fe5085afcf7 100644 --- a/client/python/pyproject.toml +++ b/client/python/pyproject.toml @@ -9,10 +9,10 @@ license = { text = "Apache Software License" } authors = [{ name = "G-Research Open Source Software", email = "armada@armadaproject.io" }] [project.optional-dependencies] -format = ["black==23.3.0", "flake8==6.0.0", "pylint==2.17.4"] +format = ["black==23.7.0", "flake8==6.1.0", "pylint==2.17.5"] # note(JayF): sphinx-jekyll-builder was broken by sphinx-markdown-builder 0.6 -- so pin to 0.5.5 -docs = ["sphinx==7.0.1", "sphinx-jekyll-builder==0.3.0", "sphinx-toolbox==3.2.0b1", "sphinx-markdown-builder==0.5.5"] -test = ["pytest==7.3.1", "coverage>=6.5.0", "pytest-asyncio==0.21.0"] +docs = ["sphinx==7.1.2", "sphinx-jekyll-builder==0.3.0", "sphinx-toolbox==3.2.0b1", "sphinx-markdown-builder==0.5.5"] +test = ["pytest==7.3.1", "coverage>=6.5.0", "pytest-asyncio==0.21.1"] [build-system] requires = ["setuptools"] diff --git a/cmd/armada-load-tester/cmd/loadtest.go b/cmd/armada-load-tester/cmd/loadtest.go index ddc6434bfc9..c2dd6d21463 100644 --- a/cmd/armada-load-tester/cmd/loadtest.go +++ b/cmd/armada-load-tester/cmd/loadtest.go @@ -52,7 +52,7 @@ var loadtestCmd = &cobra.Command{ containers: - name: sleep imagePullPolicy: IfNotPresent - image: alpine:3.10 + image: alpine:3.18.3 command: - sh args: diff --git a/cmd/armadactl/cmd/watch_test.go b/cmd/armadactl/cmd/watch_test.go new file mode 100644 index 00000000000..dc97938ca00 --- /dev/null +++ b/cmd/armadactl/cmd/watch_test.go @@ -0,0 +1,61 @@ +package cmd + +import ( + "io" + "testing" + + "github.com/spf13/cobra" + "github.com/stretchr/testify/require" + + "github.com/armadaproject/armada/internal/armadactl" +) + +func TestWatch(t *testing.T) { + tests := map[string]struct { + flags []flag + raw bool + exit_if_inactive bool + force_new_events bool + force_legacy_events bool + }{ + "default flags": {nil, false, false, false, false}, + "valid raw": {[]flag{{"raw", "true"}}, true, false, false, false}, + "valid exit-if-inactive": {[]flag{{"exit-if-inactive", "true"}}, false, true, false, false}, + "valid force-new-events": {[]flag{{"force-new-events", "true"}}, false, false, true, false}, + "valid force-legacy-events": {[]flag{{"force-legacy-events", "true"}}, false, false, false, true}, + } + for name, test := range tests { + t.Run(name, func(t *testing.T) { + a := armadactl.New() + cmd := watchCmd() + for _, flag := range test.flags { + require.NoError(t, cmd.Flags().Set(flag.name, flag.value)) + } + cmd.PreRunE = func(cmd *cobra.Command, args []string) error { + a.Out = io.Discard + if test.raw { + r, err := cmd.Flags().GetBool("raw") + require.NoError(t, err) + require.Equal(t, test.raw, r) + } + if test.exit_if_inactive { + exitOnInactiveFlag, err := cmd.Flags().GetBool("exit-if-inactive") + require.NoError(t, err) + require.Equal(t, test.raw, exitOnInactiveFlag) + } + if test.force_new_events { + forceNewEventsFlag, err := cmd.Flags().GetBool("force-new-events") + require.NoError(t, err) + require.Equal(t, test.raw, forceNewEventsFlag) + } + if test.force_legacy_events { + forceLegacyEventsFlag, err := cmd.Flags().GetBool("force-legacy-events") + require.NoError(t, err) + 
require.Equal(t, test.raw, forceLegacyEventsFlag) + } + return nil + } + cmd.SetArgs([]string{"arbitrary", "jobSetId1"}) + }) + } +} diff --git a/cmd/eventsprinter/logic/logic.go b/cmd/eventsprinter/logic/logic.go index 36c160647b2..34de61b4d61 100644 --- a/cmd/eventsprinter/logic/logic.go +++ b/cmd/eventsprinter/logic/logic.go @@ -9,6 +9,7 @@ import ( "github.com/gogo/protobuf/proto" v1 "k8s.io/api/core/v1" + "github.com/armadaproject/armada/internal/common/util" "github.com/armadaproject/armada/pkg/armadaevents" ) @@ -31,7 +32,15 @@ func PrintEvents(url, topic, subscription string, verbose bool) error { time.Sleep(time.Second) continue } - consumer.Ack(msg) + + util.RetryUntilSuccess( + ctx, + func() error { return consumer.Ack(msg) }, + func(err error) { + fmt.Println(err) + time.Sleep(time.Second) + }, + ) sequence := &armadaevents.EventSequence{} err = proto.Unmarshal(msg.Payload(), sequence) diff --git a/codecov.yml b/codecov.yml deleted file mode 100644 index 442d349483c..00000000000 --- a/codecov.yml +++ /dev/null @@ -1,8 +0,0 @@ -ignore: - - "**/*.pb.go" - - "**/*.sql.go" - - "**/*_moq.go" -coverage: - status: - project: off - patch: off diff --git a/config/armada/config.yaml b/config/armada/config.yaml index 991d73b294b..cb9b8e56681 100644 --- a/config/armada/config.yaml +++ b/config/armada/config.yaml @@ -102,6 +102,7 @@ scheduling: resolution: "1Mi" minTerminationGracePeriod: 1s maxTerminationGracePeriod: 300s + executorUpdateFrequency: 1m queueManagement: defaultPriorityFactor: 1000 defaultQueuedJobsLimit: 0 # No Limit @@ -125,6 +126,7 @@ pulsar: eventsPrinter: false eventsPrinterSubscription: "EventsPrinter" maxAllowedMessageSize: 4194304 # 4MB + receiverQueueSize: 100 postgres: maxOpenConns: 100 maxIdleConns: 25 diff --git a/config/eventingester/config.yaml b/config/eventingester/config.yaml index 56e4dd2b050..3cbd5afe831 100644 --- a/config/eventingester/config.yaml +++ b/config/eventingester/config.yaml @@ -9,6 +9,7 @@ pulsar: jobsetEventsTopic: events receiveTimeout: 5s backoffTime: 1s + receiverQueueSize: 100 subscriptionName: "events-ingester" minMessageCompressionSize: 1024 batchSize: 1048576 #1MB diff --git a/config/executor/config.yaml b/config/executor/config.yaml index 0b04d6c6a1a..e968a6b50a6 100644 --- a/config/executor/config.yaml +++ b/config/executor/config.yaml @@ -23,6 +23,7 @@ task: # The executor api section should only be needed until we migrate to it fully - then we go back to just using apiConnection executorApiConnection: armadaUrl: "server:50052" + forceNoTls: false apiConnection: armadaUrl: "server:50051" forceNoTls: false diff --git a/config/lookoutingester/config.yaml b/config/lookoutingester/config.yaml index 5c4fecb31c3..35d21b86624 100644 --- a/config/lookoutingester/config.yaml +++ b/config/lookoutingester/config.yaml @@ -17,6 +17,7 @@ pulsar: jobsetEventsTopic: "events" receiveTimeout: 5s backoffTime: 1s + receiverQueueSize: 100 paralellism: 1 subscriptionName: "lookout-ingester" batchSize: 10000 diff --git a/config/lookoutingesterv2/config.yaml b/config/lookoutingesterv2/config.yaml index 0750e78ab2a..b624ce7c180 100644 --- a/config/lookoutingesterv2/config.yaml +++ b/config/lookoutingesterv2/config.yaml @@ -17,6 +17,7 @@ pulsar: jobsetEventsTopic: "events" receiveTimeout: 5s backoffTime: 1s + receiverQueueSize: 100 subscriptionName: "lookout-ingester-v2" batchSize: 10000 batchDuration: 500ms diff --git a/config/scheduler/config.yaml b/config/scheduler/config.yaml index fb6856a35f7..531f4c6a78d 100644 --- a/config/scheduler/config.yaml +++ 
b/config/scheduler/config.yaml @@ -1,6 +1,7 @@ cyclePeriod: 1s schedulePeriod: 10s maxSchedulingDuration: 5s +maxJobsLeasedPerCall: 1000 executorTimeout: 1h databaseFetchSize: 1000 pulsarSendTimeout: 5s @@ -8,6 +9,15 @@ internedStringsCacheSize: 100000 metrics: port: 9000 refreshInterval: 30s + metrics: + scheduleCycleTimeHistogramSettings: + start: 1.0 + factor: 1.1 + count: 110 + reconcileCycleTimeHistogramSettings: + start: 1.0 + factor: 1.1 + count: 110 pulsar: URL: "pulsar://pulsar:6650" jobsetEventsTopic: "events" @@ -54,6 +64,7 @@ grpc: enabled: false scheduling: executorTimeout: 10m + executorUpdateFrequency: 1m enableAssertions: true fairnessModel: "AssetFairness" dominantResourceFairnessResourcesToConsider: diff --git a/config/scheduleringester/config.yaml b/config/scheduleringester/config.yaml index 3734042a5ad..973f8982ce4 100644 --- a/config/scheduleringester/config.yaml +++ b/config/scheduleringester/config.yaml @@ -18,6 +18,7 @@ pulsar: jobsetEventsTopic: "events" receiveTimeout: 5s backoffTime: 1s + receiverQueueSize: 100 subscriptionName: "scheduler-ingester" batchSize: 10000 diff --git a/deployment/lookout-migration/templates/job.yaml b/deployment/lookout-migration/templates/job.yaml index 32be79b3153..1d1bd8d6a53 100644 --- a/deployment/lookout-migration/templates/job.yaml +++ b/deployment/lookout-migration/templates/job.yaml @@ -25,7 +25,7 @@ spec: {{- end }} initContainers: - name: db-wait - image: alpine:3.10 + image: alpine:3.18.3 command: - /bin/sh - -c diff --git a/deployment/lookout-v2/templates/cronjob.yaml b/deployment/lookout-v2/templates/cronjob.yaml index c8ebac7bf7c..9bc07dced11 100644 --- a/deployment/lookout-v2/templates/cronjob.yaml +++ b/deployment/lookout-v2/templates/cronjob.yaml @@ -1,5 +1,5 @@ {{ if .Values.dbPruningEnabled}} -apiVersion: batch/v1beta1 +apiVersion: batch/v1 kind: CronJob metadata: name: lookout-v2-db-pruner diff --git a/deployment/scheduler/templates/cronjob.yaml b/deployment/scheduler/templates/cronjob.yaml index f3b1cfa0d64..4b6db42b472 100644 --- a/deployment/scheduler/templates/cronjob.yaml +++ b/deployment/scheduler/templates/cronjob.yaml @@ -1,5 +1,5 @@ {{ if .Values.pruner.enabled}} -apiVersion: batch/v1beta1 +apiVersion: batch/v1 kind: CronJob metadata: name: {{ include "armada-scheduler.name" . 
}}-db-pruner diff --git a/localdev/airflow/.env b/developer/airflow/.env similarity index 100% rename from localdev/airflow/.env rename to developer/airflow/.env diff --git a/localdev/airflow/.gitignore b/developer/airflow/.gitignore similarity index 100% rename from localdev/airflow/.gitignore rename to developer/airflow/.gitignore diff --git a/localdev/airflow/Dockerfile b/developer/airflow/Dockerfile similarity index 89% rename from localdev/airflow/Dockerfile rename to developer/airflow/Dockerfile index 02a60f3ee22..37cb026c9ed 100644 --- a/localdev/airflow/Dockerfile +++ b/developer/airflow/Dockerfile @@ -1,4 +1,4 @@ -FROM apache/airflow:2.6.0-python3.10 +FROM apache/airflow:2.7.0-python3.10 RUN umask 0002; \ mkdir -p /home/airflow/client diff --git a/localdev/airflow/dags b/developer/airflow/dags similarity index 100% rename from localdev/airflow/dags rename to developer/airflow/dags diff --git a/localdev/airflow/docker-compose.yaml b/developer/airflow/docker-compose.yaml similarity index 96% rename from localdev/airflow/docker-compose.yaml rename to developer/airflow/docker-compose.yaml index 47e3dded966..edd9e3d1e71 100644 --- a/localdev/airflow/docker-compose.yaml +++ b/developer/airflow/docker-compose.yaml @@ -24,7 +24,7 @@ # The following variables are supported: # # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. -# Default: apache/airflow:2.6.0 +# Default: apache/airflow:|version| # AIRFLOW_UID - User ID in Airflow containers # Default: 50000 # AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed. @@ -50,7 +50,7 @@ x-airflow-common: # In order to add custom dependencies or upgrade provider packages you can use your extended image. # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml # and uncomment the "build" line below, Then run `docker-compose build` to build the images. - # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.6.0} + # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:|version|} build: . 
environment: &airflow-common-env @@ -78,6 +78,7 @@ x-airflow-common: - ${AIRFLOW_PROJ_DIR:-.}/armada_client:/opt/airflow/armada_client - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs + - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins user: "${AIRFLOW_UID:-50000}:0" depends_on: @@ -160,7 +161,7 @@ services: healthcheck: test: - "CMD-SHELL" - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' + 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' interval: 30s timeout: 10s retries: 5 @@ -264,7 +265,7 @@ services: # yamllint enable rule:line-length environment: <<: *airflow-common-env - _AIRFLOW_DB_UPGRADE: 'true' + _AIRFLOW_DB_MIGRATE: 'true' _AIRFLOW_WWW_USER_CREATE: 'true' _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} diff --git a/localdev/airflow/init.sh b/developer/airflow/init.sh similarity index 100% rename from localdev/airflow/init.sh rename to developer/airflow/init.sh diff --git a/developer/env/executor-pulsar.env b/developer/env/executor-pulsar.env new file mode 100644 index 00000000000..41ec7d8f5bf --- /dev/null +++ b/developer/env/executor-pulsar.env @@ -0,0 +1,5 @@ +ARMADA_APICONNECTION_FORCENOTLS=true +ARMADA_APPLICATION_USEEXECUTORAPI=true +ARMADA_APPLICATION_USELEGACYAPI=false +ARMADA_EXECUTORAPICONNECTION_ARMADAURL="scheduler:50052" +ARMADA_EXECUTORAPICONNECTION_FORCENOTLS=true diff --git a/developer/env/localhost_access.env b/developer/env/localhost_access.env index 10ada6e0894..ec270353d31 100644 --- a/developer/env/localhost_access.env +++ b/developer/env/localhost_access.env @@ -6,6 +6,7 @@ ARMADA_EVENTAPI_POSTGRES_CONNECTION_HOST="localhost" ARMADA_POSTGRES_CONNECTION_HOST="localhost" ARMADA_PULSAR_URL="pulsar://localhost:6650" ARMADA_APICONNECTION_ARMADAURL="localhost:50051" +ARMADA_SCHEDULING_EXECUTORUPDATEFREQUENCY=1s # For Executor # KUBECONFIG=".kube/external/config" diff --git a/developer/env/scheduler.env b/developer/env/scheduler.env new file mode 100644 index 00000000000..9ba90cbb217 --- /dev/null +++ b/developer/env/scheduler.env @@ -0,0 +1 @@ +ARMADA_HTTP_PORT=8081 \ No newline at end of file diff --git a/developer/env/scheduleringester.env b/developer/env/scheduleringester.env new file mode 100644 index 00000000000..c7564177864 --- /dev/null +++ b/developer/env/scheduleringester.env @@ -0,0 +1,2 @@ +ARMADA_PULSAR_URL="pulsar://pulsar:6650" +ARMADA_POSTGRES_CONNECTION_HOST="postgres" \ No newline at end of file diff --git a/developer/env/server-pulsar.env b/developer/env/server-pulsar.env new file mode 100644 index 00000000000..f7ba5e06139 --- /dev/null +++ b/developer/env/server-pulsar.env @@ -0,0 +1,4 @@ +EXECUTOR_UPDATE_INTERVAL="1s" +ARMADA_CORSALLOWEDORIGINS="http://localhost:3000,http://localhost:8089,http://localhost:10000,http://example.com:10000,http://example.com:8089" +ARMADA_PULSARSCHEDULERENABLED="true" +ARMADA_PROBABILITYOFUSINGPULSARSCHEDULER="1" \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index 7449c6ca496..fd242bde29c 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -34,7 +34,7 @@ services: - kind pulsar: - image: ${PULSAR_IMAGE:-apachepulsar/pulsar:2.10.0} + image: ${PULSAR_IMAGE:-apachepulsar/pulsar:2.10.4} 
container_name: pulsar volumes: - ./developer/dependencies/pulsar.conf:/conf/pulsar.conf @@ -67,6 +67,72 @@ services: - ./developer/env/server.env command: ./server --config /config/insecure-armada.yaml + server-pulsar: + container_name: server + image: ${ARMADA_IMAGE:-gresearch/armada-bundle}:${ARMADA_IMAGE_TAG:-latest} + networks: + - kind + ports: + - 50051:50051 + - 8080:8080 + - 4000:4000 + volumes: + - ./developer/config/insecure-armada.yaml:/config/insecure-armada.yaml + - "go-cache:/root/.cache/go-build:rw" + - "gomod-cache:/go/pkg/mod:rw" + depends_on: + - eventingester + working_dir: /app + env_file: + - ./developer/env/server-pulsar.env + command: ./server --config /config/insecure-armada.yaml + + scheduler: + container_name: scheduler + image: ${ARMADA_IMAGE:-gresearch/armada-bundle}:${ARMADA_IMAGE_TAG:-latest} + networks: + - kind + ports: + - 9000:9000 + - 8081:8081 + - 50052:50052 + volumes: + - ./developer/config/insecure-armada.yaml:/config/insecure-armada.yaml + - "go-cache:/root/.cache/go-build:rw" + - "gomod-cache:/go/pkg/mod:rw" + depends_on: + - postgresPulsarMigration + working_dir: /app + env_file: + - ./developer/env/scheduler.env + command: ./scheduler run --config /config/insecure-armada.yaml + + postgresPulsarMigration: + container_name: postgresPulsarMigration + image: ${ARMADA_IMAGE:-gresearch/armada-bundle}:${ARMADA_IMAGE_TAG:-latest} + networks: + - kind + volumes: + - "go-cache:/root/.cache/go-build:rw" + - "gomod-cache:/go/pkg/mod:rw" + working_dir: /app + command: ./scheduler migrateDatabase + + scheduleringester: + container_name: scheduleringester + image: ${ARMADA_IMAGE:-gresearch/armada-bundle}:${ARMADA_IMAGE_TAG:-latest} + networks: + - kind + ports: + - 9003:9003 + volumes: + - "go-cache:/root/.cache/go-build:rw" + - "gomod-cache:/go/pkg/mod:rw" + env_file: + - ./developer/env/scheduleringester.env + working_dir: /app + command: ./scheduleringester + executor: container_name: executor image: ${ARMADA_IMAGE:-gresearch/armada-bundle}:${ARMADA_IMAGE_TAG:-latest} @@ -88,6 +154,27 @@ services: working_dir: /app command: ./executor + executor-pulsar: + container_name: executor + image: ${ARMADA_IMAGE:-gresearch/armada-bundle}:${ARMADA_IMAGE_TAG:-latest} + networks: + - kind + ports: + - 9001:9001 + - 4001:4000 + depends_on: + - server-pulsar + volumes: + - ./.kube/internal:/.kube + - "go-cache:/root/.cache/go-build:rw" + - "gomod-cache:/go/pkg/mod:rw" + environment: + - KUBECONFIG=/.kube/config + env_file: + - ./developer/env/executor-pulsar.env + working_dir: /app + command: ./executor + binoculars: container_name: binoculars image: ${ARMADA_IMAGE:-gresearch/armada-bundle}:${ARMADA_IMAGE_TAG:-latest} diff --git a/docs/demo.md b/docs/demo.md index ddaf5992391..29bc12c0053 100644 --- a/docs/demo.md +++ b/docs/demo.md @@ -25,6 +25,7 @@ cd armada All commands are intended to be run from the root of the repository. ## Setup an easy-to-use alias +If you are on a Windows System, use a linux-supported terminal to run this command, for example [Git Bash](https://git-scm.com/downloads) or [Hyper](https://hyper.is/) ```bash alias armadactl='go run cmd/armadactl/main.go --armadaUrl armada.demo.armadaproject.io:443' ``` @@ -33,9 +34,10 @@ alias armadactl='go run cmd/armadactl/main.go --armadaUrl armada.demo.armadaproj Create queues, submit some jobs, and monitor progress: ### Queue Creation +Use a unique name for the queue. Make sure you remember it for the next steps. 
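+For example, you can export the queue name as a shell variable so the commands below can reference it. This is only an illustrative sketch (it assumes a Bash-compatible shell, and `demo-queue-$USER` is just a suggested name), so pick any name you like:
+```bash
+export QUEUE_NAME="demo-queue-$USER"
+```
+The two commands below both use the same `$QUEUE_NAME`; if you want two separate queues with different priority factors, export and use two distinct names instead.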
```bash -armadactl create queue queue-a --priorityFactor 1 -armadactl create queue queue-b --priorityFactor 2 +armadactl create queue $QUEUE_NAME --priorityFactor 1 +armadactl create queue $QUEUE_NAME --priorityFactor 2 ``` For queues created in this way, user and group owners of the queue have permissions to: @@ -51,30 +53,62 @@ armadactl create -f ./docs/quickstart/queue-a.yaml armadactl create -f ./docs/quickstart/queue-b.yaml ``` +Before running the commands above, edit both of these `yaml` files in a code or text editor so that the queue name matches the one you chose: + +``` +name: $QUEUE_NAME +``` + ### Job Submission ``` armadactl submit ./docs/quickstart/job-queue-a.yaml armadactl submit ./docs/quickstart/job-queue-b.yaml ``` +Before running the commands above, edit both of these `yaml` files in a code or text editor so that each job targets the queue you created: +``` +queue: $QUEUE_NAME +``` + ### Monitor Job Progress ```bash -armadactl watch queue-a job-set-1 +armadactl watch $QUEUE_NAME job-set-1 ``` ```bash -armadactl watch queue-b job-set-1 +armadactl watch $QUEUE_NAME job-set-1 ``` Try submitting lots of jobs and see queues get built and processed: -```bash +#### Windows (using Git Bash): + +Use a text editor of your choice. +Copy and paste the following lines into the text editor: +``` +#!/bin/bash + for i in {1..50} do armadactl submit ./docs/quickstart/job-queue-a.yaml armadactl submit ./docs/quickstart/job-queue-b.yaml done ``` +Save the file with a `.sh` extension (e.g. `myscript.sh`) in the root directory of the repository. +Open Git Bash, `cd` into the repository, and run the script with `./myscript.sh`. + +#### Linux: + +Open a terminal text editor and create a new file, for example by running `nano myscript.sh` (substitute your preferred editor if needed). +Copy and paste the script content from above into the editor. +Save the file and exit the editor. +Make the script executable by running `chmod +x myscript.sh`. +Run it with `./myscript.sh`. + +#### macOS: + +Follow the same steps as for Linux; the script declares `#!/bin/bash`, so it runs the same way in the default macOS terminal. +Either way, you end up with a shell script containing the whole submission loop, which you can run in one go by executing the file. 
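+Alternatively, you can skip the script file and run the same loop directly in any Bash-compatible shell. A minimal sketch, assuming the `armadactl` alias from earlier is set up and both job files point at your queue:
+```bash
+for i in {1..50}; do
+  armadactl submit ./docs/quickstart/job-queue-a.yaml
+  armadactl submit ./docs/quickstart/job-queue-b.yaml
+done
+```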
## Observing job progress diff --git a/docs/design/diagrams/relationships/armada_system.png b/docs/design/diagrams/relationships/armada_system.png deleted file mode 100644 index f6c95340361..00000000000 Binary files a/docs/design/diagrams/relationships/armada_system.png and /dev/null differ diff --git a/docs/design/diagrams/relationships/armada_v1_system.png b/docs/design/diagrams/relationships/armada_v1_system.png new file mode 100644 index 00000000000..f7921b52299 Binary files /dev/null and b/docs/design/diagrams/relationships/armada_v1_system.png differ diff --git a/docs/design/diagrams/relationships/armada_v2_system.png b/docs/design/diagrams/relationships/armada_v2_system.png new file mode 100644 index 00000000000..bbf244c1fd4 Binary files /dev/null and b/docs/design/diagrams/relationships/armada_v2_system.png differ diff --git a/docs/design/diagrams/relationships/generate.py b/docs/design/diagrams/relationships/generate.py index 43f64d559d9..bb4050850db 100644 --- a/docs/design/diagrams/relationships/generate.py +++ b/docs/design/diagrams/relationships/generate.py @@ -1,147 +1,11 @@ -from diagrams import Cluster, Diagram, Edge -from diagrams.onprem.database import PostgreSQL -from diagrams.onprem.inmemory import Redis -from diagrams.k8s.controlplane import API -from diagrams.custom import Custom +import subprocess -graph_attr = { - "concentrate": "false", - "splines": "ortho", - "pad": "2", - "nodesep": "0.30", - "ranksep": "1.5", - "fontsize": "20", -} +def run_scripts(): + # Run generate_v1.py + subprocess.run(["python", "generate_v1.py"]) -node_attr = { - # decrease image size - "fixedsize": "true", - "width": "1", - "height": "1", - "fontsize": "15", -} + # Run generate_v2.py + subprocess.run(["python", "generate_v2.py"]) -edge_attr = { - "minlen": "1", -} - -cluster_attr_common = { - "margin": "20", - "fontsize": "15", -} - -cluster_attr_server = { - "labelloc": "b", - "bgcolor": "#c7ffd5", -} -cluster_attr_server = {**cluster_attr_common, **cluster_attr_server} - -cluster_attr_exec = { - "labelloc": "t", - "bgcolor": "#c7ffd5", -} - -cluster_attr_exec = {**cluster_attr_common, **cluster_attr_exec} - -armada_logo = "../files/armada.png" -pulsar_logo = "../files/pulsar.png" -browser_logo = "../files/browser.png" - -with Diagram( - name="Armada Systems Diagram", - show=False, - direction="LR", - graph_attr=graph_attr, - edge_attr=edge_attr, - node_attr=node_attr, - filename="out/armada_systems_diagram", -): - pulsar = Custom("Pulsar", pulsar_logo) - - # Databases - postgres_lookout = PostgreSQL("Postgres (Lookout)") - postgres_scheduler = PostgreSQL("Postgres (Scheduler)") - redis_events = Redis("Redis (Events)") - - # Components - server = Custom("Server", armada_logo) - client = Custom("Client", armada_logo) - scheduler = Custom("Scheduler", armada_logo) - - # Lookout Parts - lookout_api = Custom("Lookout API", armada_logo) - lookoutUI = Custom("Lookout UI", armada_logo) - - # Ingesters - lookout_ingester = Custom("Lookout Ingester", armada_logo) - scheduler_ingester = Custom("Scheduler Ingester", armada_logo) - event_ingerster = Custom("Event Ingester", armada_logo) - - with Cluster("Executor Cluster", graph_attr=cluster_attr_server): - executor = Custom("Executor", armada_logo) - k8s_api = API("K8s API") - binoculars = Custom("Binoculars", armada_logo) - - with Cluster("Executor Cluster 2", graph_attr=cluster_attr_server): - executor2 = Custom("Executor 2", armada_logo) - k8s_api2 = API("K8s API 2") - binoculars2 = Custom("Binoculars", armada_logo) - - # Relationships - - # 
client sends requests to the server - client >> Edge(color="black") >> server - - # submit api talks to pulsar - server >> Edge(color="red") >> pulsar - - # pulsar talks to each of the ingesters - pulsar >> Edge(color="red") >> lookout_ingester - pulsar >> Edge(color="red") >> scheduler_ingester - pulsar >> Edge(color="red") >> event_ingerster - - # make postgres blue, redis orange - # lookout and scheduler ingesters talk to postgres - # the other ingesters talk to redis - lookout_ingester >> Edge(color="blue") >> postgres_lookout - scheduler_ingester >> Edge(color="blue") >> postgres_scheduler - - event_ingerster >> Edge(color="orange") >> redis_events - - # the postgres scheduler talks to the scheduler and executor api - postgres_scheduler >> Edge(color="blue") >> scheduler - - # the scheduler talks to pulsar - scheduler >> Edge(color="red") >> pulsar - - executor >> Edge(color="blue") >> k8s_api - k8s_api >> Edge(color="blue") >> executor - - executor2 >> Edge(color="blue") >> k8s_api2 - k8s_api2 >> Edge(color="blue") >> executor2 - - # The binoculars in every cluster talks to k8s, and - # then talks directly to the lookout UI - k8s_api >> Edge(color="blue") >> binoculars - binoculars >> Edge(color="black") >> lookoutUI - - k8s_api2 >> Edge(color="blue") >> binoculars2 - binoculars2 >> Edge(color="black") >> lookoutUI - - # Lookout API gets its data from postgres - # and passes it to the lookout UI - postgres_lookout >> Edge(color="blue") >> lookout_api - lookout_api >> Edge(color="black") >> lookoutUI - - # The scheduler talks to the executor api - scheduler >> Edge(color="blue") >> executor - scheduler >> Edge(color="blue") >> executor2 - - # pulsar talks to the server - pulsar >> Edge(color="red") >> server - - # redis events are given back to the server - redis_events >> Edge(color="orange") >> server - - # and passed to the client - server >> Edge(color="black") >> client +if __name__ == "__main__": + run_scripts() diff --git a/docs/design/diagrams/relationships/generate_v1.py b/docs/design/diagrams/relationships/generate_v1.py new file mode 100644 index 00000000000..f478b4e45cc --- /dev/null +++ b/docs/design/diagrams/relationships/generate_v1.py @@ -0,0 +1,144 @@ +from diagrams import Cluster, Diagram, Edge +from diagrams.onprem.database import PostgreSQL +from diagrams.onprem.inmemory import Redis +from diagrams.k8s.controlplane import API +from diagrams.custom import Custom + +graph_attr = { + "concentrate": "false", + "splines": "ortho", + "pad": "2", + "nodesep": "0.30", + "ranksep": "1.5", + "fontsize": "20", +} + +node_attr = { + # decrease image size + "fixedsize": "true", + "width": "1", + "height": "1", + "fontsize": "15", +} + +edge_attr = { + "minlen": "1", +} + +cluster_attr_common = { + "margin": "20", + "fontsize": "15", +} + +cluster_attr_server = { + "labelloc": "b", + "bgcolor": "#c7ffd5", +} +cluster_attr_server = {**cluster_attr_common, **cluster_attr_server} + +cluster_attr_exec = { + "labelloc": "t", + "bgcolor": "#c7ffd5", +} + +cluster_attr_exec = {**cluster_attr_common, **cluster_attr_exec} + +armada_logo = "./images/armada.png" +pulsar_logo = "./images/pulsar.png" +browser_logo = "./images/browser.png" + +with Diagram( + name="Armada V1 System", + show=False, + direction="LR", + graph_attr=graph_attr, + edge_attr=edge_attr, + node_attr=node_attr, + # filename="out/armada_systems_diagram", +): + pulsar = Custom("Pulsar", pulsar_logo) + + # Databases + postgres_lookout = PostgreSQL("Postgres (Lookout)") + redis_events = Redis("Redis (Events)") + + # 
Components + server = Custom("Server", armada_logo) + client = Custom("Client", armada_logo) + executorAPI = Custom("Executor API", armada_logo) + lookoutV2API = Custom("Lookout V2 API", armada_logo) + lookoutV1API = Custom("Lookout V1 API", armada_logo) + lookoutV1UI = Custom("Lookout V1 UI", armada_logo) + + # Ingesters + lookout_v2_ingester = Custom("Lookout V2 Ingester", armada_logo) + lookout_v1_ingester = Custom("Lookout V1 Ingester", armada_logo) + + with Cluster("Executor Cluster", graph_attr=cluster_attr_server): + executor = Custom("Executor", armada_logo) + k8s_api = API("K8s API") + + with Cluster("Executor Cluster 2", graph_attr=cluster_attr_server): + executor2 = Custom("Executor 2", armada_logo) + k8s_api2 = API("K8s API 2") + + # Relationships + + # The lookout V2 API talks to The Lookout V1 UI + lookoutV2API >> Edge(color="black") >> lookoutV1UI + + # Lookout V2 ingester talks to each other Postgres lookout + lookout_v2_ingester >> Edge(color="blue") >> postgres_lookout + + # Pulsar talks to lookout_ingester + pulsar >> Edge(color="red") >> lookout_v1_ingester + + # Lookout V1 Ingester talks to Lookout V1 API + lookout_v1_ingester >> Edge(color="black") >> lookoutV1API + + # Lookout V1 Ingester talks to Postgres(Lookout) + lookout_v1_ingester >> Edge(color="blue") >> postgres_lookout + + # Pulsar talks to lookout_ingester + pulsar >> Edge(color="red") >> lookout_v2_ingester + + # Lookout V2 Ingester talks to Lookout V2 API + lookout_v2_ingester >> Edge(color="black") >> lookoutV2API + + # Pulsar talks to server + pulsar >> Edge(color="red") >> server + + # Server and client talks to each other + server >> Edge(color="black") >> client + client >> Edge(color="black") >> server + + # Executor API and server talks to each other + executorAPI >> Edge(color="black") >> server + server >> Edge(color="black") >> executorAPI + + # server talks to redis_events + server >> Edge(color="orange") >> redis_events + + # in Executor Cluster + executor >> Edge(color="blue") >> k8s_api + k8s_api >> Edge(color="blue") >> executor + + # in Executor Cluster 2 + executor2 >> Edge(color="blue") >> k8s_api2 + k8s_api2 >> Edge(color="blue") >> executor2 + + # Executor talks to executor API + executor >> Edge(color="black") >> executorAPI + executorAPI >> Edge(color="black") >> executor + + # Executor 2 talks to executor API + executor2 >> Edge(color="black") >> executorAPI + executorAPI >> Edge(color="black") >> executor2 + + # lookout v1 api talks to lookout v1 UI + lookoutV1API >> Edge(color="black") >> lookoutV1UI + + + + + diff --git a/docs/design/diagrams/relationships/generate_v2.py b/docs/design/diagrams/relationships/generate_v2.py new file mode 100644 index 00000000000..d5bc6682793 --- /dev/null +++ b/docs/design/diagrams/relationships/generate_v2.py @@ -0,0 +1,148 @@ +from diagrams import Cluster, Diagram, Edge +from diagrams.onprem.database import PostgreSQL +from diagrams.onprem.inmemory import Redis +from diagrams.k8s.controlplane import API +from diagrams.custom import Custom + +graph_attr = { + "concentrate": "false", + "splines": "ortho", + "pad": "2", + "nodesep": "0.30", + "ranksep": "1.5", + "fontsize": "20", +} + +node_attr = { + # decrease image size + "fixedsize": "true", + "width": "1", + "height": "1", + "fontsize": "15", +} + +edge_attr = { + "minlen": "1", +} + +cluster_attr_common = { + "margin": "20", + "fontsize": "15", +} + +cluster_attr_server = { + "labelloc": "b", + "bgcolor": "#c7ffd5", +} +cluster_attr_server = {**cluster_attr_common, **cluster_attr_server} + 
+cluster_attr_exec = { + "labelloc": "t", + "bgcolor": "#c7ffd5", +} + +cluster_attr_exec = {**cluster_attr_common, **cluster_attr_exec} + +armada_logo = "./images/armada.png" +pulsar_logo = "./images/pulsar.png" +browser_logo = "./images/browser.png" + +with Diagram( + name="Armada V2 System", + show=False, + direction="LR", + graph_attr=graph_attr, + edge_attr=edge_attr, + node_attr=node_attr, + # filename="out/armada_systems_diagram", +): + pulsar = Custom("Pulsar", pulsar_logo) + + + # Databases + postgres_lookout = PostgreSQL("Postgres (Lookout)") + postgres_scheduler = PostgreSQL("Postgres (Scheduler)") + redis_events = Redis("Redis (Events)") + + # Components + server = Custom("Server", armada_logo) + client = Custom("Client", armada_logo) + scheduler = Custom("Scheduler", armada_logo) + + # Lookout Parts + lookout_api = Custom("Lookout V2 API", armada_logo) + lookoutUI = Custom("Lookout V2 UI", armada_logo) + + # Ingesters + lookout_ingester = Custom("Lookout V2 Ingester", armada_logo) + scheduler_ingester = Custom("Scheduler Ingester", armada_logo) + event_ingerster = Custom("Event Ingester", armada_logo) + + with Cluster("Executor Cluster", graph_attr=cluster_attr_server): + executor = Custom("Executor", armada_logo) + k8s_api = API("K8s API") + binoculars = Custom("Binoculars", armada_logo) + + with Cluster("Executor Cluster 2", graph_attr=cluster_attr_server): + executor2 = Custom("Executor 2", armada_logo) + k8s_api2 = API("K8s API 2") + binoculars2 = Custom("Binoculars", armada_logo) + + # Relationships + + # client sends requests to the server + client >> Edge(color="black") >> server + + # submit api talks to pulsar + server >> Edge(color="red") >> pulsar + + # pulsar talks to each of the ingesters + pulsar >> Edge(color="red") >> lookout_ingester + pulsar >> Edge(color="red") >> scheduler_ingester + pulsar >> Edge(color="red") >> event_ingerster + + # make postgres blue, redis orange + # lookout and scheduler ingesters talk to postgres + # the other ingesters talk to redis + lookout_ingester >> Edge(color="blue") >> postgres_lookout + scheduler_ingester >> Edge(color="blue") >> postgres_scheduler + + event_ingerster >> Edge(color="orange") >> redis_events + + # the postgres scheduler talks to the scheduler and executor api + postgres_scheduler >> Edge(color="blue") >> scheduler + + # the scheduler talks to pulsar + scheduler >> Edge(color="red") >> pulsar + + executor >> Edge(color="blue") >> k8s_api + k8s_api >> Edge(color="blue") >> executor + + executor2 >> Edge(color="blue") >> k8s_api2 + k8s_api2 >> Edge(color="blue") >> executor2 + + # The binoculars in every cluster talks to k8s, and + # then talks directly to the lookout UI + k8s_api >> Edge(color="blue") >> binoculars + binoculars >> Edge(color="black") >> lookoutUI + + k8s_api2 >> Edge(color="blue") >> binoculars2 + binoculars2 >> Edge(color="black") >> lookoutUI + + # Lookout API gets its data from postgres + # and passes it to the lookout UI + postgres_lookout >> Edge(color="blue") >> lookout_api + lookout_api >> Edge(color="black") >> lookoutUI + + # The scheduler talks to the executor api + scheduler >> Edge(color="blue") >> executor + scheduler >> Edge(color="blue") >> executor2 + + # pulsar talks to the server + pulsar >> Edge(color="red") >> server + + # redis events are given back to the server + redis_events >> Edge(color="orange") >> server + + # and passed to the client + server >> Edge(color="black") >> client diff --git a/docs/design/relationships_diagram.md b/docs/design/relationships_diagram.md 
index 010d53182a4..2f624f44403 100644 --- a/docs/design/relationships_diagram.md +++ b/docs/design/relationships_diagram.md @@ -1,14 +1,22 @@ ## Relationships Diagram -![Systems Diagram](./diagrams/relationships/armada_system.png) +These diagrams show the high-level relationships between components of Armada and third-party software. + +### Armada V1 System + +![Systems Diagram](./diagrams/relationships/armada_v1_system.png) + + +### Armada V2 System + +![Systems Diagram](./diagrams/relationships/armada_v2_system.png) -This diagram shows the high-level relationships between components of Armada and third-party softwares. For a more detailed view of Armada, see the [Scheduler Architecture Doc](./architecture.md). ### Armada Client -This is the comonent that is used by users to submit jobs to Armada, using gRPC. Current languages supported are: +This is the component that is used by users to submit jobs to Armada, using gRPC. Current languages supported are: - Go - Python - C# diff --git a/docs/developer.md b/docs/developer.md index 28c0b7f4ea3..87c827d5ced 100644 --- a/docs/developer.md +++ b/docs/developer.md @@ -111,6 +111,24 @@ You can set the `ARMADA_COMPONENTS` environment variable to choose which compone export ARMADA_COMPONENTS="server,executor" ``` +### Running the Pulsar-backed scheduler with LocalDev + +Ensure your local environment is completely torn down with: +```bash +mage LocalDevStop +``` + +And then run: + +```bash +mage LocalDev minimal-pulsar +``` + +Ensure your local dev environment is completely torn down when switching between Pulsar-backed and legacy +setups. + +If the eventingester or the scheduleringester does not come up, manually spin it up with `docker-compose up`. + ## Debugging The mage target `mage debug` supports multiple methods for debugging, and runs the appropriate parts of localdev as required. @@ -192,4 +210,4 @@ For required enviromental variables, please see [The Enviromental Variables Guid ## Finer-Grain Control If you would like to run the individual mage targets yourself, you can do so. -See the [Manually Running LocalDev](./developer/manual-localdev.md) guide for more information. \ No newline at end of file +See the [Manually Running LocalDev](./developer/manual-localdev.md) guide for more information. diff --git a/docs/python_airflow_operator.md b/docs/python_airflow_operator.md index b9bb025f100..1d820856344 100644 --- a/docs/python_airflow_operator.md +++ b/docs/python_airflow_operator.md @@ -107,7 +107,7 @@ This mutates the attributes in-place and is irreversible. ## armada.operators.armada_deferrable module -### _class_ armada.operators.armada_deferrable.ArmadaDeferrableOperator(name, armada_channel_args, job_service_channel_args, armada_queue, job_request_items, lookout_url_template=None, \*\*kwargs) +### _class_ armada.operators.armada_deferrable.ArmadaDeferrableOperator(name, armada_channel_args, job_service_channel_args, armada_queue, job_request_items, lookout_url_template=None, poll_interval=30, \*\*kwargs) Bases: `BaseOperator` Implementation of a deferrable armada operator for airflow. @@ -149,6 +149,9 @@ Airflow operators inherit from BaseOperator. be replaced with the actual job ID. + * **poll_interval** (*int*) – How often to poll jobservice to get status. 
+ + * **Returns** diff --git a/e2e/armadactl_test/armadactl_test.go b/e2e/armadactl_test/armadactl_test.go index 81311ef5883..7f357c9c330 100644 --- a/e2e/armadactl_test/armadactl_test.go +++ b/e2e/armadactl_test/armadactl_test.go @@ -179,7 +179,7 @@ jobs: containers: - name: ls imagePullPolicy: IfNotPresent - image: alpine:3.10 + image: alpine:3.18.3 command: - sh - -c diff --git a/e2e/lookout_ingester_test/lookout_ingester_test.go b/e2e/lookout_ingester_test/lookout_ingester_test.go index bad31b690e2..517da2f94e1 100644 --- a/e2e/lookout_ingester_test/lookout_ingester_test.go +++ b/e2e/lookout_ingester_test/lookout_ingester_test.go @@ -354,7 +354,7 @@ func createJobRequest(namespace string, args []string) *api.JobSubmitRequest { Containers: []v1.Container{ { Name: "container1", - Image: "alpine:3.10", + Image: "alpine:3.18.3", Command: []string{"/bin/sh", "-c"}, Args: args, Resources: v1.ResourceRequirements{ diff --git a/e2e/pulsar_test/pulsar_test.go b/e2e/pulsar_test/pulsar_test.go index ffd8b4609e5..7d526026108 100644 --- a/e2e/pulsar_test/pulsar_test.go +++ b/e2e/pulsar_test/pulsar_test.go @@ -3,7 +3,7 @@ package pulsar_test import ( "context" "fmt" - "io/ioutil" + "io" "net/http" "os" "os/exec" @@ -449,7 +449,7 @@ func TestIngress(t *testing.T) { if err != nil { return err } - httpResBytes, err := ioutil.ReadAll(httpRes.Body) + httpResBytes, err := io.ReadAll(httpRes.Body) if err != nil { return err } @@ -900,7 +900,11 @@ func receiveJobSetSequencesWithEventFilter( fmt.Println("Pulsar receive error", err) continue } - consumer.Ack(msg) + err = consumer.Ack(msg) + if err != nil { + fmt.Println("Pulsar ack error", err) + continue + } sequence := &armadaevents.EventSequence{} err = proto.Unmarshal(msg.Payload(), sequence) @@ -1012,7 +1016,7 @@ func createJobSubmitRequestWithClientId(numJobs int, clientId string) *api.JobSu Containers: []v1.Container{ { Name: "container1", - Image: "alpine:3.10", + Image: "alpine:3.18.3", Args: []string{"sleep", "5s"}, Resources: v1.ResourceRequirements{ Requests: v1.ResourceList{"cpu": cpu, "memory": memory}, @@ -1042,7 +1046,7 @@ func createWgetJobRequest(address string) *api.JobSubmitRequest { Containers: []v1.Container{ { Name: "wget", - Image: "alpine:3.10", + Image: "alpine:3.18.3", Args: []string{"wget", address, "--timeout=5"}, // Queried from the k8s services API Resources: v1.ResourceRequirements{ Requests: v1.ResourceList{"cpu": cpu, "memory": memory}, @@ -1173,7 +1177,7 @@ func createJobSubmitRequestWithEverything(numJobs int) *api.JobSubmitRequest { Containers: []v1.Container{ { Name: "container1", - Image: "alpine:3.10", + Image: "alpine:3.18.3", Args: []string{"sleep", "5s"}, Resources: v1.ResourceRequirements{ Requests: v1.ResourceList{"cpu": cpu, "memory": memory}, @@ -1240,7 +1244,7 @@ func createJobSubmitRequestWithError(numJobs int) *api.JobSubmitRequest { Containers: []v1.Container{ { Name: "container1", - Image: "alpine:3.10", + Image: "alpine:3.18.3", Args: []string{"sleep", "5s", "&&", "exit", "1"}, Resources: v1.ResourceRequirements{ Requests: v1.ResourceList{"cpu": cpu, "memory": memory}, diff --git a/e2e/setup/setup_cluster_kind.sh b/e2e/setup/setup_cluster_kind.sh index 1073c0d1d82..51e92d4ecdc 100755 --- a/e2e/setup/setup_cluster_kind.sh +++ b/e2e/setup/setup_cluster_kind.sh @@ -2,4 +2,4 @@ REPO_ROOT=$(git rev-parse --show-toplevel) echo "Setting up cluster1" kind create cluster --config ${REPO_ROOT}/e2e/setup/worker-master-config.yaml --name cluster1 --wait 3m -kind load --name cluster1 docker-image alpine:3.10 +kind load 
--name cluster1 docker-image alpine:3.18.3 diff --git a/go.mod b/go.mod index 0f03be2aadc..e82243a061e 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ replace github.com/AthenZ/athenz v1.10.39 => github.com/AthenZ/athenz v1.10.4 require ( github.com/alexbrainman/sspi v0.0.0-20180613141037-e580b900e9f5 github.com/alicebob/miniredis v2.5.0+incompatible - github.com/apache/pulsar-client-go v0.8.1-0.20220429133321-5ee63303d43e + github.com/apache/pulsar-client-go v0.11.0 github.com/avast/retry-go v3.0.0+incompatible github.com/coreos/go-oidc v2.2.1+incompatible github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f @@ -33,10 +33,9 @@ require ( github.com/hashicorp/go-multierror v1.1.1 github.com/hashicorp/golang-lru v0.5.4 github.com/instrumenta/kubeval v0.0.0-20190918223246-8d013ec9fc56 - github.com/jackc/pgconn v1.13.0 github.com/jackc/pgerrcode v0.0.0-20201024163028-a0d42d470451 github.com/jackc/pgtype v1.13.0 - github.com/jackc/pgx/v4 v4.17.2 + github.com/jackc/pgx/v4 v4.17.2 // indirect github.com/jcmturner/gokrb5/v8 v8.4.2-0.20201112171129-78f56934d598 github.com/jolestar/go-commons-pool v2.0.0+incompatible github.com/jstemmer/go-junit-report/v2 v2.0.0 @@ -78,6 +77,7 @@ require ( require ( github.com/Masterminds/semver/v3 v3.2.0 github.com/benbjohnson/immutable v0.4.3 + github.com/caarlos0/log v0.2.1 github.com/go-openapi/errors v0.20.3 github.com/go-openapi/strfmt v0.21.3 github.com/go-openapi/swag v0.22.3 @@ -102,13 +102,12 @@ require ( github.com/Azure/go-ntlmssp v0.0.0-20220621081337-cb9428e4ac1e // indirect github.com/DataDog/zstd v1.5.0 // indirect github.com/alicebob/gopher-json v0.0.0-20180125190556-5a6b3ba71ee6 // indirect - github.com/apache/pulsar-client-go/oauth2 v0.0.0-20220120090717-25e59572242e // indirect github.com/ardielle/ardielle-go v1.5.2 // indirect github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d // indirect github.com/aymanbagabas/go-osc52 v1.2.1 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/bits-and-blooms/bitset v1.4.0 // indirect github.com/blang/semver v3.5.1+incompatible // indirect - github.com/caarlos0/log v0.2.1 // indirect github.com/cespare/xxhash/v2 v2.1.2 // indirect github.com/charmbracelet/lipgloss v0.6.1-0.20220911181249-6304a734e792 // indirect github.com/danieljoos/wincred v1.1.2 // indirect @@ -142,12 +141,9 @@ require ( github.com/imdario/mergo v0.3.13 // indirect github.com/inconshreveable/mousetrap v1.0.1 // indirect github.com/invopop/jsonschema v0.7.0 // indirect - github.com/jackc/chunkreader/v2 v2.0.1 // indirect github.com/jackc/pgio v1.0.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect - github.com/jackc/pgproto3/v2 v2.3.1 // indirect github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a // indirect - github.com/jackc/puddle v1.3.0 // indirect github.com/jackc/puddle/v2 v2.2.0 // indirect github.com/jcmturner/aescts/v2 v2.0.0 // indirect github.com/jcmturner/dnsutils/v2 v2.0.0 // indirect @@ -172,8 +168,6 @@ require ( github.com/mtibben/percent v0.2.1 // indirect github.com/muesli/reflow v0.3.0 // indirect github.com/muesli/termenv v0.14.0 // indirect - github.com/onsi/ginkgo v1.16.5 // indirect - github.com/onsi/gomega v1.19.0 // indirect github.com/pelletier/go-toml/v2 v2.0.6 // indirect github.com/pierrec/lz4 v2.0.5+incompatible // indirect github.com/pmezard/go-difflib v1.0.0 // indirect @@ -201,7 +195,7 @@ require ( golang.org/x/text v0.7.0 // indirect golang.org/x/time v0.3.0 // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/protobuf 
v1.28.1 // indirect + google.golang.org/protobuf v1.30.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/square/go-jose.v2 v2.6.0 // indirect diff --git a/go.sum b/go.sum index 3b3ef71afe9..569d059b4d6 100644 --- a/go.sum +++ b/go.sum @@ -17,9 +17,6 @@ cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHOb cloud.google.com/go v0.72.0/go.mod h1:M+5Vjvlc2wnp6tjzE102Dw08nGShTscUx2nZMufOKPI= cloud.google.com/go v0.74.0/go.mod h1:VV1xSbzvo+9QJOxLDaJfTjx5e+MePCpCWwvftOeQmWk= cloud.google.com/go v0.75.0/go.mod h1:VGuuCn7PG0dwsd5XPVm2Mm3wlh3EL55/79EKB6hlPTY= -cloud.google.com/go v0.78.0/go.mod h1:QjdrLG0uq+YwhjoVOLsS1t7TW8fs36kLs4XO5R5ECHg= -cloud.google.com/go v0.79.0/go.mod h1:3bzgcEeQlzbuEAYu4mrWhKqWjmpprinYgKJLgKHnbb8= -cloud.google.com/go v0.81.0/go.mod h1:mk/AM35KwGk/Nm2YSeZbxXdrNK3KZOYHmLkOqC2V6E0= cloud.google.com/go v0.107.0 h1:qkj22L7bgkl6vIeZDlOY2po43Mx/TIa2Wsa7VR+PEww= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= @@ -45,7 +42,6 @@ cloud.google.com/go/storage v1.14.0/go.mod h1:GrKmX003DSIwi9o29oFT7YDnHYwZoctc3f dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/99designs/go-keychain v0.0.0-20191008050251-8e49817e8af4 h1:/vQbFIOMbk2FiG/kXiLl8BRyzTWDw7gX/Hz7Dd5eDMs= github.com/99designs/go-keychain v0.0.0-20191008050251-8e49817e8af4/go.mod h1:hN7oaIRCjzsZ2dE+yG5k+rsdt3qcwykqK6HVGcKwsw4= -github.com/99designs/keyring v1.1.6/go.mod h1:16e0ds7LGQQcT59QqkTg72Hh5ShM51Byv5PEmW6uoRU= github.com/99designs/keyring v1.2.1 h1:tYLp1ULvO7i3fI5vE21ReQuj99QFSs7lGm0xWyJo87o= github.com/99designs/keyring v1.2.1/go.mod h1:fc+wB5KTk9wQ9sDx0kFXB3A0MaeGHM9AwRStKOQ5vOA= github.com/AthenZ/athenz v1.10.4 h1:EhCptJxuPU2BNU0ZUTJRLrNwAFv06zMx0viN+PrV9YA= @@ -86,10 +82,8 @@ github.com/alicebob/gopher-json v0.0.0-20180125190556-5a6b3ba71ee6/go.mod h1:SGn github.com/alicebob/miniredis v2.5.0+incompatible h1:yBHoLpsyjupjz3NL3MhKMVkR41j82Yjf3KFv7ApYzUI= github.com/alicebob/miniredis v2.5.0+incompatible/go.mod h1:8HZjEj4yU0dwhYHky+DxYx+6BMjkBbe5ONFIF1MXffk= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= -github.com/apache/pulsar-client-go v0.8.1-0.20220429133321-5ee63303d43e h1:EVr1ch94kQDLDyYBex/eSXEvwOIQ2uyK3XmEyH+Dr9g= -github.com/apache/pulsar-client-go v0.8.1-0.20220429133321-5ee63303d43e/go.mod h1:BwEavf3Bej7UQ6KHhnIhVaPoCtIJOC7ti7M6CcYX904= -github.com/apache/pulsar-client-go/oauth2 v0.0.0-20220120090717-25e59572242e h1:EqiJ0Xil8NmcXyupNqXV9oYDBeWntEIegxLahrTr8DY= -github.com/apache/pulsar-client-go/oauth2 v0.0.0-20220120090717-25e59572242e/go.mod h1:Xee4tgYLFpYcPMcTfBYWE1uKRzeciodGTSEDMzsR6i8= +github.com/apache/pulsar-client-go v0.11.0 h1:fniyVbewAOcMSMLwxzhdrCFmFTorCW40jfnmQVcsrJw= +github.com/apache/pulsar-client-go v0.11.0/go.mod h1:FoijqJwgjroSKptIWp1vvK1CXs8dXnQiL8I+MHOri4A= github.com/ardielle/ardielle-go v1.5.2 h1:TilHTpHIQJ27R1Tl/iITBzMwiUGSlVfiVhwDNGM3Zj4= github.com/ardielle/ardielle-go v1.5.2/go.mod h1:I4hy1n795cUhaVt/ojz83SNVCYIGsAFAONtv2Dr7HUI= github.com/ardielle/ardielle-tools v1.5.4/go.mod h1:oZN+JRMnqGiIhrzkRN9l26Cej9dEx4jeNG6A+AdkShk= @@ -106,7 +100,6 @@ github.com/aws/aws-sdk-go v1.30.8/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU github.com/aymanbagabas/go-osc52 v1.0.3/go.mod h1:zT8H+Rk4VSabYN90pWyugflM3ZhpTZNC7cASDfUCdT4= github.com/aymanbagabas/go-osc52 v1.2.1 
h1:q2sWUyDcozPLcLabEMd+a+7Ea2DitxZVN9hTxab9L4E= github.com/aymanbagabas/go-osc52 v1.2.1/go.mod h1:zT8H+Rk4VSabYN90pWyugflM3ZhpTZNC7cASDfUCdT4= -github.com/beefsack/go-rate v0.0.0-20220214233405-116f4ca011a0/go.mod h1:6YNgTHLutezwnBvyneBbwvB8C82y3dcoOj5EQJIdGXA= github.com/benbjohnson/clock v1.0.3/go.mod h1:bGMdMPoPVvcYyt1gHDf4J2KE153Yf9BuiUKYMaxlTDM= github.com/benbjohnson/immutable v0.4.3 h1:GYHcksoJ9K6HyAUpGxwZURrbTkXA0Dh4otXGqbhdrjA= github.com/benbjohnson/immutable v0.4.3/go.mod h1:qJIKKSmdqz1tVzNtst1DZzvaqOU1onk1rc03IeM3Owk= @@ -115,11 +108,11 @@ github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+Ce github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= +github.com/bits-and-blooms/bitset v1.4.0 h1:+YZ8ePm+He2pU3dZlIZiOeAKfrBkXi1lSrXJ/Xzgbu8= +github.com/bits-and-blooms/bitset v1.4.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84= -github.com/bketelsen/crypt v0.0.4/go.mod h1:aI6NrJ0pMGgvZKL1iVgXLnfIFJtfV+bKCoqOes/6LfM= github.com/blang/semver v3.5.1+incompatible h1:cQNTCjp13qL8KC3Nbxr/y2Bqb63oX6wdnnjpJbkM4JQ= github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk= -github.com/bmizerany/perks v0.0.0-20141205001514-d9a9656a3a4b/go.mod h1:ac9efd0D1fsDb3EJvhqgXRbFx7bs2wqZ10HQPeU8U/Q= github.com/boynton/repl v0.0.0-20170116235056-348863958e3e/go.mod h1:Crc/GCZ3NXDVCio7Yr0o+SSrytpcFhLmVCIzi0s49t4= github.com/caarlos0/log v0.2.1 h1:E5vf0Sg24tUbrGanknDu2UH0CZq6cCColThb8gTQnHQ= github.com/caarlos0/log v0.2.1/go.mod h1:BLxpdZKXvWBjB6fshua4c8d7ApdYjypEDok6ibt+pXk= @@ -140,7 +133,6 @@ github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDk github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/udpa/go v0.0.0-20200629203442-efcf912fb354/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= -github.com/cockroachdb/apd v1.1.0 h1:3LFP3629v+1aKXU5Q37mxmRxX/pIu1nijXydLShEq5I= github.com/cockroachdb/apd v1.1.0/go.mod h1:8Sl8LxpKi29FqWXR16WEFZRNSz3SoPzUzeMeY4+DwBQ= github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= github.com/coreos/etcd v3.3.13+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= @@ -149,7 +141,6 @@ github.com/coreos/go-oidc v2.2.1+incompatible/go.mod h1:CgnwVTmzoESiwO9qyAFEMiHo github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= -github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f h1:lBNOc5arjvs8E5mO2tbpBpLoyyu8B6e44T7hJy6potg= github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= @@ -157,7 +148,6 @@ 
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46t github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/danieljoos/wincred v1.0.2/go.mod h1:SnuYRW9lp1oJrZX/dXJqr0cPK5gYXqx3EJbmjhLdK9U= github.com/danieljoos/wincred v1.1.2 h1:QLdCxFs1/Yl4zduvBdcHB8goaYk9RARS2SgLLRuAyr0= github.com/danieljoos/wincred v1.1.2/go.mod h1:GijpziifJoIBfYh+S7BbkdUTU4LfM+QnGqR5Vl2tAx0= github.com/davecgh/go-spew v0.0.0-20161028175848-04cdfd42973b/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -176,7 +166,6 @@ github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3 github.com/doug-martin/goqu/v9 v9.18.0 h1:/6bcuEtAe6nsSMVK/M+fOiXUNfyFF3yYtE07DBPFMYY= github.com/doug-martin/goqu/v9 v9.18.0/go.mod h1:nf0Wc2/hV3gYK9LiyqIrzBEVGlI8qW3GuDCEobC4wBQ= github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= -github.com/dvsekhvalnov/jose2go v0.0.0-20200901110807-248326c1351b/go.mod h1:7BvyPhdbLxMXIYTFPLsyJRFMsKmOZnQmzh6Gb+uquuM= github.com/dvsekhvalnov/jose2go v1.5.0 h1:3j8ya4Z4kMCwT5nXIKFSV84YS+HdqSSO0VsTQxaLAeM= github.com/dvsekhvalnov/jose2go v1.5.0/go.mod h1:QsHjhyTlD/lAVqn/NSbVZmSCGeDehTB/mPZadG+mhXU= github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc= @@ -273,7 +262,6 @@ github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LB github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/go-stack/stack v1.8.1/go.mod h1:dcoOX6HbPZSZptuspn9bctJ+N/CnF5gGygcUP3XYfe4= -github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= github.com/gobuffalo/attrs v0.0.0-20190224210810-a9411de4debd/go.mod h1:4duuawTqi2wkkpB4ePgWMaai6/Kc6WEz83bhFwpHzj0= github.com/gobuffalo/depgen v0.0.0-20190329151759-d478694a28d3/go.mod h1:3STtPUQYuzV0gBVOY3vy6CfMm/ljR4pABfrTeHNLHUY= github.com/gobuffalo/depgen v0.1.0/go.mod h1:+ifsuy7fhi15RWncXQQKjWS9JPkdah5sZvtHc2RXGlg= @@ -302,8 +290,6 @@ github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= github.com/godbus/dbus v0.0.0-20190726142602-4481cbc300e2 h1:ZpnhV/YsD2/4cESfV5+Hoeu/iUR3ruzNvZ+yQfO03a0= github.com/godbus/dbus v0.0.0-20190726142602-4481cbc300e2/go.mod h1:bBOAhwG1umN6/6ZUMtDFBMQR8jRg9O75tm9K00oMsK4= -github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= -github.com/gofrs/uuid v4.0.0+incompatible h1:1SD/1F5pU8p29ybwgQSwpQk+mwdRrXCYuPhW6m+TnJw= github.com/gofrs/uuid v4.0.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= @@ -326,7 +312,6 @@ github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= 
-github.com/golang/mock v1.5.0/go.mod h1:CWnOUgYIOo4TcNZ0wHX3YZCqsaM1I1Jvs6v3mP3KVu8= github.com/golang/mock v1.6.0 h1:ErTB+efbowRARo13NNdxyJji2egdxLGQhRaY+DUumQc= github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -344,7 +329,6 @@ github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QD github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.1/go.mod h1:DopwsBzvsk0Fs44TXzsVbJyPhcCPeIwnvohx4u74HPM= github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= @@ -363,7 +347,6 @@ github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= @@ -385,8 +368,6 @@ github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hf github.com/google/pprof v0.0.0-20201023163331-3e6fc7fc9c4c/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20201203190320-1bf35d6f28c2/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20201218002935-b9804c9f04c2/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= -github.com/google/pprof v0.0.0-20210122040257-d980be63207e/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= -github.com/google/pprof v0.0.0-20210226084205-cbba55b83ad5/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbuBVKCudVG457BR2GZFIz3uw3hQ= github.com/google/protobuf v3.11.4+incompatible/go.mod h1:lUQ9D1ePzbH2PrIS7ob/bjm9HXyH5WHB0Akwh7URreM= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= @@ -477,6 +458,7 @@ github.com/instrumenta/kubeval v0.0.0-20190918223246-8d013ec9fc56 h1:kKOrEaxR9Kv github.com/instrumenta/kubeval v0.0.0-20190918223246-8d013ec9fc56/go.mod h1:bpiMYvNpVxWjdJsS0hDRu9TrobT5GfWCZwJseGUstxE= github.com/invopop/jsonschema v0.7.0 h1:2vgQcBz1n256N+FpX3Jq7Y17AjYt46Ig3zIWyy770So= github.com/invopop/jsonschema v0.7.0/go.mod h1:O9uiLokuu0+MGFlyiaqtWxwqJm41/+8Nj0lD7A36YH0= +github.com/jackc/chunkreader v1.0.0 h1:4s39bBR8ByfqH+DKm8rQA3E1LHZWB9XWcrz8fqaZbe0= github.com/jackc/chunkreader v1.0.0/go.mod h1:RT6O25fNZIuasFJRyZ4R/Y2BbhasbmZXF9QQ7T3kePo= github.com/jackc/chunkreader/v2 v2.0.0/go.mod h1:odVSm741yZoC3dpHEUXIqA9tQRhFrgOHwnPIn9lDKlk= github.com/jackc/chunkreader/v2 v2.0.1 h1:i+RDz65UE+mmpjTfyz0MoVTnzeYxroil2G82ki7MGG8= @@ -495,10 +477,10 @@ github.com/jackc/pgio v1.0.0 
h1:g12B9UwVnzGhueNavwioyEEpAmqMe1E/BN9ES+8ovkE= github.com/jackc/pgio v1.0.0/go.mod h1:oP+2QK2wFfUWgr+gxjoBH9KGBb31Eio69xUb0w5bYf8= github.com/jackc/pgmock v0.0.0-20190831213851-13a1b77aafa2/go.mod h1:fGZlG77KXmcq05nJLRkk0+p82V8B8Dw8KN2/V9c/OAE= github.com/jackc/pgmock v0.0.0-20201204152224-4fe30f7445fd/go.mod h1:hrBW0Enj2AZTNpt/7Y5rr2xe/9Mn757Wtb2xeBzPv2c= -github.com/jackc/pgmock v0.0.0-20210724152146-4ad1a8207f65 h1:DadwsjnMwFjfWc9y5Wi/+Zz7xoE5ALHsRQlOctkOiHc= github.com/jackc/pgmock v0.0.0-20210724152146-4ad1a8207f65/go.mod h1:5R2h2EEX+qri8jOWMbJCtaPWkrrNc7OHwsp2TCqp7ak= github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= +github.com/jackc/pgproto3 v1.1.0 h1:FYYE4yRw+AgI8wXIinMlNjBbp/UitDJwfj5LqqewP1A= github.com/jackc/pgproto3 v1.1.0/go.mod h1:eR5FA3leWg7p9aeAqi37XOTgTIbkABlvcPB3E5rlc78= github.com/jackc/pgproto3/v2 v2.0.0-alpha1.0.20190420180111-c116219b62db/go.mod h1:bhq50y+xrl9n5mRYyCBFKkpRVTLYJVWeCc+mEAI3yXA= github.com/jackc/pgproto3/v2 v2.0.0-alpha1.0.20190609003834-432c2951c711/go.mod h1:uH0AWtUmuShn0bcesswc4aBTWGvw0cAxIJp+6OB//Wg= @@ -529,7 +511,6 @@ github.com/jackc/pgx/v5 v5.3.1/go.mod h1:t3JDKnCBlYIc0ewLF0Q7B8MXmoIaBOZj/ic7iHo github.com/jackc/puddle v0.0.0-20190413234325-e4ced69a3a2b/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= github.com/jackc/puddle v0.0.0-20190608224051-11cab39313c9/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= github.com/jackc/puddle v1.1.3/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= -github.com/jackc/puddle v1.3.0 h1:eHK/5clGOatcjX3oWGBO/MpxpbHzSwud5EWTSCI+MX0= github.com/jackc/puddle v1.3.0/go.mod h1:m4B5Dj62Y0fbyuIc15OsIqK0+JU8nkqQjsgx7dvjSWk= github.com/jackc/puddle/v2 v2.2.0 h1:RdcDk92EJBuBS55nQMMYFXTxwstHug4jkhT5pq8VxPk= github.com/jackc/puddle/v2 v2.2.0/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= @@ -573,12 +554,10 @@ github.com/karrick/godirwalk v1.8.0/go.mod h1:H5KPZjojv4lE+QYImBI8xVtrBRgYrIVsaR github.com/karrick/godirwalk v1.10.3/go.mod h1:RoGL9dQei4vP9ilrpETWE8CLOZ1kiN0LhBygSwrAsHA= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= -github.com/keybase/go-keychain v0.0.0-20190712205309-48d3d31d256d/go.mod h1:JJNrCn9otv/2QP4D7SMJBgaleKpOf66PnW6F5WGNRIc= github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/klauspost/compress v1.14.4/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= github.com/klauspost/compress v1.15.15 h1:EF27CXIuDsYJ6mmvtBRlEuB2UVOqHG1tAXgZ7yIO+lw= github.com/klauspost/compress v1.15.15/go.mod h1:ZcK2JAFqKOpnBlxcLsJzYfrS9X1akm9fHZNnD9+Vo/4= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= @@ -614,7 +593,6 @@ github.com/magefile/mage v1.14.0 h1:6QDX3g6z1YvJ4olPhT1wksUcSa/V0a1B+pJb73fBjyo= github.com/magefile/mage v1.14.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= 
github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= -github.com/magiconair/properties v1.8.5/go.mod h1:y3VJvCyxH9uVvJTWEGAELF3aiYNyPKd5NZ3oSwXrF60= github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= @@ -697,7 +675,6 @@ github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+ github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= -github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= github.com/onsi/ginkgo v0.0.0-20170829012221-11459a886d9c/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= @@ -705,12 +682,10 @@ github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+W github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= github.com/onsi/ginkgo v1.14.0/go.mod h1:iSB4RoI2tjJc9BBv4NKIKWKya62Rps+oPG/Lv9klQyY= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= -github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= github.com/onsi/gomega v0.0.0-20170829124025-dcabb60a477c/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= github.com/onsi/gomega v1.19.0 h1:4ieX6qQjPP/BfC3mpsAtIGGlxTWPeA3Inl/7DtXw1tw= -github.com/onsi/gomega v1.19.0/go.mod h1:LY+I3pBVzYsTBU1AnDwOSxaYi9WoWiqgwooUqq9yPro= github.com/openconfig/gnmi v0.0.0-20200414194230-1597cc0f2600/go.mod h1:M/EcuapNQgvzxo1DDXHK4tx3QpYM/uG4l591v33jG2A= github.com/openconfig/goyang v0.0.0-20200115183954-d0a48929f0ea/go.mod h1:dhXaV0JgHJzdrHi2l+w0fZrwArtXL7jEFoiqLEdmkvU= github.com/openconfig/goyang v1.2.0 h1:mChUZvp1kCWq6Q00wVCtOToddFzEsGlMGG+V+wNXva8= @@ -725,7 +700,6 @@ github.com/pborman/getopt v0.0.0-20190409184431-ee0cd42419d3/go.mod h1:85jBQOZwp github.com/pelletier/go-toml v0.0.0-20180724185102-c2dbbc24a979/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= github.com/pelletier/go-toml v1.7.0/go.mod h1:vwGMzjaWMwyfHwgIBhI2YUM4fB6nL6lVAvS1LBMMhTE= -github.com/pelletier/go-toml v1.9.3/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= github.com/pelletier/go-toml/v2 v2.0.6 h1:nrzqCb7j9cDFj2coyLNLaZuJTLjWjlaz6nvTvIwycIU= github.com/pelletier/go-toml/v2 v2.0.6/go.mod h1:eumQOmlWiOPt5WriQQqoM5y18pDHwha2N+QD+EUNTek= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= @@ -736,7 +710,6 @@ github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod 
h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZI= github.com/pkg/sftp v1.13.1/go.mod h1:3HaPG6Dq1ILlpPZRO0HVMrsydcdLt6HRDccSgb87qRg= github.com/pmezard/go-difflib v0.0.0-20151028094244-d8ed2627bdf0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -749,7 +722,6 @@ github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDf github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= -github.com/prometheus/client_golang v1.11.1/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= github.com/prometheus/client_golang v1.14.0 h1:nJdhIvne2eSX/XRAFV9PcvFFRbrjbcTUj0VP62TMhnw= github.com/prometheus/client_golang v1.14.0/go.mod h1:8vpkKitgIVNcqrRBWh1C4TIUQgYNtG/XQE4E/Zae36Y= @@ -808,7 +780,6 @@ github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg github.com/segmentio/fasthash v1.0.3 h1:EI9+KE1EwvMLBWwjpRDc+fEM+prwxDYbslddQGtrmhM= github.com/segmentio/fasthash v1.0.3/go.mod h1:waKX8l2N8yckOgmSsXJi7x1ZfdKZ4x7KRMzBtS3oedY= github.com/shopspring/decimal v0.0.0-20180709203117-cd690d0c9e24/go.mod h1:M+9NzErvs504Cn4c5DxATwIqPbtswREoFCre64PpcG4= -github.com/shopspring/decimal v1.2.0 h1:abSATXmQEYyShuxI4/vyW3tV1MrKAJzCZ/0zLUXYbsQ= github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= @@ -827,18 +798,15 @@ github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2 github.com/spf13/afero v1.1.1/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= github.com/spf13/afero v1.2.2/go.mod h1:9ZxEEn6pIJ8Rxe320qSDBk6AsU0r9pR7Q4OcevTdifk= -github.com/spf13/afero v1.6.0/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= github.com/spf13/afero v1.9.3 h1:41FoI0fD7OR7mGcKE/aOiLkGreyf8ifIOQmJANWogMk= github.com/spf13/afero v1.9.3/go.mod h1:iUV7ddyEEZPO5gA3zD4fJt6iStLlL+Lg4m2cihcDf8Y= github.com/spf13/cast v1.2.0/go.mod h1:r2rcYCSwa1IExKTDiTfzaxqT2FNHs8hODu4LnUfgKEg= github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= -github.com/spf13/cast v1.3.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= github.com/spf13/cast v1.5.0 h1:rj3WzYc11XZaIZMPKmwP96zkFEnnAmV8s6XbB2aY32w= github.com/spf13/cast v1.5.0/go.mod h1:SpXXQ5YoyJw6s3/6cMTQuxvgRl3PCJiyaX9p6b155UU= github.com/spf13/cobra v0.0.0-20180820174524-ff0d02e85550/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ= github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ= github.com/spf13/cobra v1.1.3/go.mod h1:pGADOWyqRD/YMrPZigI/zbliZ2wVD/23d+is3pSWzOo= -github.com/spf13/cobra v1.2.1/go.mod h1:ExllRjgxM/piMAM+3tAZvg8fsklGAf3tPfi+i8t68Nk= github.com/spf13/cobra v1.6.1 h1:o94oiPyS4KD1mPy2fmcYYHHfCxLqYjJOhGsCHFZtEzA= github.com/spf13/cobra v1.6.1/go.mod h1:IOw/AERYS7UzyrGinqmz6HLUo219MORXGxhbaJUqzrY= 
github.com/spf13/jwalterweatherman v0.0.0-20180814060501-14d3d4c51834/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= @@ -852,7 +820,6 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/viper v1.1.0/go.mod h1:A8kyI5cUJhb8N+3pkfONlcEcZbueH6nhAm0Fq7SrnBM= github.com/spf13/viper v1.7.0/go.mod h1:8WkrPz2fc9jxqZNCJI/76HCieCp4Q8HaLFoCha5qpdg= -github.com/spf13/viper v1.8.1/go.mod h1:o0Pch8wJ9BVSWGQMbra6iw0oQ5oktSIBaujf1rJH9Ns= github.com/spf13/viper v1.15.0 h1:js3yy885G8xwJa6iOISGFwd+qlUo5AvyXb7CiihdtiU= github.com/spf13/viper v1.15.0/go.mod h1:fFcTBJxvhhzSJiZy8n+PeW6t8l+KeT/uTARa0jHOQLA= github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= @@ -907,9 +874,6 @@ github.com/yuin/gopher-lua v0.0.0-20190514113301-1cd887cd7036 h1:1b6PAtenNyhsmo/ github.com/yuin/gopher-lua v0.0.0-20190514113301-1cd887cd7036/go.mod h1:gqRgreBUhTSL0GeU64rtZ3Uq3wtjOa/TB2YfrtkCbVQ= github.com/zenazn/goji v0.9.0/go.mod h1:7S9M489iMyHBNxwZnk9/EHS098H4/F6TATF2mIxtB1Q= go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= -go.etcd.io/etcd/api/v3 v3.5.0/go.mod h1:cbVKeC6lCfl7j/8jBhAK6aIYO9XOjdptoxU/nLQcPvs= -go.etcd.io/etcd/client/pkg/v3 v3.5.0/go.mod h1:IJHfcCEKxYu1Os13ZdwCwIUTUVGYTSAM3YSwc9/Ac1g= -go.etcd.io/etcd/client/v2 v2.305.0/go.mod h1:h9puh54ZTgAKtEbut2oe9P4L/oqKCVB6xsXlzd7alYQ= go.mongodb.org/mongo-driver v1.7.3/go.mod h1:NqaYOwnXWr5Pm7AOpO5QFxKJ503nbMse/R79oO62zWg= go.mongodb.org/mongo-driver v1.7.5/go.mod h1:VXEWRZ6URJIkUq2SCAyapmhH0ZLRBP+FT4xhp5Zvxng= go.mongodb.org/mongo-driver v1.8.3/go.mod h1:0sQWfOeY63QTntERDJJ/0SuKK0T1uVSgKCuAROlKEPY= @@ -922,7 +886,6 @@ go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= -go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E= go.opentelemetry.io/contrib v0.20.0/go.mod h1:G/EtFaa6qaN7+LxqfIAT3GiZa7Wv5DTBUzl5H4LY0Kc= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.20.0/go.mod h1:2AboqHi0CiIZU0qwhtUfCYD1GeUzvvIXWNkhDt7ZMG4= go.opentelemetry.io/otel v0.20.0/go.mod h1:Y3ugLH2oa81t5QO+Lty+zXf8zC9L26ax4Nzoxm/dooo= @@ -1054,12 +1017,9 @@ golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81R golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201031054903-ff519b6c9102/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201209123823-ac852fbbde11/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20210316092652-d523dce5a7f4/go.mod h1:RBQZq4jEuRlivfhVLdyRGr576XBO4/greRjx4P4O3yc= golang.org/x/net 
v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.0.0-20210421230115-4e50805a0758/go.mod h1:72T/g9IO56b78aLF+1Kcs5dz7/ng1VjMUvfKvpfy+jM= golang.org/x/net v0.0.0-20210520170846-37e1c6afe023/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= @@ -1078,9 +1038,6 @@ golang.org/x/oauth2 v0.0.0-20200902213428-5d25da1a8d43/go.mod h1:KelEdhl1UZF7XfJ golang.org/x/oauth2 v0.0.0-20201109201403-9fd604954f58/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20201208152858-08078c50e5b5/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20210218202405-ba52d332ba99/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= -golang.org/x/oauth2 v0.0.0-20210220000619-9bb904979d93/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= -golang.org/x/oauth2 v0.0.0-20210313182246-cd4f82c27b84/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= -golang.org/x/oauth2 v0.0.0-20210402161424-2e8d93401602/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.4.0 h1:NF0gk8LVPg1Ml7SSbGyySuoxdsXitj7TvgvuRxIMc/M= @@ -1121,7 +1078,6 @@ golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20190531175056-4c3a928424d2/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190712062909-fae7ac547cb7/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -1155,16 +1111,11 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201201145000-ef89a241ccb3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210104204734-6f8348627aad/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210220050731-9a76102bfb43/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210225134936-a50acf3fe073/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210305230114-8fe3ee5dd75b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210315160823-c6e025ad8005/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys 
v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210420072515-93ed5bcd2bfe/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423185535-09eb48e85fd7/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -1267,7 +1218,6 @@ golang.org/x/tools v0.0.0-20200904185747-39188db58858/go.mod h1:Cj7w3i3Rnn0Xh82u golang.org/x/tools v0.0.0-20201110124207-079ba7bd75cd/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20201201161351-ac6f37ff4c2a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20201208233053-a543418bbed2/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210105154028-b0ab187a4818/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210108195828-e2f9c7f1fc8e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= @@ -1301,9 +1251,6 @@ google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz513 google.golang.org/api v0.35.0/go.mod h1:/XrVsuzM0rZmrsbjJutiuftIzeuTQcEeaYcSk/mQ1dg= google.golang.org/api v0.36.0/go.mod h1:+z5ficQTmoYpPn8LCUNVpK5I7hwkpjbcgqA7I34qYtE= google.golang.org/api v0.40.0/go.mod h1:fYKFpnQN0DsDSKRVRcQSDQNtqWPfM9i+zNPxepjRCQ8= -google.golang.org/api v0.41.0/go.mod h1:RkxM5lITDfTzmyKFPt+wGrCJbVfniCr2ool8kTBzRTU= -google.golang.org/api v0.43.0/go.mod h1:nQsDGjRXMo4lvh5hP0TKqF244gqhGcr/YSIykhUk/94= -google.golang.org/api v0.44.0/go.mod h1:EBOGZqzyhtvMDoxwS97ctnh0zUmYY6CxqXsc1AvkYD8= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -1350,12 +1297,7 @@ google.golang.org/genproto v0.0.0-20201201144952-b05cb90ed32e/go.mod h1:FWY/as6D google.golang.org/genproto v0.0.0-20201210142538-e3217bee35cc/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20201214200347-8c77b98c765d/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20210108203827-ffc7fda8c3d7/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto v0.0.0-20210222152913-aa3ee6e6a81c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20210226172003-ab064af71705/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto v0.0.0-20210303154014-9728d6b83eeb/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto v0.0.0-20210310155132-4ce2db91004e/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto v0.0.0-20210319143718-93e7006c17a6/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= -google.golang.org/genproto v0.0.0-20210402141018-6c239bbf2bb1/go.mod h1:9lPAdzaEmUacj36I+k7YKbEc5CXzPIeORRgDAUOu28A= google.golang.org/genproto v0.0.0-20210602131652-f16073e35f0c/go.mod h1:UODoCrxHCcBojKKwX1terBiRUaqAsFqJiF615XL43r0= 
google.golang.org/genproto v0.0.0-20221227171554-f9683d7f8bef h1:uQ2vjV/sHTsWSqdKeLqmwitzgvjMl7o4IdtHwUDXSJY= google.golang.org/genproto v0.0.0-20221227171554-f9683d7f8bef/go.mod h1:RGgjbofJ8xD9Sq1VVhDM1Vok1vRONV+rg+CjzG4SZKM= @@ -1377,7 +1319,6 @@ google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv google.golang.org/grpc v1.34.0/go.mod h1:WotjhfgOW/POjDeRt8vscBtXq+2VjORFy659qA51WJ8= google.golang.org/grpc v1.35.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= -google.golang.org/grpc v1.36.1/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU= google.golang.org/grpc v1.37.0/go.mod h1:NREThFqKR1f3iQ6oBuvc5LadQuXVGo9rkm5ZGrQdJfM= google.golang.org/grpc v1.38.0/go.mod h1:NREThFqKR1f3iQ6oBuvc5LadQuXVGo9rkm5ZGrQdJfM= google.golang.org/grpc v1.52.0 h1:kd48UiU7EHsV4rnLyOJRuP/Il/UHE7gdDAQ+SZI7nZk= @@ -1394,8 +1335,8 @@ google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGj google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.28.1 h1:d0NfwRgPtno5B1Wa6L2DAG+KivqkdutMf1UhdNx175w= -google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= +google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -1410,7 +1351,6 @@ gopkg.in/inconshreveable/log15.v2 v2.0.0-20180818164646-67afb5ed74ec/go.mod h1:a gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/ini.v1 v1.51.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= -gopkg.in/ini.v1 v1.62.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/natefinch/lumberjack.v2 v2.0.0/go.mod h1:l0ndWWf7gzL7RNwBG7wST/UCcT4T24xpD6X8LsfU/+k= diff --git a/internal/armada/configuration/types.go b/internal/armada/configuration/types.go index 4473a58bd50..77fc34a548c 100644 --- a/internal/armada/configuration/types.go +++ b/internal/armada/configuration/types.go @@ -86,6 +86,8 @@ type PulsarConfig struct { ReceiveTimeout time.Duration // Backoff from polling when Pulsar returns an error BackoffTime time.Duration + // Number of pulsar messages that will be queued by the pulsar consumer. + ReceiverQueueSize int } // DatabaseConfig represents the configuration of the database connection. @@ -107,6 +109,8 @@ type DatabaseConfig struct { } type SchedulingConfig struct { + // Set to true to disable scheduling + DisableScheduling bool // Set to true to enable scheduler assertions. This results in some performance loss. EnableAssertions bool // If true, schedule jobs across all executors in the same pool in a unified manner. 
@@ -216,6 +220,10 @@ type SchedulingConfig struct { MaxUnacknowledgedJobsPerExecutor uint // If true, do not during scheduling skip jobs with requirements known to be impossible to meet. AlwaysAttemptScheduling bool + // The frequency at which the scheduler updates the cluster state. + ExecutorUpdateFrequency time.Duration + // Enable new preemption strategy. + EnableNewPreemptionStrategy bool } // FairnessModel controls how fairness is computed. @@ -323,6 +331,18 @@ type MetricsConfig struct { Port uint16 RefreshInterval time.Duration ExposeSchedulingMetrics bool + Metrics SchedulerMetricsConfig +} + +type SchedulerMetricsConfig struct { + ScheduleCycleTimeHistogramSettings HistogramConfig + ReconcileCycleTimeHistogramSettings HistogramConfig +} + +type HistogramConfig struct { + Start float64 + Factor float64 + Count int } type EventApiConfig struct { diff --git a/internal/armada/repository/apimessages/conversions.go b/internal/armada/repository/apimessages/conversions.go index 3cf19b240cd..4c7eb890ff1 100644 --- a/internal/armada/repository/apimessages/conversions.go +++ b/internal/armada/repository/apimessages/conversions.go @@ -54,11 +54,12 @@ func FromEventSequence(es *armadaevents.EventSequence) ([]*api.EventMessage, err case *armadaevents.EventSequence_Event_ReprioritiseJobSet, *armadaevents.EventSequence_Event_CancelJobSet, *armadaevents.EventSequence_Event_JobRunSucceeded, + *armadaevents.EventSequence_Event_JobRequeued, *armadaevents.EventSequence_Event_PartitionMarker: // These events have no api analog right now, so we ignore - log.Debugf("Ignoring event") + log.Debugf("ignoring event type %T", esEvent) default: - log.Warnf("Unknown event type: %T", esEvent) + log.Warnf("unknown event type: %T", esEvent) convertedEvents = nil } if err != nil { diff --git a/internal/armada/repository/job.go b/internal/armada/repository/job.go index ba315b7fe6c..9484ae3ead3 100644 --- a/internal/armada/repository/job.go +++ b/internal/armada/repository/job.go @@ -1076,10 +1076,9 @@ func addJob(db redis.Cmdable, job *api.Job, jobData *[]byte) *redis.Cmd { jobObjectPrefix + job.Id, jobSetPrefix + job.JobSetId, jobSetPrefix + job.Queue + keySeparator + job.JobSetId, - jobClientIdPrefix + job.Queue + keySeparator + job.ClientId, jobExistsPrefix + job.Id, }, - job.Id, job.Priority, *jobData, job.ClientId) + job.Id, job.Priority, *jobData) } // This script will create the queue if it doesn't already exist. 
@@ -1089,28 +1088,17 @@ local queueKey = KEYS[1] local jobKey = KEYS[2] local jobSetKey = KEYS[3] local jobSetQueueKey = KEYS[4] -local jobClientIdKey = KEYS[5] -local jobExistsKey = KEYS[6] +local jobExistsKey = KEYS[5] local jobId = ARGV[1] local jobPriority = ARGV[2] local jobData = ARGV[3] -local clientId = ARGV[4] - local jobExists = redis.call('EXISTS', jobExistsKey) if jobExists == 1 then return '-1' end -if clientId ~= '' then - local existingJobId = redis.call('GET', jobClientIdKey) - if existingJobId then - return existingJobId - end - redis.call('SET', jobClientIdKey, jobId, 'EX', 14400) -end - redis.call('SET', jobExistsKey, '1', 'EX', 604800) redis.call('SET', jobKey, jobData) redis.call('SADD', jobSetKey, jobId) diff --git a/internal/armada/repository/job_test.go b/internal/armada/repository/job_test.go index d219e1836de..9ef188db19c 100644 --- a/internal/armada/repository/job_test.go +++ b/internal/armada/repository/job_test.go @@ -17,11 +17,14 @@ import ( "github.com/armadaproject/armada/pkg/api" ) +// This test used to assert that submitting a job twice with the same clientId resulted +// in the same job id being returned. We now perform the client id check earlier in the process +// so now we assert that different ids are returned. func TestJobDoubleSubmit(t *testing.T) { withRepository(func(r *RedisJobRepository) { job1 := addTestJobWithClientId(t, r, "queue1", "my-job-1") job2 := addTestJobWithClientId(t, r, "queue1", "my-job-1") - assert.Equal(t, job1.Id, job2.Id) + assert.NotEqual(t, job1.Id, job2.Id) }) } diff --git a/internal/armada/server.go b/internal/armada/server.go index 64ae58a0eb3..e60567583bc 100644 --- a/internal/armada/server.go +++ b/internal/armada/server.go @@ -10,7 +10,7 @@ import ( "github.com/go-redis/redis" "github.com/google/uuid" grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" log "github.com/sirupsen/logrus" "golang.org/x/sync/errgroup" @@ -209,7 +209,7 @@ func Serve(ctx context.Context, config *configuration.ArmadaConfig, healthChecks } log.Info("Pulsar submit API deduplication enabled") - store, err := pgkeyvalue.New(pool, 1000000, config.Pulsar.DedupTable) + store, err := pgkeyvalue.New(ctx, pool, config.Pulsar.DedupTable) if err != nil { return err } @@ -225,9 +225,10 @@ func Serve(ctx context.Context, config *configuration.ArmadaConfig, healthChecks // Service that consumes Pulsar messages and writes to Redis consumer, err := pulsarClient.Subscribe(pulsar.ConsumerOptions{ - Topic: config.Pulsar.JobsetEventsTopic, - SubscriptionName: config.Pulsar.RedisFromPulsarSubscription, - Type: pulsar.KeyShared, + Topic: config.Pulsar.JobsetEventsTopic, + SubscriptionName: config.Pulsar.RedisFromPulsarSubscription, + Type: pulsar.KeyShared, + ReceiverQueueSize: config.Pulsar.ReceiverQueueSize, }) if err != nil { return errors.WithStack(err) diff --git a/internal/armada/server/eventsprinter.go b/internal/armada/server/eventsprinter.go index 62e55d817ce..90bbca97f83 100644 --- a/internal/armada/server/eventsprinter.go +++ b/internal/armada/server/eventsprinter.go @@ -13,6 +13,7 @@ import ( "github.com/armadaproject/armada/internal/common/logging" "github.com/armadaproject/armada/internal/common/pulsarutils/pulsarrequestid" "github.com/armadaproject/armada/internal/common/requestid" + "github.com/armadaproject/armada/internal/common/util" "github.com/armadaproject/armada/pkg/armadaevents" ) @@ -73,8 +74,9 @@ func (srv *EventsPrinter) Run(ctx 
context.Context) error { default: // Get a message from Pulsar, which consists of a sequence of events (i.e., state transitions). - ctxWithTimeout, _ := context.WithTimeout(ctx, 10*time.Second) + ctxWithTimeout, cancel := context.WithTimeout(ctx, 10*time.Second) msg, err := consumer.Receive(ctxWithTimeout) + cancel() if errors.Is(err, context.DeadlineExceeded) { // expected log.Info("no new messages from Pulsar (or another instance holds the subscription)") break @@ -82,7 +84,14 @@ func (srv *EventsPrinter) Run(ctx context.Context) error { logging.WithStacktrace(log, err).Warnf("receiving from Pulsar failed") break } - consumer.Ack(msg) + util.RetryUntilSuccess( + context.Background(), + func() error { return consumer.Ack(msg) }, + func(err error) { + logging.WithStacktrace(log, err).Warnf("acking pulsar message failed") + time.Sleep(time.Second) // Not sure what the right backoff is here + }, + ) sequence := &armadaevents.EventSequence{} if err := proto.Unmarshal(msg.Payload(), sequence); err != nil { diff --git a/internal/armada/server/lease.go b/internal/armada/server/lease.go index 4bee2145df7..508206d9758 100644 --- a/internal/armada/server/lease.go +++ b/internal/armada/server/lease.go @@ -40,6 +40,7 @@ import ( schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints" schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" "github.com/armadaproject/armada/internal/scheduler/database" + "github.com/armadaproject/armada/internal/scheduler/fairness" "github.com/armadaproject/armada/internal/scheduler/interfaces" schedulerinterfaces "github.com/armadaproject/armada/internal/scheduler/interfaces" "github.com/armadaproject/armada/internal/scheduler/nodedb" @@ -257,6 +258,7 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL log = log.WithFields(logrus.Fields{ "function": "getJobs", "cluster": req.ClusterId, + "pool": req.Pool, }) ctx = ctxlogrus.ToContext(ctx, log) @@ -453,6 +455,13 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL log.WithError(err).Warnf("could not store executor details for cluster %s", req.ClusterId) } + // At this point we've written updated usage information to Redis and are ready to start scheduling. + // Exit here if scheduling is disabled. + if q.schedulingConfig.DisableScheduling { + log.Infof("skipping scheduling on %s - scheduling disabled", req.ClusterId) + return make([]*api.Job, 0), nil + } + // Give Schedule() a 3 second shorter deadline than ctx to give it a chance to finish up before ctx deadline. 
if deadline, ok := ctx.Deadline(); ok { var cancel context.CancelFunc @@ -460,17 +469,30 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL defer cancel() } + var fairnessCostProvider fairness.FairnessCostProvider + totalResources := schedulerobjects.ResourceList{Resources: totalCapacity} + if q.schedulingConfig.FairnessModel == configuration.DominantResourceFairness { + fairnessCostProvider, err = fairness.NewDominantResourceFairness( + totalResources, + q.schedulingConfig.DominantResourceFairnessResourcesToConsider, + ) + if err != nil { + return nil, err + } + } else { + fairnessCostProvider, err = fairness.NewAssetFairness(q.schedulingConfig.ResourceScarcity) + if err != nil { + return nil, err + } + } sctx := schedulercontext.NewSchedulingContext( req.ClusterId, req.Pool, q.schedulingConfig.Preemption.PriorityClasses, q.schedulingConfig.Preemption.DefaultPriorityClass, - q.schedulingConfig.ResourceScarcity, - schedulerobjects.ResourceList{Resources: totalCapacity}, + fairnessCostProvider, + totalResources, ) - if q.schedulingConfig.FairnessModel == configuration.DominantResourceFairness { - sctx.EnableDominantResourceFairness(q.schedulingConfig.DominantResourceFairnessResourcesToConsider) - } for queue, priorityFactor := range priorityFactorByQueue { if !isActiveByQueueName[queue] { // To ensure fair share is computed only from active queues, i.e., queues with jobs queued or running. @@ -510,6 +532,13 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL if q.schedulingConfig.EnableAssertions { sch.EnableAssertions() } + if q.schedulingConfig.EnableNewPreemptionStrategy { + sch.EnableNewPreemptionStrategy() + } + log.Infof( + "starting scheduling with total resources %s", + schedulerobjects.ResourceList{Resources: totalCapacity}.CompactString(), + ) result, err := sch.Schedule( ctxlogrus.ToContext( ctx, diff --git a/internal/armada/server/submit_from_log.go b/internal/armada/server/submit_from_log.go index 7637f721a1b..13acbf9904a 100644 --- a/internal/armada/server/submit_from_log.go +++ b/internal/armada/server/submit_from_log.go @@ -113,7 +113,7 @@ func (srv *SubmitFromLog) Run(ctx context.Context) error { // If this message isn't for us we can simply ack it // and go to the next message if !schedulers.ForLegacyScheduler(msg) { - srv.Consumer.Ack(msg) + srv.ack(ctx, msg) break } @@ -137,7 +137,7 @@ func (srv *SubmitFromLog) Run(ctx context.Context) error { // Unmarshal and validate the message. sequence, err := eventutil.UnmarshalEventSequence(ctxWithLogger, msg.Payload()) if err != nil { - srv.Consumer.Ack(msg) + srv.ack(ctx, msg) logging.WithStacktrace(messageLogger, err).Warnf("processing message failed; ignoring") numErrored++ break @@ -146,7 +146,7 @@ func (srv *SubmitFromLog) Run(ctx context.Context) error { messageLogger.WithField("numEvents", len(sequence.Events)).Info("processing sequence") // TODO: Improve retry logic. 
srv.ProcessSequence(ctxWithLogger, sequence) - srv.Consumer.Ack(msg) + srv.ack(ctx, msg) } } } @@ -766,3 +766,16 @@ func (srv *SubmitFromLog) ReprioritizeJobSet( return true, nil } + +func (srv *SubmitFromLog) ack(ctx context.Context, msg pulsar.Message) { + util.RetryUntilSuccess( + ctx, + func() error { + return srv.Consumer.Ack(msg) + }, + func(err error) { + logrus.WithError(err).Warnf("Error acking pulsar message") + time.Sleep(time.Second) + }, + ) +} diff --git a/internal/armada/server/submit_test.go b/internal/armada/server/submit_test.go index 71be6a8e258..a142c7dc2bd 100644 --- a/internal/armada/server/submit_test.go +++ b/internal/armada/server/submit_test.go @@ -469,38 +469,6 @@ func TestSubmitServer_SubmitJob_ReturnsJobItemsInTheSameOrderTheyWereSubmitted(t }) } -func TestSubmitServer_SubmitJobs_HandlesDoubleSubmit(t *testing.T) { - withSubmitServer(func(s *SubmitServer, events *repository.TestEventStore) { - jobSetId := util.NewULID() - jobRequest := createJobRequest(jobSetId, 1) - - result, err := s.SubmitJobs(context.Background(), jobRequest) - assert.NoError(t, err) - - result2, err := s.SubmitJobs(context.Background(), jobRequest) - assert.NoError(t, err) - - assert.Equal(t, result.JobResponseItems[0].JobId, result2.JobResponseItems[0].JobId) - - messages := events.ReceivedEvents - assert.NoError(t, err) - assert.Equal(t, len(messages), 4) - - submitted := messages[0].GetSubmitted() - queued := messages[1].GetQueued() - submitted2 := messages[2].GetSubmitted() - duplicateFound := messages[3].GetDuplicateFound() - - assert.NotNil(t, submitted) - assert.NotNil(t, queued) - assert.NotNil(t, submitted2) - assert.NotNil(t, duplicateFound) - - assert.Equal(t, duplicateFound.OriginalJobId, submitted.JobId) - assert.Equal(t, duplicateFound.JobId, submitted2.JobId) - }) -} - func TestSubmitServer_SubmitJobs_RejectsIfTooManyJobsAreQueued(t *testing.T) { withSubmitServer(func(s *SubmitServer, events *repository.TestEventStore) { limit := 3 diff --git a/internal/armada/server/submit_to_log.go b/internal/armada/server/submit_to_log.go index 08f2002b99e..cf4b12ceca2 100644 --- a/internal/armada/server/submit_to_log.go +++ b/internal/armada/server/submit_to_log.go @@ -13,6 +13,7 @@ import ( "github.com/google/uuid" "github.com/pkg/errors" log "github.com/sirupsen/logrus" + "golang.org/x/exp/maps" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -105,12 +106,13 @@ func (srv *PulsarSubmitServer) SubmitJobs(ctx context.Context, req *api.JobSubmi return nil, err } - // Convert the API jobs to log jobs. + jobsSubmitted := make([]*api.Job, 0, len(req.JobRequestItems)) responses := make([]*api.JobSubmitResponseItem, len(req.JobRequestItems)) originalIds, err := srv.getOriginalJobIds(ctx, apiJobs) if err != nil { - return nil, err + // Deduplication is best-effort, therefore this is not fatal + log.WithError(err).Warn("Error fetching original job ids, deduplication will not occur.") } pulsarJobDetails := make([]*schedulerobjects.PulsarSchedulerJobDetails, 0) @@ -198,6 +200,8 @@ func (srv *PulsarSubmitServer) SubmitJobs(ctx context.Context, req *api.JobSubmi apiJob.ClientId, apiJob.GetId()) } + } else { + jobsSubmitted = append(jobsSubmitted, apiJob) } } @@ -225,6 +229,14 @@ func (srv *PulsarSubmitServer) SubmitJobs(ctx context.Context, req *api.JobSubmi } } + // Store the deduplication ids. 
Note that this will not be called if pulsar submission has failed, which means that + // a partial pulsar submission will not cause deduplication ids to be updated and thus we may get duplicate jobs + // if the user then resubmits. Likewise, if there is a failure in persisting the ids, we treat this as non-fatal, so + // we could get duplicate events. + err = srv.storeOriginalJobIds(ctx, jobsSubmitted) + if err != nil { + log.WithError(err).Warn("failed to store deduplication ids") + } return &api.JobSubmitResponse{JobResponseItems: responses}, nil } @@ -693,56 +705,68 @@ func (srv *PulsarSubmitServer) publishToPulsar(ctx context.Context, sequences [] return pulsarutils.PublishSequences(ctx, srv.Producer, sequences, scheduler) } +func jobKey(j *api.Job) string { + combined := fmt.Sprintf("%s:%s", j.Queue, j.ClientId) + h := sha1.Sum([]byte(combined)) + return fmt.Sprintf("%x", h) +} + // getOriginalJobIds returns the mapping between jobId and originalJobId. If the job (or more specifically the clientId // on the job) has not been seen before then jobId -> jobId. If the job has been seen before then jobId -> originalJobId // Note that if srv.KVStore is nil then this function simply returns jobId -> jobId func (srv *PulsarSubmitServer) getOriginalJobIds(ctx context.Context, apiJobs []*api.Job) (map[string]string, error) { + // Default is the current id + ret := make(map[string]string, len(apiJobs)) + for _, apiJob := range apiJobs { + ret[apiJob.GetId()] = apiJob.GetId() + } + // If we don't have a KV store, then just return original mappings if srv.KVStore == nil { - ret := make(map[string]string, len(apiJobs)) - for _, apiJob := range apiJobs { - ret[apiJob.GetId()] = apiJob.GetId() - } return ret, nil } - hash := func(queue string, clientId string) [20]byte { - combined := fmt.Sprintf("%s:%s", queue, clientId) - return sha1.Sum([]byte(combined)) - } - // Armada checks for duplicate job submissions if a ClientId (i.e., a deduplication id) is provided. // Deduplication is based on storing the combined hash of the ClientId and queue. // For storage efficiency, we store hashes instead of user-provided strings. 
- kvs := make([]*pgkeyvalue.KeyValue, 0, len(apiJobs)) + kvs := make(map[string][]byte, len(apiJobs)) for _, apiJob := range apiJobs { if apiJob.ClientId != "" { - clientIdHash := hash(apiJob.Queue, apiJob.ClientId) - kvs = append(kvs, &pgkeyvalue.KeyValue{ - Key: fmt.Sprintf("%x", clientIdHash), - Value: []byte(apiJob.GetId()), - }) + kvs[jobKey(apiJob)] = []byte(apiJob.GetId()) } } - // If we have any client Ids add them to store + // If we have any client Ids, retrieve their job ids if len(kvs) > 0 { - addedKvs, err := srv.KVStore.LoadOrStoreBatch(ctx, kvs) + keys := maps.Keys(kvs) + existingKvs, err := srv.KVStore.Load(ctx, keys) if err != nil { - return nil, err + return ret, err } - ret := make(map[string]string, len(addedKvs)) for _, apiJob := range apiJobs { - if apiJob.ClientId != "" { - clientIdHash := hash(apiJob.Queue, apiJob.ClientId) - originalJobId := addedKvs[fmt.Sprintf("%x", clientIdHash)] + originalJobId, ok := existingKvs[jobKey(apiJob)] + if apiJob.ClientId != "" && ok { ret[apiJob.GetId()] = string(originalJobId) } } - return ret, nil } + return ret, nil +} - return nil, nil +func (srv *PulsarSubmitServer) storeOriginalJobIds(ctx context.Context, apiJobs []*api.Job) error { + if srv.KVStore == nil { + return nil + } + kvs := make(map[string][]byte, 0) + for _, apiJob := range apiJobs { + if apiJob.ClientId != "" { + kvs[jobKey(apiJob)] = []byte(apiJob.GetId()) + } + } + if len(kvs) == 0 { + return nil + } + return srv.KVStore.Store(ctx, kvs) } // assignScheduler assigns each job to either the legacy or pulsar scheduler. diff --git a/internal/common/armadaerrors/errors.go b/internal/common/armadaerrors/errors.go index c84b260972e..72c375ff5fa 100644 --- a/internal/common/armadaerrors/errors.go +++ b/internal/common/armadaerrors/errors.go @@ -16,8 +16,8 @@ import ( "syscall" "github.com/apache/pulsar-client-go/pulsar" - "github.com/jackc/pgconn" "github.com/jackc/pgerrcode" + "github.com/jackc/pgx/v5/pgconn" "github.com/pkg/errors" "google.golang.org/grpc" "google.golang.org/grpc/codes" diff --git a/internal/common/certs/cached_certificate_test.go b/internal/common/certs/cached_certificate_test.go index 651794a543a..7687c80fd63 100644 --- a/internal/common/certs/cached_certificate_test.go +++ b/internal/common/certs/cached_certificate_test.go @@ -19,8 +19,9 @@ import ( ) const ( - certFilePath = "testdata/tls.crt" - keyFilePath = "testdata/tls.key" + certFilePath = "testdata/tls.crt" + keyFilePath = "testdata/tls.key" + directoryName = "testdata" ) func TestCachedCertificateService_LoadsCertificateOnStartup(t *testing.T) { @@ -105,6 +106,8 @@ func TestCachedCertificateService_ReloadsCertPeriodically_WhenUsingRun(t *testin } func writeCerts(t *testing.T, certData *bytes.Buffer, keyData *bytes.Buffer) { + err := os.MkdirAll(directoryName, 0o755) + require.NoError(t, err) if certData != nil { err := os.WriteFile(certFilePath, certData.Bytes(), 0o644) require.NoError(t, err) @@ -117,8 +120,7 @@ func writeCerts(t *testing.T, certData *bytes.Buffer, keyData *bytes.Buffer) { } func cleanup() { - os.Remove(certFilePath) - os.Remove(keyFilePath) + os.RemoveAll(directoryName) } func createCerts(t *testing.T) (*tls.Certificate, *bytes.Buffer, *bytes.Buffer) { diff --git a/internal/common/compress/compressor_test.go b/internal/common/compress/compressor_test.go index ecdbde738f1..a11d22c20bb 100644 --- a/internal/common/compress/compressor_test.go +++ b/internal/common/compress/compressor_test.go @@ -3,7 +3,7 @@ package compress import ( "bytes" "compress/zlib" - "io/ioutil" + "io" 
"testing" "github.com/stretchr/testify/assert" @@ -66,7 +66,7 @@ func decompress(b []byte) (string, error) { if err != nil { return "", err } - p, err := ioutil.ReadAll(z) + p, err := io.ReadAll(z) if err != nil { return "", err } diff --git a/internal/common/database/db_testutil.go b/internal/common/database/db_testutil.go index 1207080a5d0..a36affdef73 100644 --- a/internal/common/database/db_testutil.go +++ b/internal/common/database/db_testutil.go @@ -4,9 +4,9 @@ import ( "context" "fmt" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/pgxpool" - _ "github.com/jackc/pgx/v4/stdlib" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" + _ "github.com/jackc/pgx/v5/stdlib" "github.com/pkg/errors" "github.com/armadaproject/armada/internal/armada/configuration" @@ -34,7 +34,7 @@ func WithTestDb(migrations []Migration, action func(db *pgxpool.Pool) error) err } // Connect again: this time to the database we just created. This is the databse we use for tests - testDbPool, err := pgxpool.Connect(ctx, connectionString+" dbname="+dbName) + testDbPool, err := pgxpool.New(ctx, connectionString+" dbname="+dbName) if err != nil { return errors.WithStack(err) } diff --git a/internal/common/database/functions.go b/internal/common/database/functions.go index 23f182b40c1..5446f7cd0e1 100644 --- a/internal/common/database/functions.go +++ b/internal/common/database/functions.go @@ -8,8 +8,8 @@ import ( "time" "github.com/google/uuid" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" "github.com/armadaproject/armada/internal/armada/configuration" @@ -35,7 +35,7 @@ func OpenPgxConn(config configuration.PostgresConfig) (*pgx.Conn, error) { } func OpenPgxPool(config configuration.PostgresConfig) (*pgxpool.Pool, error) { - db, err := pgxpool.Connect(context.Background(), CreateConnectionString(config.Connection)) + db, err := pgxpool.New(context.Background(), CreateConnectionString(config.Connection)) if err != nil { return nil, err } diff --git a/internal/common/database/lookout/util.go b/internal/common/database/lookout/util.go index 29b30a104ef..a4c4690a598 100644 --- a/internal/common/database/lookout/util.go +++ b/internal/common/database/lookout/util.go @@ -1,7 +1,7 @@ package lookout import ( - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/armadaproject/armada/internal/common/database" "github.com/armadaproject/armada/internal/lookoutv2/schema" diff --git a/internal/common/database/migrations.go b/internal/common/database/migrations.go index 7ea36b9cf38..164c75b313d 100644 --- a/internal/common/database/migrations.go +++ b/internal/common/database/migrations.go @@ -9,8 +9,6 @@ import ( "strconv" "strings" - "github.com/jackc/pgtype/pgxtype" - stakikfs "github.com/rakyll/statik/fs" log "github.com/sirupsen/logrus" ) @@ -30,7 +28,7 @@ func NewMigration(id int, name string, sql string) Migration { } } -func UpdateDatabase(ctx context.Context, db pgxtype.Querier, migrations []Migration) error { +func UpdateDatabase(ctx context.Context, db Querier, migrations []Migration) error { log.Info("Updating postgres...") version, err := readVersion(ctx, db) if err != nil { @@ -57,7 +55,7 @@ func UpdateDatabase(ctx context.Context, db pgxtype.Querier, migrations []Migrat return nil } -func readVersion(ctx context.Context, db pgxtype.Querier) (int, error) { +func readVersion(ctx context.Context, db Querier) (int, error) { _, err := db.Exec(ctx, `CREATE SEQUENCE IF 
NOT EXISTS database_version START WITH 0 MINVALUE 0;`) if err != nil { @@ -77,7 +75,7 @@ func readVersion(ctx context.Context, db pgxtype.Querier) (int, error) { return version, err } -func setVersion(ctx context.Context, db pgxtype.Querier, version int) error { +func setVersion(ctx context.Context, db Querier, version int) error { _, err := db.Exec(ctx, `SELECT setval('database_version', $1)`, version) return err } diff --git a/internal/common/database/querier.go b/internal/common/database/querier.go new file mode 100644 index 00000000000..afba592b296 --- /dev/null +++ b/internal/common/database/querier.go @@ -0,0 +1,16 @@ +package database + +import ( + "context" + + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgconn" +) + +// This is a temporary interface to act as a bridge between upgrading from pgx/v4 to pgx/v5 +// TODO (Mo-Fatah): Remove this after https://github.com/armadaproject/armada/pull/2659 is ready to be used in the code. +type Querier interface { + Exec(context.Context, string, ...any) (pgconn.CommandTag, error) + Query(context.Context, string, ...any) (pgx.Rows, error) + QueryRow(context.Context, string, ...any) pgx.Row +} diff --git a/internal/common/database/upsert.go b/internal/common/database/upsert.go index 91047fb4ed9..23f27164f9b 100644 --- a/internal/common/database/upsert.go +++ b/internal/common/database/upsert.go @@ -6,10 +6,10 @@ import ( "reflect" "strings" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/google/uuid" - "github.com/jackc/pgx/v4" + "github.com/jackc/pgx/v5" "github.com/pkg/errors" ) @@ -17,7 +17,7 @@ func UpsertWithTransaction[T any](ctx context.Context, db *pgxpool.Pool, tableNa if len(records) == 0 { return nil } - return db.BeginTxFunc(ctx, pgx.TxOptions{ + return pgx.BeginTxFunc(ctx, db, pgx.TxOptions{ IsoLevel: pgx.ReadCommitted, AccessMode: pgx.ReadWrite, DeferrableMode: pgx.Deferrable, @@ -35,7 +35,7 @@ func UpsertWithTransaction[T any](ctx context.Context, db *pgxpool.Pool, tableNa // // The COPY protocol can be faster than repeated inserts for as little as 5 rows; see // https://www.postgresql.org/docs/current/populate.html -// https://pkg.go.dev/github.com/jackc/pgx/v4#hdr-Copy_Protocol +// https://pkg.go.dev/github.com/jackc/pgx/v5#hdr-Copy_Protocol // // The records to write should be structs with fields marked with "db" tags. // Field names and values are extracted using the NamesValuesFromRecord function; @@ -67,7 +67,7 @@ func Upsert[T any](ctx context.Context, tx pgx.Tx, tableName string, records []T // Use the postgres-specific COPY wire protocol to load data into the new table in a single operation. // The COPY protocol can be faster than repeated inserts for as little as 5 rows; see // https://www.postgresql.org/docs/current/populate.html - // https://pkg.go.dev/github.com/jackc/pgx/v4#hdr-Copy_Protocol + // https://pkg.go.dev/github.com/jackc/pgx/v5#hdr-Copy_Protocol // // We're guaranteed there is at least one record. 
names, _ := NamesValuesFromRecord(records[0]) diff --git a/internal/common/database/upsert_test.go b/internal/common/database/upsert_test.go index 913215a52db..b1329921c1e 100644 --- a/internal/common/database/upsert_test.go +++ b/internal/common/database/upsert_test.go @@ -7,7 +7,7 @@ import ( "time" "github.com/google/uuid" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/stretchr/testify/assert" ) diff --git a/internal/common/eventutil/eventutil_test.go b/internal/common/eventutil/eventutil_test.go index ec1e4b72245..0591a10f755 100644 --- a/internal/common/eventutil/eventutil_test.go +++ b/internal/common/eventutil/eventutil_test.go @@ -485,7 +485,7 @@ func testContainer(name string) v1.Container { memory, _ := resource.ParseQuantity("50Mi") return v1.Container{ Name: name, - Image: "alpine:3.10", + Image: "alpine:3.18.3", Command: []string{"cmd1", "cmd2"}, Args: []string{"sleep", "5s"}, Resources: v1.ResourceRequirements{ diff --git a/internal/common/ingest/ingestion_pipeline.go b/internal/common/ingest/ingestion_pipeline.go index 97a4ca05979..2b5e9a9e783 100644 --- a/internal/common/ingest/ingestion_pipeline.go +++ b/internal/common/ingest/ingestion_pipeline.go @@ -14,6 +14,7 @@ import ( "github.com/armadaproject/armada/internal/common/eventutil" commonmetrics "github.com/armadaproject/armada/internal/common/ingest/metrics" "github.com/armadaproject/armada/internal/common/pulsarutils" + "github.com/armadaproject/armada/internal/common/util" "github.com/armadaproject/armada/pkg/armadaevents" ) @@ -204,7 +205,14 @@ func (ingester *IngestionPipeline[T]) Run(ctx context.Context) error { break } else { for _, msgId := range msg.GetMessageIDs() { - ingester.consumer.AckID(msgId) + util.RetryUntilSuccess( + context.Background(), + func() error { return ingester.consumer.AckID(msgId) }, + func(err error) { + log.WithError(err).Warnf("Pulsar ack failed; backing off for %s", ingester.pulsarConfig.BackoffTime) + time.Sleep(ingester.pulsarConfig.BackoffTime) + }, + ) } } } @@ -229,6 +237,7 @@ func (ingester *IngestionPipeline[T]) subscribe() (pulsar.Consumer, func(), erro Topic: ingester.pulsarConfig.JobsetEventsTopic, SubscriptionName: ingester.pulsarSubscriptionName, Type: ingester.pulsarSubscriptionType, + ReceiverQueueSize: ingester.pulsarConfig.ReceiverQueueSize, SubscriptionInitialPosition: pulsar.SubscriptionPositionEarliest, }) if err != nil { diff --git a/internal/common/ingest/ingestion_pipeline_test.go b/internal/common/ingest/ingestion_pipeline_test.go index e2c04d2c64e..da0d653b39a 100644 --- a/internal/common/ingest/ingestion_pipeline_test.go +++ b/internal/common/ingest/ingestion_pipeline_test.go @@ -144,12 +144,13 @@ func (p *mockPulsarConsumer) Receive(ctx context.Context) (pulsar.Message, error } } -func (p *mockPulsarConsumer) AckID(messageId pulsar.MessageID) { +func (p *mockPulsarConsumer) AckID(messageId pulsar.MessageID) error { p.acked[messageId] = true p.received++ if p.received >= len(p.messages) { p.cancelFn() } + return nil } func (p *mockPulsarConsumer) assertDidAck(messages []pulsar.Message) { diff --git a/internal/common/mocks/mock_executorapi.go b/internal/common/mocks/mock_executorapi.go index cc882976a1a..3d6081ead6d 100644 --- a/internal/common/mocks/mock_executorapi.go +++ b/internal/common/mocks/mock_executorapi.go @@ -8,12 +8,11 @@ import ( context "context" reflect "reflect" + executorapi "github.com/armadaproject/armada/pkg/executorapi" types "github.com/gogo/protobuf/types" gomock "github.com/golang/mock/gomock" grpc 
"google.golang.org/grpc" metadata "google.golang.org/grpc/metadata" - - executorapi "github.com/armadaproject/armada/pkg/executorapi" ) // MockExecutorApiClient is a mock of ExecutorApiClient interface. diff --git a/internal/common/mocks/mock_pulsar.go b/internal/common/mocks/mock_pulsar.go index d362ffce4b7..50de787b91a 100644 --- a/internal/common/mocks/mock_pulsar.go +++ b/internal/common/mocks/mock_pulsar.go @@ -93,6 +93,21 @@ func (mr *MockClientMockRecorder) CreateTableView(arg0 interface{}) *gomock.Call return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateTableView", reflect.TypeOf((*MockClient)(nil).CreateTableView), arg0) } +// NewTransaction mocks base method. +func (m *MockClient) NewTransaction(arg0 time.Duration) (pulsar.Transaction, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "NewTransaction", arg0) + ret0, _ := ret[0].(pulsar.Transaction) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// NewTransaction indicates an expected call of NewTransaction. +func (mr *MockClientMockRecorder) NewTransaction(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NewTransaction", reflect.TypeOf((*MockClient)(nil).NewTransaction), arg0) +} + // Subscribe mocks base method. func (m *MockClient) Subscribe(arg0 pulsar.ConsumerOptions) (pulsar.Consumer, error) { m.ctrl.T.Helper() @@ -474,6 +489,20 @@ func (mr *MockMessageMockRecorder) RedeliveryCount() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RedeliveryCount", reflect.TypeOf((*MockMessage)(nil).RedeliveryCount)) } +// SchemaVersion mocks base method. +func (m *MockMessage) SchemaVersion() []byte { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SchemaVersion") + ret0, _ := ret[0].([]byte) + return ret0 +} + +// SchemaVersion indicates an expected call of SchemaVersion. +func (mr *MockMessageMockRecorder) SchemaVersion() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SchemaVersion", reflect.TypeOf((*MockMessage)(nil).SchemaVersion)) +} + // Topic mocks base method. func (m *MockMessage) Topic() string { m.ctrl.T.Helper() diff --git a/internal/common/pgkeyvalue/pgkeyvalue.go b/internal/common/pgkeyvalue/pgkeyvalue.go index e5ee9cdda45..8476146d727 100644 --- a/internal/common/pgkeyvalue/pgkeyvalue.go +++ b/internal/common/pgkeyvalue/pgkeyvalue.go @@ -3,24 +3,22 @@ package pgkeyvalue import ( "context" "fmt" - "strings" "time" - lru "github.com/hashicorp/golang-lru" - "github.com/jackc/pgconn" - "github.com/jackc/pgerrcode" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" "github.com/sirupsen/logrus" + "k8s.io/apimachinery/pkg/util/clock" "github.com/armadaproject/armada/internal/common/armadaerrors" + "github.com/armadaproject/armada/internal/common/database" "github.com/armadaproject/armada/internal/common/logging" ) type KeyValue struct { - Key string - Value []byte + Key string `db:"key"` + Value []byte `db:"value"` + Inserted time.Time `db:"inserted"` } // PGKeyValueStore is a time-limited key-value store backed by postgres with a local LRU cache. @@ -28,16 +26,15 @@ type KeyValue struct { // Keys can only be deleted by running the cleanup function. // Deleting keys does not cause caches to update, i.e., nodes may have an inconsistent view if keys are deleted. type PGKeyValueStore struct { - // For performance, keys are cached locally. - cache *lru.Cache // Postgres connection. 
db *pgxpool.Pool // Name of the postgres table used for storage. tableName string - Logger *logrus.Logger + // Used to set inserted time + clock clock.Clock } -func New(db *pgxpool.Pool, cacheSize int, tableName string) (*PGKeyValueStore, error) { +func New(ctx context.Context, db *pgxpool.Pool, tableName string) (*PGKeyValueStore, error) { if db == nil { return nil, errors.WithStack(&armadaerrors.ErrInvalidArgument{ Name: "db", @@ -52,102 +49,23 @@ func New(db *pgxpool.Pool, cacheSize int, tableName string) (*PGKeyValueStore, e Message: "TableName must be non-empty", }) } - cache, err := lru.New(cacheSize) + err := createTableIfNotExists(ctx, db, tableName) if err != nil { return nil, errors.WithStack(err) } return &PGKeyValueStore{ - cache: cache, db: db, tableName: tableName, + clock: clock.RealClock{}, }, nil } -// Add adds a key-value pair. Returns true if successful and false if the key already exists. -// The postgres table backing the key-value storage is created automatically if it doesn't already exist. -func (c *PGKeyValueStore) Add(ctx context.Context, key string, value []byte) (bool, error) { - ok, err := c.add(ctx, key, value) - - // If the table doesn't exist, create it and try again. - var pgErr *pgconn.PgError - if errors.As(err, &pgErr) && pgErr.Code == pgerrcode.UndefinedTable { // Relation doesn't exist; create it. - if err := c.createTable(ctx); err != nil { - return false, errors.WithStack(err) - } - ok, err = c.add(ctx, key, value) - } - - return ok, err -} - -// LoadOrStoreBatch returns the existing values for the supplied keys if present. Otherwise, it stores and returns -// the supplied value. -// The postgres table backing the key-value storage is created automatically if it doesn't already exist. -func (c *PGKeyValueStore) LoadOrStoreBatch(ctx context.Context, batch []*KeyValue) (map[string][]byte, error) { - ret, err := c.addBatch(ctx, batch) - - // If the table doesn't exist, create it and try again. - var pgErr *pgconn.PgError - if errors.As(err, &pgErr) && pgErr.Code == pgerrcode.UndefinedTable { // Relation doesn't exist; create it. - if err := c.createTable(ctx); err != nil { - return nil, errors.WithStack(err) - } - ret, err = c.addBatch(ctx, batch) - } - return ret, err -} - -// AddKey is equivalent to Add(ctx, key, nil). -func (c *PGKeyValueStore) AddKey(ctx context.Context, key string) (bool, error) { - return c.Add(ctx, key, nil) -} - -func (c *PGKeyValueStore) createTable(ctx context.Context) error { - var pgErr *pgconn.PgError - _, err := c.db.Exec(ctx, fmt.Sprintf("create table %s (key text primary key, value bytea, inserted timestamp not null);", c.tableName)) - if errors.As(err, &pgErr) && pgErr.Code == pgerrcode.DuplicateTable { // Someone else just created it, which is fine. 
- return nil - } - return err -} - -func (c *PGKeyValueStore) addBatch(ctx context.Context, batch []*KeyValue) (map[string][]byte, error) { - addedByKey := map[string][]byte{} - keysToAdd := map[string][]byte{} - - // first check the cache to see if we have added anything - for _, kv := range batch { - if val, ok := c.cache.Get(kv.Key); ok { - addedByKey[kv.Key] = val.([]byte) - } else { - keysToAdd[kv.Key] = kv.Value - } - } - - if len(addedByKey) == len(batch) { - return addedByKey, nil - } - - valueStrings := make([]string, 0, len(batch)) - valueArgs := make([]interface{}, 0, len(batch)*3) - i := 0 - now := time.Now().UTC() - for k, v := range keysToAdd { - valueStrings = append(valueStrings, fmt.Sprintf("($%d, $%d, $%d)", i*3+1, i*3+2, i*3+3)) - valueArgs = append(valueArgs, k) - valueArgs = append(valueArgs, v) - valueArgs = append(valueArgs, now) - i++ - } - stmt := fmt.Sprintf( - "INSERT INTO %s (key, value, inserted) VALUES %s ON CONFLICT (key) DO UPDATE SET key=EXCLUDED.key RETURNING key, value", - c.tableName, - strings.Join(valueStrings, ",")) - rows, err := c.db.Query(ctx, stmt, valueArgs...) +func (c *PGKeyValueStore) Load(ctx context.Context, keys []string) (map[string][]byte, error) { + rows, err := c.db.Query(ctx, fmt.Sprintf("SELECT KEY, VALUE FROM %s WHERE KEY = any($1)", c.tableName), keys) if err != nil { return nil, errors.WithStack(err) } - + kv := make(map[string][]byte, len(keys)) for rows.Next() { key := "" var value []byte = nil @@ -155,90 +73,37 @@ func (c *PGKeyValueStore) addBatch(ctx context.Context, batch []*KeyValue) (map[ if err != nil { return nil, errors.WithStack(err) } - c.cache.Add(key, value) - addedByKey[key] = value + kv[key] = value } - - return addedByKey, nil + return kv, nil } -func (c *PGKeyValueStore) add(ctx context.Context, key string, value []byte) (bool, error) { - // Overwriting isn't allowed. - if _, ok := c.cache.Get(key); ok { - return false, nil - } - - // Otherwise, get and set the key in a transaction. - var exists *bool - err := c.db.BeginTxFunc(ctx, pgx.TxOptions{}, func(tx pgx.Tx) error { - // Check if the key already exists in postgres. - sql := fmt.Sprintf("select exists(select 1 from %s where key=$1) AS \"exists\"", c.tableName) - if err := tx.QueryRow(ctx, sql, key).Scan(&exists); err != nil { - return err - } - - // Only write the key-value pair if it doesn't already exist (overwriting not allowed). - if !*exists { - sql = fmt.Sprintf("insert into %s (key, value, inserted) values ($1, $2, now());", c.tableName) - _, err := tx.Exec(ctx, sql, key, value) - if err != nil { - return err - } - } - - return nil - }) - // We need to return on error (in particular tx rollback) - // to avoid writing to the cache after failing to write to postgres. - if err != nil { - return false, errors.WithStack(err) - } - - // Only add to cache if we also wrote to postgres. - if *exists { - return false, nil - } else { - c.cache.Add(key, value) - } - - return true, nil -} - -// Get returns the value associated with the provided key, -// or &armadaerrors.ErrNotFound if the key can't be found. -func (c *PGKeyValueStore) Get(ctx context.Context, key string) ([]byte, error) { - // First check the local cache. - if value, ok := c.cache.Get(key); ok { - return value.([]byte), nil - } - - // Otherwise, check postgres. 
- var exists bool - var value *[]byte - sql := fmt.Sprintf("select value from %s where key=$1", c.tableName) - err := c.db.QueryRow(ctx, sql, key).Scan(&value) - if errors.Is(err, pgx.ErrNoRows) { - exists = false - } else if err != nil { - return nil, errors.WithStack(err) - } else { - exists = true - } - - if !exists { - return nil, errors.WithStack(&armadaerrors.ErrNotFound{ - Type: "Postgres key-value pair", - Value: key, +func (c *PGKeyValueStore) Store(ctx context.Context, kvs map[string][]byte) error { + data := make([]KeyValue, 0, len(kvs)) + for k, v := range kvs { + data = append(data, KeyValue{ + Key: k, + Value: v, + Inserted: c.clock.Now(), }) } + return database.UpsertWithTransaction(ctx, c.db, c.tableName, data) +} - return *value, nil +func createTableIfNotExists(ctx context.Context, db *pgxpool.Pool, tableName string) error { + _, err := db.Exec(ctx, fmt.Sprintf(` + CREATE TABLE IF NOT EXISTS %s ( + key TEXT PRIMARY KEY, + value BYTEA, + inserted TIMESTAMP not null + );`, tableName)) + return err } // Cleanup removes all key-value pairs older than lifespan. -func (c *PGKeyValueStore) Cleanup(ctx context.Context, lifespan time.Duration) error { - sql := fmt.Sprintf("delete from %s where (inserted <= (now() - $1::interval));", c.tableName) - _, err := c.db.Exec(ctx, sql, lifespan) +func (c *PGKeyValueStore) cleanup(ctx context.Context, lifespan time.Duration) error { + sql := fmt.Sprintf("DELETE FROM %s WHERE (inserted <= $1);", c.tableName) + _, err := c.db.Exec(ctx, sql, c.clock.Now().Add(-lifespan)) if err != nil { return errors.WithStack(err) } @@ -248,27 +113,21 @@ func (c *PGKeyValueStore) Cleanup(ctx context.Context, lifespan time.Duration) e // PeriodicCleanup starts a goroutine that automatically runs the cleanup job // every interval until the provided context is cancelled. 
func (c *PGKeyValueStore) PeriodicCleanup(ctx context.Context, interval time.Duration, lifespan time.Duration) error { - var log *logrus.Entry - if c.Logger == nil { - log = logrus.StandardLogger().WithField("service", "PGKeyValueStoreCleanup") - } else { - log = c.Logger.WithField("service", "PGKeyValueStoreCleanup") - } - + log := logrus.StandardLogger().WithField("service", "PGKeyValueStoreCleanup") log.Info("service started") - ticker := time.NewTicker(interval) + ticker := c.clock.NewTicker(interval) for { select { case <-ctx.Done(): ticker.Stop() return nil - case <-ticker.C: + case <-ticker.C(): start := time.Now() - err := c.Cleanup(ctx, lifespan) + err := c.cleanup(ctx, lifespan) if err != nil { logging.WithStacktrace(log, err).WithField("delay", time.Since(start)).Warn("cleanup failed") } else { - log.WithField("delay", time.Since(start)).Info("cleanup succeeded") + log.WithField("delay", c.clock.Since(start)).Info("cleanup succeeded") } } } diff --git a/internal/common/pgkeyvalue/pgkeyvalue_test.go b/internal/common/pgkeyvalue/pgkeyvalue_test.go index c6b4cda2889..c8a9beeb175 100644 --- a/internal/common/pgkeyvalue/pgkeyvalue_test.go +++ b/internal/common/pgkeyvalue/pgkeyvalue_test.go @@ -5,217 +5,84 @@ import ( "testing" "time" - "github.com/jackc/pgx/v4/pgxpool" - "github.com/sirupsen/logrus" + "github.com/jackc/pgx/v5/pgxpool" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "golang.org/x/exp/maps" + "k8s.io/apimachinery/pkg/util/clock" - "github.com/armadaproject/armada/internal/common/armadaerrors" "github.com/armadaproject/armada/internal/lookout/testutil" ) -func TestAdd(t *testing.T) { - cacheSize := 100 +func TestLoadStore(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() err := testutil.WithDatabasePgx(func(db *pgxpool.Pool) error { - store, err := New(db, cacheSize, "cachetable") + kvStore, err := New(ctx, db, "cachetable") require.NoError(t, err) - // Adding a key for the first time should insert into both the local cache and postgres. - ok, err := store.Add(context.Background(), "foo", []byte{0, 1, 2}) - require.NoError(t, err) - assert.True(t, ok) - - // The second time we add the key, we should get an error. - ok, err = store.Add(context.Background(), "foo", []byte{0, 1, 2}) - require.NoError(t, err) - assert.False(t, ok) - - // Adding another key should succeed. - ok, err = store.Add(context.Background(), "bar", []byte{0, 1, 2}) - require.NoError(t, err) - assert.True(t, ok) - - // Clear the local cache to verify that it queries postgres. 
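A minimal usage sketch of the reworked PGKeyValueStore API (illustrative only, not part of this changeset): New now takes a context and creates the backing table up front, Store and Load replace the old Add/Get methods, and PeriodicCleanup still drives expiry. The connection string and table name below are placeholders and error handling is abbreviated.

package main

import (
	"context"
	"log"
	"time"

	"github.com/jackc/pgx/v5/pgxpool"

	"github.com/armadaproject/armada/internal/common/pgkeyvalue"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Placeholder connection string.
	db, err := pgxpool.New(ctx, "postgres://postgres@localhost:5432/postgres")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// New creates the backing table immediately, so the lazy table-creation
	// paths removed above are no longer needed.
	kvStore, err := pgkeyvalue.New(ctx, db, "cachetable")
	if err != nil {
		log.Fatal(err)
	}

	// Delete entries older than an hour, checking every minute, until ctx is cancelled.
	go func() {
		_ = kvStore.PeriodicCleanup(ctx, time.Minute, time.Hour)
	}()

	// Store upserts key-value pairs; Load reads them back in a single query.
	if err := kvStore.Store(ctx, map[string][]byte{"foo": {0x1}}); err != nil {
		log.Fatal(err)
	}
	values, err := kvStore.Load(ctx, []string{"foo"})
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("loaded %d values", len(values))
}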
- store.cache.Purge() - ok, err = store.Add(context.Background(), "foo", []byte{0, 1, 2}) - require.NoError(t, err) - assert.False(t, ok) - - // Test AddKey - ok, err = store.AddKey(context.Background(), "baz") - require.NoError(t, err) - assert.True(t, ok) - - ok, err = store.AddKey(context.Background(), "baz") - require.NoError(t, err) - assert.False(t, ok) - return nil - }) - require.NoError(t, err) -} - -func TestLoadOrStoreBatch(t *testing.T) { - cacheSize := 100 - err := testutil.WithDatabasePgx(func(db *pgxpool.Pool) error { - store, err := New(db, cacheSize, "cachetable") - require.NoError(t, err) - - // Add two items - kv1 := []*KeyValue{ - {"foo", []byte{0x1}}, - {"bar", []byte{0x2}}, + data1 := map[string][]byte{ + "a": {0x1}, "b": {0x2}, "c": {0x3}, } - expected1 := map[string][]byte{"foo": {0x1}, "bar": {0x2}} - added, err := store.LoadOrStoreBatch(context.Background(), kv1) + err = kvStore.Store(ctx, data1) require.NoError(t, err) - assert.Equal(t, expected1, added) - // Add items again - added, err = store.LoadOrStoreBatch(context.Background(), kv1) - require.NoError(t, err) - assert.Equal(t, expected1, added) - - // Add three items - kv2 := []*KeyValue{ - {"foo", []byte{0x1}}, - {"bar", []byte{0x2}}, - {"baz", []byte{0x3}}, - } - expected2 := map[string][]byte{"foo": {0x1}, "bar": {0x2}, "baz": {0x3}} - - // Asset that only one is added - added, err = store.LoadOrStoreBatch(context.Background(), kv2) - require.NoError(t, err) - assert.Equal(t, added, expected2) - - // Add a duplicate - kv3 := []*KeyValue{ - {"foo", []byte{0x4}}, - {"bar", []byte{0x5}}, - } - expected3 := map[string][]byte{"foo": {0x1}, "bar": {0x2}} - added, err = store.LoadOrStoreBatch(context.Background(), kv3) + loaded, err := kvStore.Load(ctx, maps.Keys(data1)) require.NoError(t, err) - assert.Equal(t, added, expected3) + assert.Equal(t, data1, loaded) - return nil - }) - assert.NoError(t, err) -} - -func TestAddGet(t *testing.T) { - cacheSize := 100 - err := testutil.WithDatabasePgx(func(db *pgxpool.Pool) error { - store, err := New(db, cacheSize, "cachetable") - require.NoError(t, err) - // Adding a key for the first time should insert into both the local cache and postgres. 
- expected := []byte{0, 1, 2} - ok, err := store.Add(context.Background(), "foo", expected) + data2 := map[string][]byte{"c": {0x4}, "d": {0x5}} + err = kvStore.Store(ctx, data2) require.NoError(t, err) - assert.True(t, ok) - // Get should return the same value - actual, err := store.Get(context.Background(), "foo") + loaded, err = kvStore.Load(ctx, []string{"a", "b", "c", "d"}) require.NoError(t, err) - assert.Equal(t, expected, actual) - - // Getting another value should return *armadaerrors.ErrNotFound - var targetErr *armadaerrors.ErrNotFound - _, err = store.Get(context.Background(), "bar") - assert.ErrorAs(t, err, &targetErr) + assert.Equal(t, map[string][]byte{ + "a": {0x1}, "b": {0x2}, "c": {0x4}, "d": {0x5}, + }, loaded) - // Purging the cache should still return the same value for foo - store.cache.Purge() - actual, err = store.Get(context.Background(), "foo") - require.NoError(t, err) - assert.Equal(t, expected, actual) return nil }) require.NoError(t, err) } func TestCleanup(t *testing.T) { - cacheSize := 100 + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() err := testutil.WithDatabasePgx(func(db *pgxpool.Pool) error { - store, err := New(db, cacheSize, "cachetable") - require.NoError(t, err) - - // Set an empty logger to avoid annoying "cleanup succeeded" messages - store.Logger = &logrus.Logger{} - - // Adding a key for the first time should insert into both the local cache and postgres, - // and return false (since the key didn't already exist). - expected := []byte{0, 1, 2} - ok, err := store.Add(context.Background(), "foo", expected) + baseTime := time.Now() + testClock := clock.NewFakeClock(baseTime) + kvStore, err := New(ctx, db, "cachetable") + kvStore.clock = testClock require.NoError(t, err) - assert.True(t, ok) - // Run the cleanup. - err = store.Cleanup(context.Background(), 0*time.Second) + // Data that will be cleaned up + data1 := map[string][]byte{"a": {0x1}, "b": {0x2}} + err = kvStore.Store(ctx, data1) require.NoError(t, err) - // Purge the cache to ensure the next get will query postgres. - store.cache.Purge() - - // The key should've been cleaned up and get should return an error. - var targetErr *armadaerrors.ErrNotFound - _, err = store.Get(context.Background(), "foo") - assert.ErrorAs(t, err, &targetErr) + // advance the clock + testClock.SetTime(testClock.Now().Add(5 * time.Second)) - // Add another key - ok, err = store.Add(context.Background(), "bar", expected) + // Data that won't be cleaned up + data2 := map[string][]byte{"c": {0x3}} + err = kvStore.Store(ctx, data2) require.NoError(t, err) - assert.True(t, ok) - // The cleanup shouldn't delete this key - err = store.Cleanup(context.Background(), time.Hour) + loaded, err := kvStore.Load(ctx, []string{"a", "b", "c"}) require.NoError(t, err) - store.cache.Purge() - _, err = store.Get(context.Background(), "bar") - require.NoError(t, err) - - // Test the automatic cleanup - ok, err = store.Add(context.Background(), "baz", expected) - require.NoError(t, err) - assert.True(t, ok) - - // Start the cleanup job to run a quick interval. - // Then try adding baz twice more to make sure it gets cleaned up both times. 
- ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go func() { - err := store.PeriodicCleanup(ctx, time.Microsecond, time.Microsecond) - assert.NoError(t, err) - }() + assert.Equal(t, map[string][]byte{ + "a": {0x1}, "b": {0x2}, "c": {0x3}, + }, loaded) - time.Sleep(100 * time.Millisecond) - store.cache.Purge() - - ok, err = store.Add(context.Background(), "baz", expected) + // Run the cleanup. + err = kvStore.cleanup(ctx, 5*time.Second) require.NoError(t, err) - assert.True(t, ok) - - time.Sleep(100 * time.Millisecond) - store.cache.Purge() - ok, err = store.Add(context.Background(), "baz", expected) + loaded, err = kvStore.Load(ctx, []string{"a", "b", "c"}) require.NoError(t, err) - assert.True(t, ok) + assert.Equal(t, map[string][]byte{"c": {0x3}}, loaded) return nil }) require.NoError(t, err) } - -func BenchmarkStore(b *testing.B) { - cacheSize := 100 - err := testutil.WithDatabasePgx(func(db *pgxpool.Pool) error { - store, err := New(db, cacheSize, "cachetable") - require.NoError(b, err) - for i := 0; i < b.N; i++ { - _, err := store.AddKey(context.Background(), "foo") - require.NoError(b, err) - } - return nil - }) - require.NoError(b, err) -} diff --git a/internal/common/pulsarutils/async.go b/internal/common/pulsarutils/async.go index 06c99762ece..8f71781d558 100644 --- a/internal/common/pulsarutils/async.go +++ b/internal/common/pulsarutils/async.go @@ -13,6 +13,7 @@ import ( "github.com/sirupsen/logrus" "github.com/armadaproject/armada/internal/common/logging" + "github.com/armadaproject/armada/internal/common/util" ) // ConsumerMessageId wraps a pulsar message id and an identifier for the consumer which originally received the @@ -108,7 +109,7 @@ func Receive( // Ack will ack all pulsar messages coming in on the msgs channel. The incoming messages contain a consumer id which // corresponds to the index of the consumer that should be used to perform the ack. In theory, the acks could be done // in parallel, however its unlikely that they will be a performance bottleneck -func Ack(ctx context.Context, consumers []pulsar.Consumer, msgs chan []*ConsumerMessageId, wg *sync.WaitGroup) { +func Ack(ctx context.Context, consumers []pulsar.Consumer, msgs chan []*ConsumerMessageId, backoffTime time.Duration, wg *sync.WaitGroup) { for msg := range msgs { for _, id := range msg { if id.ConsumerId < 0 || id.ConsumerId >= len(consumers) { @@ -118,7 +119,17 @@ func Ack(ctx context.Context, consumers []pulsar.Consumer, msgs chan []*Consumer "Asked to ack message belonging to consumer %d, however this is outside the bounds of the consumers array which is of length %d", id.ConsumerId, len(consumers))) } - consumers[id.ConsumerId].AckID(id.MessageId) + util.RetryUntilSuccess( + ctx, + func() error { return consumers[id.ConsumerId].AckID(id.MessageId) }, + func(err error) { + logging. + WithStacktrace(msgLogger, err). + WithField("lastMessageId", id.MessageId). 
+ Warnf("Pulsar ack failed; backing off for %s", backoffTime) + time.Sleep(backoffTime) + }, + ) } } msgLogger.Info("Shutting down Ackker") diff --git a/internal/common/pulsarutils/async_test.go b/internal/common/pulsarutils/async_test.go index 5eb0b98326b..d47151c660d 100644 --- a/internal/common/pulsarutils/async_test.go +++ b/internal/common/pulsarutils/async_test.go @@ -21,8 +21,9 @@ type mockConsumer struct { ackedIds []pulsar.MessageID } -func (c *mockConsumer) AckID(message pulsar.MessageID) { +func (c *mockConsumer) AckID(message pulsar.MessageID) error { c.ackedIds = append(c.ackedIds, message) + return nil } func (c *mockConsumer) Receive(ctx context.Context) (pulsar.Message, error) { @@ -70,7 +71,7 @@ func TestAcks(t *testing.T) { consumers := []pulsar.Consumer{&mockConsumer} wg := sync.WaitGroup{} wg.Add(1) - go Ack(ctx.Background(), consumers, input, &wg) + go Ack(ctx.Background(), consumers, input, 1*time.Second, &wg) input <- []*ConsumerMessageId{ {NewMessageId(1), 0, 0}, {NewMessageId(2), 0, 0}, } diff --git a/internal/common/pulsarutils/pulsar_to_channel.go b/internal/common/pulsarutils/pulsar_to_channel.go deleted file mode 100644 index 91427a4c641..00000000000 --- a/internal/common/pulsarutils/pulsar_to_channel.go +++ /dev/null @@ -1,35 +0,0 @@ -package pulsarutils - -import ( - "context" - - "github.com/apache/pulsar-client-go/pulsar" -) - -// PulsarToChannel is a service for receiving messages from Pulsar and forwarding those on C. -type PulsarToChannel struct { - Consumer pulsar.Consumer - C chan pulsar.Message -} - -func NewPulsarToChannel(consumer pulsar.Consumer) *PulsarToChannel { - return &PulsarToChannel{ - Consumer: consumer, - C: make(chan pulsar.Message), - } -} - -// Run starts the service. -func (srv *PulsarToChannel) Run(ctx context.Context) error { - for { - msg, err := srv.Consumer.Receive(ctx) - if err != nil { - return err - } - select { - case <-ctx.Done(): - return ctx.Err() - case srv.C <- msg: - } - } -} diff --git a/internal/common/slices/slices.go b/internal/common/slices/slices.go index db8301e357e..1d3065dad2b 100644 --- a/internal/common/slices/slices.go +++ b/internal/common/slices/slices.go @@ -157,3 +157,14 @@ func Filter[S ~[]E, E any](s S, predicate func(e E) bool) S { } return out } + +// Repeat returns a slice []T of length n*len(vs) consisting of n copies of vs. 
+func Repeat[T any](n int, vs ...T) []T { + rv := make([]T, n*len(vs)) + for i := 0; i < n; i++ { + for j, v := range vs { + rv[i*len(vs)+j] = v + } + } + return rv +} diff --git a/internal/common/slices/slices_test.go b/internal/common/slices/slices_test.go index e293c6638fd..08e74a297ed 100644 --- a/internal/common/slices/slices_test.go +++ b/internal/common/slices/slices_test.go @@ -264,3 +264,37 @@ func TestPop(t *testing.T) { }, ) } + +func TestRepeat(t *testing.T) { + tests := map[string]struct { + n int + vs []int + expected []int + }{ + "n: 0": { + n: 0, + vs: []int{1, 2, 3}, + expected: []int{}, + }, + "vs emptyy": { + n: 3, + vs: []int{}, + expected: []int{}, + }, + "one entry": { + n: 3, + vs: []int{0}, + expected: []int{0, 0, 0}, + }, + "multiple entries": { + n: 3, + vs: []int{0, 1}, + expected: []int{0, 1, 0, 1, 0, 1}, + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + assert.Equal(t, tc.expected, Repeat(tc.n, tc.vs...)) + }) + } +} diff --git a/internal/common/startup.go b/internal/common/startup.go index 2e80220eda9..276109e8a0f 100644 --- a/internal/common/startup.go +++ b/internal/common/startup.go @@ -5,6 +5,9 @@ import ( "fmt" "net/http" "os" + "path" + "runtime" + "strconv" "strings" "time" @@ -21,6 +24,9 @@ import ( const baseConfigFileName = "config" +// RFC3339Millis +const logTimestampFormat = "2006-01-02T15:04:05.999Z07:00" + func BindCommandlineArguments() { err := viper.BindPFlags(pflag.CommandLine) if err != nil { @@ -77,6 +83,7 @@ func ConfigureCommandLineLogging() { func ConfigureLogging() { log.SetLevel(readEnvironmentLogLevel()) log.SetFormatter(readEnvironmentLogFormat()) + log.SetReportCaller(true) log.SetOutput(os.Stdout) } @@ -96,16 +103,29 @@ func readEnvironmentLogFormat() log.Formatter { if !ok { formatStr = "colourful" } + + textFormatter := &log.TextFormatter{ + ForceColors: true, + FullTimestamp: true, + TimestampFormat: logTimestampFormat, + CallerPrettyfier: func(frame *runtime.Frame) (function string, file string) { + fileName := path.Base(frame.File) + ":" + strconv.Itoa(frame.Line) + return "", fileName + }, + } + switch strings.ToLower(formatStr) { case "json": - return &log.JSONFormatter{} + return &log.JSONFormatter{TimestampFormat: logTimestampFormat} case "colourful": - return &log.TextFormatter{ForceColors: true, FullTimestamp: true} + return textFormatter case "text": - return &log.TextFormatter{DisableColors: true, FullTimestamp: true} + textFormatter.ForceColors = false + textFormatter.DisableColors = true + return textFormatter default: println(os.Stderr, fmt.Sprintf("Unknown log format %s, defaulting to colourful format", formatStr)) - return &log.TextFormatter{ForceColors: true, FullTimestamp: true} + return textFormatter } } diff --git a/internal/common/util/retry.go b/internal/common/util/retry.go new file mode 100644 index 00000000000..9f178c037d8 --- /dev/null +++ b/internal/common/util/retry.go @@ -0,0 +1,19 @@ +package util + +import "golang.org/x/net/context" + +func RetryUntilSuccess(ctx context.Context, performAction func() error, onError func(error)) { + for { + select { + case <-ctx.Done(): + return + default: + err := performAction() + if err == nil { + return + } else { + onError(err) + } + } + } +} diff --git a/internal/common/util/retry_test.go b/internal/common/util/retry_test.go new file mode 100644 index 00000000000..43180ac6f39 --- /dev/null +++ b/internal/common/util/retry_test.go @@ -0,0 +1,85 @@ +package util + +import ( + "context" + "fmt" + "testing" + "time" + + 
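The new util.RetryUntilSuccess helper is generic and can be used outside the Pulsar ack path shown earlier; a small sketch with a fixed backoff applied in the onError callback. flakyOperation and the backoff value are stand-ins, not part of the changeset.

package main

import (
	"context"
	"errors"
	"log"
	"time"

	"github.com/armadaproject/armada/internal/common/util"
)

// flakyOperation stands in for any call that can fail transiently.
func flakyOperation() error {
	return errors.New("transient failure")
}

func main() {
	// Bound the retries with a context so the loop cannot spin forever.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	backoff := time.Second
	util.RetryUntilSuccess(
		ctx,
		flakyOperation,
		func(err error) {
			// Mirrors the Ack usage above: log the failure and back off before retrying.
			log.Printf("operation failed: %v; backing off for %s", err, backoff)
			time.Sleep(backoff)
		},
	)
}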
"github.com/stretchr/testify/assert" +) + +func TestRetryDoesntSpin(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + + RetryUntilSuccess( + ctx, + func() error { + return nil + }, + func(err error) {}, + ) + + select { + case <-ctx.Done(): + t.Fatalf("Function did not complete within time limit.") + default: + break + } +} + +func TestRetryCancel(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + + RetryUntilSuccess( + ctx, + func() error { + return fmt.Errorf("Dummy error.") + }, + func(err error) {}, + ) + + select { + case <-ctx.Done(): + break + default: + t.Fatalf("Function exit was early.") + } +} + +func TestSucceedsAfterFailures(t *testing.T) { + ch := make(chan error, 6) + err := fmt.Errorf("Dummy error.") + + // Load up the channel with my errors + for range [5]int{} { + ch <- err + } + ch <- nil + + errorCount := 0 + + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + + RetryUntilSuccess( + ctx, + func() error { + return <-ch + }, + func(err error) { + errorCount += 1 + }, + ) + + select { + case <-ctx.Done(): + t.Fatalf("Function timed out.") + default: + break + } + + assert.Equal(t, 5, errorCount) +} diff --git a/internal/executor/application.go b/internal/executor/application.go index d7ad549c47c..26a5c8115e9 100644 --- a/internal/executor/application.go +++ b/internal/executor/application.go @@ -9,6 +9,7 @@ import ( "time" grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus" + "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/keepalive" @@ -23,6 +24,7 @@ import ( "github.com/armadaproject/armada/internal/executor/job/processors" "github.com/armadaproject/armada/internal/executor/metrics" "github.com/armadaproject/armada/internal/executor/metrics/pod_metrics" + "github.com/armadaproject/armada/internal/executor/metrics/runstate" "github.com/armadaproject/armada/internal/executor/node" "github.com/armadaproject/armada/internal/executor/podchecks" "github.com/armadaproject/armada/internal/executor/reporter" @@ -204,6 +206,8 @@ func setupExecutorApiComponents( taskManager.Register(clusterAllocationService.AllocateSpareClusterCapacity, config.Task.AllocateSpareClusterCapacityInterval, "submit_runs") taskManager.Register(eventReporter.ReportMissingJobEvents, config.Task.MissingJobEventReconciliationInterval, "event_reconciliation") pod_metrics.ExposeClusterContextMetrics(clusterContext, clusterUtilisationService, podUtilisationService, nodeInfoService) + runStateMetricsCollector := runstate.NewJobRunStateStoreMetricsCollector(jobRunState) + prometheus.MustRegister(runStateMetricsCollector) if config.Metric.ExposeQueueUsageMetrics && config.Task.UtilisationEventReportingInterval > 0 { podUtilisationReporter := utilisation.NewUtilisationEventReporter( diff --git a/internal/executor/healthmonitor/etcd.go b/internal/executor/healthmonitor/etcd.go index 61769e22fd1..855e9baa0c2 100644 --- a/internal/executor/healthmonitor/etcd.go +++ b/internal/executor/healthmonitor/etcd.go @@ -52,7 +52,7 @@ type EtcdHealthMonitor struct { } var etcdInstanceUpDesc = prometheus.NewDesc( - metrics.ArmadaExecutorMetricsPrefix+"_etcd_instance_up", + metrics.ArmadaExecutorMetricsPrefix+"etcd_instance_up", "Shows if an etcd instance is sufficiently live to get metrics from", []string{etcdMemberUrl}, nil, ) diff --git a/internal/executor/job/domain.go 
b/internal/executor/job/domain.go index 8c9a6288f99..03e967f1c9c 100644 --- a/internal/executor/job/domain.go +++ b/internal/executor/job/domain.go @@ -55,6 +55,25 @@ const ( Missing ) +func (r RunPhase) String() string { + switch r { + case Invalid: + return "invalid" + case Leased: + return "leased" + case SuccessfulSubmission: + return "successful-submission" + case FailedSubmission: + return "failed-submission" + case Active: + return "active" + case Missing: + return "missing" + default: + return "unknown" + } +} + type RunState struct { Meta *RunMeta Job *SubmitJob diff --git a/internal/executor/metrics/runstate/job_run_state.go b/internal/executor/metrics/runstate/job_run_state.go new file mode 100644 index 00000000000..f2a4907dd93 --- /dev/null +++ b/internal/executor/metrics/runstate/job_run_state.go @@ -0,0 +1,58 @@ +package runstate + +import ( + "github.com/prometheus/client_golang/prometheus" + + "github.com/armadaproject/armada/internal/executor/job" + "github.com/armadaproject/armada/internal/executor/metrics" +) + +const ( + queueLabel = "queue" + phaseLabel = "phase" +) + +var runPhaseCountDesc = prometheus.NewDesc( + metrics.ArmadaExecutorMetricsPrefix+"run_phase", + "Runs in different phases by queue", + []string{queueLabel, phaseLabel}, nil, +) + +type JobRunStateStoreMetricsCollector struct { + jobRunStateStore job.RunStateStore +} + +func NewJobRunStateStoreMetricsCollector(jobRunStateStore job.RunStateStore) *JobRunStateStoreMetricsCollector { + collector := &JobRunStateStoreMetricsCollector{ + jobRunStateStore: jobRunStateStore, + } + return collector +} + +func (j *JobRunStateStoreMetricsCollector) Describe(desc chan<- *prometheus.Desc) { + desc <- runPhaseCountDesc +} + +type runStateKey struct { + Queue string + Phase job.RunPhase +} + +func (j *JobRunStateStoreMetricsCollector) Collect(metrics chan<- prometheus.Metric) { + runs := j.jobRunStateStore.GetAll() + + phaseCountByQueue := map[runStateKey]int{} + + for _, run := range runs { + key := runStateKey{ + Queue: run.Meta.Queue, + Phase: run.Phase, + } + phaseCountByQueue[key]++ + } + + for metricKey, value := range phaseCountByQueue { + metrics <- prometheus.MustNewConstMetric(runPhaseCountDesc, prometheus.GaugeValue, + float64(value), metricKey.Queue, metricKey.Phase.String()) + } +} diff --git a/internal/executor/metrics/runstate/job_run_state_test.go b/internal/executor/metrics/runstate/job_run_state_test.go new file mode 100644 index 00000000000..8059aa0e67c --- /dev/null +++ b/internal/executor/metrics/runstate/job_run_state_test.go @@ -0,0 +1,81 @@ +package runstate + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" + + "github.com/armadaproject/armada/internal/common/util" + "github.com/armadaproject/armada/internal/executor/job" +) + +func TestJobRunStateStoreMetricsCollector_DefaultsToNoMetrics(t *testing.T) { + collector := setupJobRunStateMetricCollectorTest([]*job.RunState{}) + + actual := getCurrentMetrics(collector) + assert.Len(t, actual, 0) +} + +func TestJobRunStateStoreMetricsCollector_ProducesPhaseCountMetrics(t *testing.T) { + states := []*job.RunState{ + createJobRunState("queue-1", job.Leased), + createJobRunState("queue-1", job.Leased), + createJobRunState("queue-2", job.Leased), + + createJobRunState("queue-1", job.Invalid), + createJobRunState("queue-1", job.SuccessfulSubmission), + createJobRunState("queue-1", job.FailedSubmission), + createJobRunState("queue-1", job.Active), + createJobRunState("queue-1", job.Missing), 
+ } + collector := setupJobRunStateMetricCollectorTest(states) + + expected := []prometheus.Metric{ + prometheus.MustNewConstMetric(runPhaseCountDesc, prometheus.GaugeValue, 2, "queue-1", job.Leased.String()), + prometheus.MustNewConstMetric(runPhaseCountDesc, prometheus.GaugeValue, 1, "queue-2", job.Leased.String()), + + prometheus.MustNewConstMetric(runPhaseCountDesc, prometheus.GaugeValue, 1, "queue-1", job.Invalid.String()), + prometheus.MustNewConstMetric(runPhaseCountDesc, prometheus.GaugeValue, 1, "queue-1", job.SuccessfulSubmission.String()), + prometheus.MustNewConstMetric(runPhaseCountDesc, prometheus.GaugeValue, 1, "queue-1", job.FailedSubmission.String()), + prometheus.MustNewConstMetric(runPhaseCountDesc, prometheus.GaugeValue, 1, "queue-1", job.Active.String()), + prometheus.MustNewConstMetric(runPhaseCountDesc, prometheus.GaugeValue, 1, "queue-1", job.Missing.String()), + } + + actual := getCurrentMetrics(collector) + assert.Len(t, actual, 7) + for i := 0; i < len(expected); i++ { + a1 := actual[i] + // Metrics are not calculated in a deterministic order, so just check all expected metrics are present + assert.Contains(t, expected, a1) + } +} + +func createJobRunState(queue string, phase job.RunPhase) *job.RunState { + return &job.RunState{ + Meta: &job.RunMeta{ + Queue: queue, + RunId: util.NewULID(), + JobId: util.NewULID(), + }, + Phase: phase, + } +} + +func setupJobRunStateMetricCollectorTest(initialJobRuns []*job.RunState) *JobRunStateStoreMetricsCollector { + stateStore := job.NewJobRunStateStoreWithInitialState(initialJobRuns) + collector := NewJobRunStateStoreMetricsCollector(stateStore) + return collector +} + +func getCurrentMetrics(collector *JobRunStateStoreMetricsCollector) []prometheus.Metric { + metricChan := make(chan prometheus.Metric, 1000) + collector.Collect(metricChan) + close(metricChan) + + actual := make([]prometheus.Metric, 0) + for m := range metricChan { + actual = append(actual, m) + } + return actual +} diff --git a/internal/executor/utilisation/prometheus_scraping_test.go b/internal/executor/utilisation/prometheus_scraping_test.go index 3aeaf998a46..f5c38e78ea9 100644 --- a/internal/executor/utilisation/prometheus_scraping_test.go +++ b/internal/executor/utilisation/prometheus_scraping_test.go @@ -2,7 +2,7 @@ package utilisation import ( "errors" - "io/ioutil" + "io" "net/http" "strings" "testing" @@ -157,7 +157,7 @@ func makeTestResponse() *http.Response { func makeResponse(header http.Header, body string) *http.Response { response := http.Response{ Header: header, - Body: ioutil.NopCloser(strings.NewReader((body))), + Body: io.NopCloser(strings.NewReader((body))), } return &response } diff --git a/internal/jobservice/application.go b/internal/jobservice/application.go index 11efc3db63a..596a64ae9bd 100644 --- a/internal/jobservice/application.go +++ b/internal/jobservice/application.go @@ -38,13 +38,31 @@ var DefaultConfiguration = &configuration.JobServiceConfiguration{ InitialConnections: 5, Capacity: 5, }, - SubscriberPoolSize: 30, + SubscriberPoolSize: 30, + SubscriptionExpirySecs: 300, + PurgeJobSetTime: 600, } // Mutates config where possible to correct mis-configurations. 
func RectifyConfig(config *configuration.JobServiceConfiguration) { logger := log.WithField("JobService", "RectifyConfig") + if config.SubscriptionExpirySecs == 0 { + logger.WithFields(log.Fields{ + "default": DefaultConfiguration.SubscriptionExpirySecs, + "configured": config.SubscriptionExpirySecs, + }).Warn("config.SubscriptionExpirySecs invalid, using default instead") + config.SubscriptionExpirySecs = DefaultConfiguration.SubscriptionExpirySecs + } + + if config.PurgeJobSetTime == 0 { + logger.WithFields(log.Fields{ + "default": DefaultConfiguration.PurgeJobSetTime, + "configured": config.PurgeJobSetTime, + }).Warn("config.PurgeJobSetTime invalid, using default instead") + config.PurgeJobSetTime = DefaultConfiguration.PurgeJobSetTime + } + // Grpc Pool if config.GrpcPool.InitialConnections <= 0 { logger.WithFields(log.Fields{ @@ -105,7 +123,8 @@ func (a *App) StartUp(ctx context.Context, config *configuration.JobServiceConfi jobService := server.NewJobService(config, sqlJobRepo) js.RegisterJobServiceServer(grpcServer, jobService) - lis, err := net.Listen("tcp", fmt.Sprintf(":%d", config.GrpcPort)) + lc := net.ListenConfig{} + lis, err := lc.Listen(ctx, "tcp", fmt.Sprintf(":%d", config.GrpcPort)) if err != nil { return err } @@ -139,10 +158,19 @@ func (a *App) StartUp(ctx context.Context, config *configuration.JobServiceConfi g.Go(func() error { defer log.Infof("stopping server.") + go func() { + select { + case <-ctx.Done(): + log.Info("Got context done for grpc server.") + grpcServer.Stop() + } + }() + log.Info("jobservice service listening on ", config.GrpcPort) if err := grpcServer.Serve(lis); err != nil { log.Fatalf("failed to serve: %v", err) } + return nil }) g.Go(func() error { diff --git a/internal/jobservice/application_test.go b/internal/jobservice/application_test.go index 54ac736955e..a5820f5947e 100644 --- a/internal/jobservice/application_test.go +++ b/internal/jobservice/application_test.go @@ -9,13 +9,7 @@ import ( "github.com/armadaproject/armada/internal/jobservice/configuration" ) -var knownGoodConfig = &configuration.JobServiceConfiguration{ - GrpcPool: grpcconfig.GrpcPoolConfig{ - InitialConnections: 10, - Capacity: 10, - }, - SubscriberPoolSize: 30, -} +var knownGoodConfig = DefaultConfiguration func TestRectifyConfig(t *testing.T) { testCases := []struct { @@ -31,7 +25,6 @@ func TestRectifyConfig(t *testing.T) { { name: "Zero-length SubscriberPoolSize", config: &configuration.JobServiceConfiguration{ - GrpcPool: grpcconfig.GrpcPoolConfig{InitialConnections: 10, Capacity: 10}, SubscriberPoolSize: 0, }, expectedConfig: knownGoodConfig, @@ -39,30 +32,16 @@ func TestRectifyConfig(t *testing.T) { { name: "Incorrect GrpcPool.InitialConnections", config: &configuration.JobServiceConfiguration{ - GrpcPool: grpcconfig.GrpcPoolConfig{InitialConnections: 0, Capacity: 10}, - SubscriberPoolSize: 30, - }, - expectedConfig: &configuration.JobServiceConfiguration{ - GrpcPool: grpcconfig.GrpcPoolConfig{ - InitialConnections: DefaultConfiguration.GrpcPool.InitialConnections, - Capacity: 10, - }, - SubscriberPoolSize: 30, + GrpcPool: grpcconfig.GrpcPoolConfig{InitialConnections: 0, Capacity: knownGoodConfig.GrpcPool.Capacity}, }, + expectedConfig: knownGoodConfig, }, { name: "Incorrect GrpcPool.Capacity", config: &configuration.JobServiceConfiguration{ - GrpcPool: grpcconfig.GrpcPoolConfig{InitialConnections: 10, Capacity: 0}, - SubscriberPoolSize: 30, - }, - expectedConfig: &configuration.JobServiceConfiguration{ - GrpcPool: grpcconfig.GrpcPoolConfig{ - InitialConnections: 10, - 
Capacity: DefaultConfiguration.GrpcPool.Capacity, - }, - SubscriberPoolSize: 30, + GrpcPool: grpcconfig.GrpcPoolConfig{InitialConnections: knownGoodConfig.GrpcPool.InitialConnections, Capacity: 0}, }, + expectedConfig: knownGoodConfig, }, } diff --git a/internal/jobservice/eventstojobs/manage_subs.go b/internal/jobservice/eventstojobs/manage_subs.go index b8b4848b571..fa509e1adc3 100644 --- a/internal/jobservice/eventstojobs/manage_subs.go +++ b/internal/jobservice/eventstojobs/manage_subs.go @@ -2,7 +2,6 @@ package eventstojobs import ( "context" - "errors" "fmt" "io" "strings" @@ -60,7 +59,7 @@ func NewJobSetSubscriptionExecutor(ctx context.Context, sqlJobService: sqlJobService, subscriptions: make(map[repository.JobSetKey]*JobSetSubscription), newSubChan: newSubChan, - subDoneChan: make(chan *repository.JobSetKey), + subDoneChan: make(chan *repository.JobSetKey, 1000), subTimeout: subTimeout, } } @@ -191,7 +190,13 @@ func (jse *JobSetSubscriptionExecutor) NumActiveSubscriptions() int { return len(jse.subscriptions) } -func NewJobSetSubscription(ctx context.Context, eventReader events.JobEventReader, subInfo *repository.SubscribedTuple, subTimeout time.Duration, subDoneChan chan<- *repository.JobSetKey, sqlJobService repository.SQLJobService) *JobSetSubscription { +func NewJobSetSubscription(ctx context.Context, + eventReader events.JobEventReader, + subInfo *repository.SubscribedTuple, + subTimeout time.Duration, + subDoneChan chan<- *repository.JobSetKey, + sqlJobService repository.SQLJobService, +) *JobSetSubscription { newCtx, cancel := context.WithCancel(ctx) return &JobSetSubscription{ ctx: newCtx, @@ -217,6 +222,7 @@ func (js *JobSetSubscription) Subscribe() error { Queue: js.Queue, JobSetId: js.JobSetId, } + log.WithFields(requestFields).Debugf("Sent message to subDoneChan") }() log.WithFields(requestFields).Debugf("Calling GetJobEventMessage") @@ -229,7 +235,6 @@ func (js *JobSetSubscription) Subscribe() error { }) if err != nil { log.WithFields(requestFields).WithError(err).Error("error from GetJobEventMessage") - js.cancel() return err } @@ -266,6 +271,11 @@ func (js *JobSetSubscription) Subscribe() error { g.Go(func() error { nextRecv := time.After(1 * time.Nanosecond) + defer func() { + js.cancel() + log.WithFields(requestFields).Debugf("Called cancel") + }() + // this loop will run until the context is canceled for { select { @@ -275,7 +285,7 @@ func (js *JobSetSubscription) Subscribe() error { case <-nextRecv: msg, err := stream.Recv() if err != nil { - if errors.Is(err, io.EOF) { + if strings.Contains(err.Error(), io.EOF.Error()) { log.WithFields(requestFields).Info("Reached stream end for JobSetSubscription") return nil } else if strings.Contains(err.Error(), "context canceled") { @@ -304,7 +314,7 @@ func (js *JobSetSubscription) Subscribe() error { log.WithFields(requestFields).WithFields(log.Fields{ "job_id": currentJobId, "job_status": jobStatus.GetState().String(), - }).Info("Got event") + }).Debug("Got event") jobStatus := repository.NewJobStatus(js.Queue, js.JobSetId, currentJobId, *jobStatus) err := js.sqlJobService.UpdateJobServiceDb(js.ctx, jobStatus) if err != nil { diff --git a/internal/jobservice/eventstojobs/manage_subs_test.go b/internal/jobservice/eventstojobs/manage_subs_test.go index 241da7fb2b0..7864ca2b4a1 100644 --- a/internal/jobservice/eventstojobs/manage_subs_test.go +++ b/internal/jobservice/eventstojobs/manage_subs_test.go @@ -36,6 +36,9 @@ func (m *MockEventClient) Recv() (*api.EventStreamMessage, error) { m.messagesSent += 1 if m.messagesSent > 
3 { + // Sleep a bit to mimick the stream being open but no events + // currently available. + time.Sleep(time.Second * 5) return nil, io.EOF } @@ -227,7 +230,7 @@ func TestJobSetSubscriptionExecutor(t *testing.T) { sawSubs := false numberSeen := 0 func() { - watchDog := time.After(time.Second * 5) + watchDog := time.After(time.Second * 10) ticker := time.NewTicker(time.Millisecond * 200) for { select { diff --git a/internal/jobservice/repository/postgres.go b/internal/jobservice/repository/postgres.go index 04611157cb0..0b8a81b5cb6 100644 --- a/internal/jobservice/repository/postgres.go +++ b/internal/jobservice/repository/postgres.go @@ -315,18 +315,23 @@ func (s *JSRepoPostgres) PurgeExpiredJobSets(ctx context.Context) { log := log.WithField("JobService", "ExpiredJobSetsPurge") log.Info("Starting purge of expired jobsets") - for range ticker.C { - result, err := s.dbpool.Exec(ctx, jobSetStmt) - if err != nil { - log.Error("error deleting expired jobsets: ", err) - } else { - log.Debugf("Deleted %d expired jobsets", result.RowsAffected()) - } - result, err = s.dbpool.Exec(ctx, jobStmt) - if err != nil { - log.Error("error deleting expired jobs: ", err) - } else { - log.Debugf("Deleted %d expired jobs", result.RowsAffected()) + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + result, err := s.dbpool.Exec(ctx, jobSetStmt) + if err != nil { + log.Error("error deleting expired jobsets: ", err) + } else { + log.Debugf("Deleted %d expired jobsets", result.RowsAffected()) + } + result, err = s.dbpool.Exec(ctx, jobStmt) + if err != nil { + log.Error("error deleting expired jobs: ", err) + } else { + log.Debugf("Deleted %d expired jobs", result.RowsAffected()) + } } } } diff --git a/internal/lookout/db-gen/main.go b/internal/lookout/db-gen/main.go index 3081640605a..b38a986f38e 100644 --- a/internal/lookout/db-gen/main.go +++ b/internal/lookout/db-gen/main.go @@ -11,7 +11,7 @@ import ( "time" "github.com/google/uuid" - _ "github.com/jackc/pgx/v4/stdlib" + _ "github.com/jackc/pgx/v5/stdlib" ) type State int diff --git a/internal/lookout/postgres/postgres.go b/internal/lookout/postgres/postgres.go index c2fb9a9c3f1..c83a4b53848 100644 --- a/internal/lookout/postgres/postgres.go +++ b/internal/lookout/postgres/postgres.go @@ -3,7 +3,7 @@ package postgres import ( "database/sql" - _ "github.com/jackc/pgx/v4/stdlib" + _ "github.com/jackc/pgx/v5/stdlib" "github.com/armadaproject/armada/internal/armada/configuration" "github.com/armadaproject/armada/internal/common/database" diff --git a/internal/lookout/testutil/db_testutil.go b/internal/lookout/testutil/db_testutil.go index c9ed69523dd..5ce57e8effa 100644 --- a/internal/lookout/testutil/db_testutil.go +++ b/internal/lookout/testutil/db_testutil.go @@ -5,8 +5,8 @@ import ( "database/sql" "fmt" - "github.com/jackc/pgx/v4/pgxpool" - _ "github.com/jackc/pgx/v4/stdlib" + "github.com/jackc/pgx/v5/pgxpool" + _ "github.com/jackc/pgx/v5/stdlib" "github.com/pkg/errors" "github.com/armadaproject/armada/internal/common/util" @@ -79,7 +79,7 @@ func WithDatabasePgx(action func(db *pgxpool.Pool) error) error { } // Connect again- this time to the database we just created and using pgx pool. 
This will be used for tests - testDbPool, err := pgxpool.Connect(ctx, connectionString+" dbname="+dbName) + testDbPool, err := pgxpool.New(ctx, connectionString+" dbname="+dbName) if err != nil { return errors.WithStack(err) } diff --git a/internal/lookout/ui/package.json b/internal/lookout/ui/package.json index e6d151087af..a27e1b4bd30 100644 --- a/internal/lookout/ui/package.json +++ b/internal/lookout/ui/package.json @@ -13,6 +13,7 @@ "test": "react-scripts test --coverage", "eject": "react-scripts eject", "openapi": "docker run --rm -u $(id -u ${USER}):$(id -g ${USER}) -v \"${PWD}/../../../:/project\" openapitools/openapi-generator-cli:v5.4.0 /project/internal/lookout/ui/openapi.sh", + "openapi:win": "powershell -Command \"$uid = (New-Object System.Security.Principal.WindowsPrincipal([System.Security.Principal.WindowsIdentity]::GetCurrent())).Identity.User.Value; $gid = (Get-WmiObject Win32_UserAccount | Where-Object { $_.SID -eq $uid }).SID.Value; docker run --rm -e USERID=$uid -e GROUPID=$gid -v \"%cd%/../../../:/project\" openapitools/openapi-generator-cli:v5.4.0 /project/internal/lookout/ui/openapi.sh\"", "lint": "eslint './src/**/*.{js,ts,tsx}' --max-warnings 0 --no-error-on-unmatched-pattern", "fmt": "eslint './src/**/*.{js,ts,tsx}' --max-warnings 0 --no-error-on-unmatched-pattern --fix" }, diff --git a/internal/lookoutingester/instructions/instructions.go b/internal/lookoutingester/instructions/instructions.go index 9771534f8bf..f49ac049975 100644 --- a/internal/lookoutingester/instructions/instructions.go +++ b/internal/lookoutingester/instructions/instructions.go @@ -84,6 +84,7 @@ func (c *InstructionConverter) convertSequence(es *armadaevents.EventSequence, u err = c.handleJobRunPreempted(*event.Created, event.GetJobRunPreempted(), update) case *armadaevents.EventSequence_Event_CancelJob, *armadaevents.EventSequence_Event_JobRunLeased, + *armadaevents.EventSequence_Event_JobRequeued, *armadaevents.EventSequence_Event_ReprioritiseJobSet, *armadaevents.EventSequence_Event_CancelJobSet, *armadaevents.EventSequence_Event_ResourceUtilisation, diff --git a/internal/lookoutingester/lookoutdb/insertion.go b/internal/lookoutingester/lookoutdb/insertion.go index 3a0934960af..a22b2eab29b 100644 --- a/internal/lookoutingester/lookoutdb/insertion.go +++ b/internal/lookoutingester/lookoutdb/insertion.go @@ -6,8 +6,8 @@ import ( "sync" "time" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" log "github.com/sirupsen/logrus" @@ -662,7 +662,7 @@ func (l *LookoutDb) CreateJobRunContainersScalar(ctx context.Context, instructio func batchInsert(ctx context.Context, db *pgxpool.Pool, createTmp func(pgx.Tx) error, insertTmp func(pgx.Tx) error, copyToDest func(pgx.Tx) error, ) error { - return db.BeginTxFunc(ctx, pgx.TxOptions{ + return pgx.BeginTxFunc(ctx, db, pgx.TxOptions{ IsoLevel: pgx.ReadCommitted, AccessMode: pgx.ReadWrite, DeferrableMode: pgx.Deferrable, diff --git a/internal/lookoutingester/lookoutdb/insertion_test.go b/internal/lookoutingester/lookoutdb/insertion_test.go index 5f4a5b69e0d..25a3ff1af03 100644 --- a/internal/lookoutingester/lookoutdb/insertion_test.go +++ b/internal/lookoutingester/lookoutdb/insertion_test.go @@ -10,7 +10,7 @@ import ( "github.com/armadaproject/armada/internal/common/database/lookout" "github.com/apache/pulsar-client-go/pulsar" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/stretchr/testify/assert" 
"github.com/stretchr/testify/require" diff --git a/internal/lookoutingesterv2/benchmark/benchmark.go b/internal/lookoutingesterv2/benchmark/benchmark.go index e69846345dd..6c808ca14f3 100644 --- a/internal/lookoutingesterv2/benchmark/benchmark.go +++ b/internal/lookoutingesterv2/benchmark/benchmark.go @@ -9,7 +9,7 @@ import ( "time" "github.com/google/uuid" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "k8s.io/utils/pointer" "github.com/armadaproject/armada/internal/common/database" diff --git a/internal/lookoutingesterv2/lookoutdb/insertion.go b/internal/lookoutingesterv2/lookoutdb/insertion.go index c084cbc2c0f..c5378543df0 100644 --- a/internal/lookoutingesterv2/lookoutdb/insertion.go +++ b/internal/lookoutingesterv2/lookoutdb/insertion.go @@ -6,8 +6,8 @@ import ( "sync" "time" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" log "github.com/sirupsen/logrus" @@ -699,7 +699,7 @@ func (l *LookoutDb) CreateUserAnnotationsScalar(ctx context.Context, instruction func batchInsert(ctx context.Context, db *pgxpool.Pool, createTmp func(pgx.Tx) error, insertTmp func(pgx.Tx) error, copyToDest func(pgx.Tx) error, ) error { - return db.BeginTxFunc(ctx, pgx.TxOptions{ + return pgx.BeginTxFunc(ctx, db, pgx.TxOptions{ IsoLevel: pgx.ReadCommitted, AccessMode: pgx.ReadWrite, DeferrableMode: pgx.Deferrable, diff --git a/internal/lookoutingesterv2/lookoutdb/insertion_test.go b/internal/lookoutingesterv2/lookoutdb/insertion_test.go index b6095ff18b1..9de584df3fa 100644 --- a/internal/lookoutingesterv2/lookoutdb/insertion_test.go +++ b/internal/lookoutingesterv2/lookoutdb/insertion_test.go @@ -8,7 +8,7 @@ import ( "time" "github.com/apache/pulsar-client-go/pulsar" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/stretchr/testify/assert" "k8s.io/utils/pointer" diff --git a/internal/lookoutv2/gen/restapi/server.go b/internal/lookoutv2/gen/restapi/server.go index 4af6552d17e..442c1f37b4b 100644 --- a/internal/lookoutv2/gen/restapi/server.go +++ b/internal/lookoutv2/gen/restapi/server.go @@ -8,7 +8,6 @@ import ( "crypto/x509" "errors" "fmt" - "io/ioutil" "log" "net" "net/http" @@ -274,7 +273,7 @@ func (s *Server) Serve() (err error) { if s.TLSCACertificate != "" { // include specified CA certificate - caCert, caCertErr := ioutil.ReadFile(string(s.TLSCACertificate)) + caCert, caCertErr := os.ReadFile(string(s.TLSCACertificate)) if caCertErr != nil { return caCertErr } diff --git a/internal/lookoutv2/pruner/pruner.go b/internal/lookoutv2/pruner/pruner.go index 8267a820f41..18ee81c8da1 100644 --- a/internal/lookoutv2/pruner/pruner.go +++ b/internal/lookoutv2/pruner/pruner.go @@ -4,7 +4,7 @@ import ( "context" "time" - "github.com/jackc/pgx/v4" + "github.com/jackc/pgx/v5" "github.com/pkg/errors" log "github.com/sirupsen/logrus" "k8s.io/apimachinery/pkg/util/clock" @@ -32,7 +32,7 @@ func PruneDb(ctx context.Context, db *pgx.Conn, keepAfterCompletion time.Duratio for keepGoing { batchStart := clock.Now() batchSize := 0 - err = db.BeginTxFunc(ctx, pgx.TxOptions{ + err = pgx.BeginTxFunc(ctx, db, pgx.TxOptions{ IsoLevel: pgx.ReadCommitted, AccessMode: pgx.ReadWrite, DeferrableMode: pgx.Deferrable, diff --git a/internal/lookoutv2/pruner/pruner_test.go b/internal/lookoutv2/pruner/pruner_test.go index baaebddae0d..a88274c316a 100644 --- a/internal/lookoutv2/pruner/pruner_test.go +++ b/internal/lookoutv2/pruner/pruner_test.go @@ -6,7 +6,7 @@ import ( "time" 
"github.com/google/uuid" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/util/clock" diff --git a/internal/lookoutv2/repository/getjobrunerror.go b/internal/lookoutv2/repository/getjobrunerror.go index 65646b0a5a2..467da22ec1a 100644 --- a/internal/lookoutv2/repository/getjobrunerror.go +++ b/internal/lookoutv2/repository/getjobrunerror.go @@ -3,8 +3,8 @@ package repository import ( "context" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" log "github.com/sirupsen/logrus" @@ -29,29 +29,13 @@ func NewSqlGetJobRunErrorRepository(db *pgxpool.Pool, decompressor compress.Deco func (r *SqlGetJobRunErrorRepository) GetJobRunError(ctx context.Context, runId string) (string, error) { var rawBytes []byte - err := r.db.BeginTxFunc(ctx, pgx.TxOptions{ - IsoLevel: pgx.RepeatableRead, - AccessMode: pgx.ReadOnly, - DeferrableMode: pgx.Deferrable, - }, func(tx pgx.Tx) error { - rows, err := tx.Query(ctx, "SELECT error FROM job_run WHERE run_id = $1 AND error IS NOT NULL", runId) - if err != nil { - return err - } - defer rows.Close() - for rows.Next() { - err := rows.Scan(&rawBytes) - if err != nil { - return err - } - return nil - } - return errors.Errorf("no error found for run with id %s", runId) - }) + err := r.db.QueryRow(ctx, "SELECT error FROM job_run WHERE run_id = $1 AND error IS NOT NULL", runId).Scan(&rawBytes) if err != nil { + if err == pgx.ErrNoRows { + return "", errors.Errorf("no error found for run with id %s", runId) + } return "", err } - decompressed, err := r.decompressor.Decompress(rawBytes) if err != nil { log.WithError(err).Error("failed to decompress") diff --git a/internal/lookoutv2/repository/getjobrunerror_test.go b/internal/lookoutv2/repository/getjobrunerror_test.go index 91a4cb6a623..274de5e6d40 100644 --- a/internal/lookoutv2/repository/getjobrunerror_test.go +++ b/internal/lookoutv2/repository/getjobrunerror_test.go @@ -4,7 +4,7 @@ import ( "context" "testing" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/stretchr/testify/assert" "github.com/armadaproject/armada/internal/common/compress" diff --git a/internal/lookoutv2/repository/getjobs.go b/internal/lookoutv2/repository/getjobs.go index 567f9e58e6b..eac6cc0aaf5 100644 --- a/internal/lookoutv2/repository/getjobs.go +++ b/internal/lookoutv2/repository/getjobs.go @@ -7,8 +7,8 @@ import ( "sort" "time" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" log "github.com/sirupsen/logrus" @@ -83,7 +83,7 @@ func (r *SqlGetJobsRepository) GetJobs(ctx context.Context, filters []*model.Fil var annotationRows []*annotationRow var count int - err := r.db.BeginTxFunc(ctx, pgx.TxOptions{ + err := pgx.BeginTxFunc(ctx, r.db, pgx.TxOptions{ IsoLevel: pgx.RepeatableRead, AccessMode: pgx.ReadWrite, DeferrableMode: pgx.Deferrable, @@ -134,7 +134,6 @@ func (r *SqlGetJobsRepository) GetJobs(ctx context.Context, filters []*model.Fil log.WithError(err).Error("failed getting annotation rows") return err } - return nil }) if err != nil { diff --git a/internal/lookoutv2/repository/getjobs_test.go b/internal/lookoutv2/repository/getjobs_test.go index 8bd8530d584..3c28805c198 100644 --- a/internal/lookoutv2/repository/getjobs_test.go +++ b/internal/lookoutv2/repository/getjobs_test.go @@ -7,7 +7,7 @@ import ( "time" 
"github.com/google/uuid" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/api/resource" diff --git a/internal/lookoutv2/repository/getjobspec.go b/internal/lookoutv2/repository/getjobspec.go index 47d52e98527..60c6ac41cd1 100644 --- a/internal/lookoutv2/repository/getjobspec.go +++ b/internal/lookoutv2/repository/getjobspec.go @@ -4,8 +4,8 @@ import ( "context" "github.com/gogo/protobuf/proto" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" log "github.com/sirupsen/logrus" @@ -31,29 +31,13 @@ func NewSqlGetJobSpecRepository(db *pgxpool.Pool, decompressor compress.Decompre func (r *SqlGetJobSpecRepository) GetJobSpec(ctx context.Context, jobId string) (*api.Job, error) { var rawBytes []byte - err := r.db.BeginTxFunc(ctx, pgx.TxOptions{ - IsoLevel: pgx.RepeatableRead, - AccessMode: pgx.ReadOnly, - DeferrableMode: pgx.Deferrable, - }, func(tx pgx.Tx) error { - rows, err := tx.Query(ctx, "SELECT job_spec FROM job WHERE job_id = $1", jobId) - if err != nil { - return err - } - defer rows.Close() - for rows.Next() { - err := rows.Scan(&rawBytes) - if err != nil { - return err - } - return nil - } - return errors.Errorf("job with id %s not found", jobId) - }) + err := r.db.QueryRow(ctx, "SELECT job_spec FROM job WHERE job_id = $1", jobId).Scan(&rawBytes) if err != nil { + if err == pgx.ErrNoRows { + return nil, errors.Errorf("job with id %s not found", jobId) + } return nil, err } - decompressed, err := r.decompressor.Decompress(rawBytes) if err != nil { log.WithError(err).Error("failed to decompress") diff --git a/internal/lookoutv2/repository/getjobspec_test.go b/internal/lookoutv2/repository/getjobspec_test.go index 47dd22bab37..d7e00d83671 100644 --- a/internal/lookoutv2/repository/getjobspec_test.go +++ b/internal/lookoutv2/repository/getjobspec_test.go @@ -4,7 +4,7 @@ import ( "context" "testing" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/stretchr/testify/assert" "github.com/armadaproject/armada/internal/common/compress" diff --git a/internal/lookoutv2/repository/groupjobs.go b/internal/lookoutv2/repository/groupjobs.go index 69bec856f94..dd80976dcd6 100644 --- a/internal/lookoutv2/repository/groupjobs.go +++ b/internal/lookoutv2/repository/groupjobs.go @@ -5,8 +5,8 @@ import ( "fmt" "strings" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" "github.com/armadaproject/armada/internal/common/database" @@ -59,7 +59,7 @@ func (r *SqlGroupJobsRepository) GroupBy( var groups []*model.JobGroup var count int - err := r.db.BeginTxFunc(ctx, pgx.TxOptions{ + err := pgx.BeginTxFunc(ctx, r.db, pgx.TxOptions{ IsoLevel: pgx.RepeatableRead, AccessMode: pgx.ReadOnly, DeferrableMode: pgx.Deferrable, diff --git a/internal/lookoutv2/repository/groupjobs_test.go b/internal/lookoutv2/repository/groupjobs_test.go index ea4d85b37b8..1f255029f8c 100644 --- a/internal/lookoutv2/repository/groupjobs_test.go +++ b/internal/lookoutv2/repository/groupjobs_test.go @@ -7,7 +7,7 @@ import ( "time" "github.com/google/uuid" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/stretchr/testify/assert" "github.com/armadaproject/armada/internal/common/compress" diff --git a/internal/scheduler/common.go b/internal/scheduler/common.go index 5c27148b871..4daceef1b77 100644 
--- a/internal/scheduler/common.go +++ b/internal/scheduler/common.go @@ -11,6 +11,7 @@ import ( armadamaps "github.com/armadaproject/armada/internal/common/maps" armadaslices "github.com/armadaproject/armada/internal/common/slices" schedulerconfig "github.com/armadaproject/armada/internal/scheduler/configuration" + schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" "github.com/armadaproject/armada/internal/scheduler/interfaces" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" ) @@ -24,6 +25,9 @@ type SchedulerResult struct { // For each preempted job, maps the job id to the id of the node on which the job was running. // For each scheduled job, maps the job id to the id of the node on which the job should be scheduled. NodeIdByJobId map[string]string + // The Scheduling Context. Being passed up for metrics decisions made in scheduler.go and scheduler_metrics.go. + // Passing a pointer as the structure is enormous + SchedulingContexts []*schedulercontext.SchedulingContext } func NewSchedulerResult[S ~[]T, T interfaces.LegacySchedulerJob]( diff --git a/internal/scheduler/configuration/configuration.go b/internal/scheduler/configuration/configuration.go index 2b8fc3f40cc..873b861ada2 100644 --- a/internal/scheduler/configuration/configuration.go +++ b/internal/scheduler/configuration/configuration.go @@ -51,6 +51,8 @@ type Configuration struct { DatabaseFetchSize int `validate:"required"` // Timeout to use when sending messages to pulsar PulsarSendTimeout time.Duration `validate:"required"` + // Maximum jobs to return from a single lease call + MaxJobsLeasedPerCall uint `validate:"required"` } type LeaderConfig struct { diff --git a/internal/scheduler/context/context.go b/internal/scheduler/context/context.go index 7f5899fd7c1..30f2c87e6ec 100644 --- a/internal/scheduler/context/context.go +++ b/internal/scheduler/context/context.go @@ -10,7 +10,6 @@ import ( "github.com/pkg/errors" "golang.org/x/exp/maps" "golang.org/x/exp/slices" - "k8s.io/apimachinery/pkg/api/resource" "github.com/armadaproject/armada/internal/armada/configuration" "github.com/armadaproject/armada/internal/common/armadaerrors" @@ -18,6 +17,7 @@ import ( armadaslices "github.com/armadaproject/armada/internal/common/slices" "github.com/armadaproject/armada/internal/common/types" schedulerconfig "github.com/armadaproject/armada/internal/scheduler/configuration" + "github.com/armadaproject/armada/internal/scheduler/fairness" "github.com/armadaproject/armada/internal/scheduler/interfaces" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" ) @@ -37,19 +37,13 @@ type SchedulingContext struct { // Default priority class. DefaultPriorityClass string // Determines how fairness is computed. - FairnessModel configuration.FairnessModel - // Resources considered when computing DominantResourceFairness. - DominantResourceFairnessResourcesToConsider []string - // Weights used when computing AssetFairness. - ResourceScarcity map[string]float64 + FairnessCostProvider fairness.FairnessCostProvider // Sum of queue weights across all queues. WeightSum float64 // Per-queue scheduling contexts. QueueSchedulingContexts map[string]*QueueSchedulingContext // Total resources across all clusters available at the start of the scheduling cycle. TotalResources schedulerobjects.ResourceList - // = TotalResources.AsWeightedMillis(ResourceScarcity). - TotalResourcesAsWeightedMillis int64 // Resources assigned across all queues during this scheduling cycle. 
ScheduledResources schedulerobjects.ResourceList ScheduledResourcesByPriorityClass schedulerobjects.QuantityByTAndResourceType[string] @@ -78,7 +72,7 @@ func NewSchedulingContext( pool string, priorityClasses map[string]types.PriorityClass, defaultPriorityClass string, - resourceScarcity map[string]float64, + fairnessCostProvider fairness.FairnessCostProvider, totalResources schedulerobjects.ResourceList, ) *SchedulingContext { return &SchedulingContext{ @@ -87,11 +81,9 @@ func NewSchedulingContext( Pool: pool, PriorityClasses: priorityClasses, DefaultPriorityClass: defaultPriorityClass, - FairnessModel: configuration.AssetFairness, - ResourceScarcity: resourceScarcity, + FairnessCostProvider: fairnessCostProvider, QueueSchedulingContexts: make(map[string]*QueueSchedulingContext), TotalResources: totalResources.DeepCopy(), - TotalResourcesAsWeightedMillis: totalResources.AsWeightedMillis(resourceScarcity), ScheduledResources: schedulerobjects.NewResourceListWithDefaultSize(), ScheduledResourcesByPriorityClass: make(schedulerobjects.QuantityByTAndResourceType[string]), EvictedResourcesByPriorityClass: make(schedulerobjects.QuantityByTAndResourceType[string]), @@ -100,11 +92,6 @@ func NewSchedulingContext( } } -func (sctx *SchedulingContext) EnableDominantResourceFairness(dominantResourceFairnessResourcesToConsider []string) { - sctx.FairnessModel = configuration.DominantResourceFairness - sctx.DominantResourceFairnessResourcesToConsider = dominantResourceFairnessResourcesToConsider -} - func (sctx *SchedulingContext) SchedulingKeyFromLegacySchedulerJob(job interfaces.LegacySchedulerJob) schedulerobjects.SchedulingKey { var priority int32 if priorityClass, ok := sctx.PriorityClasses[job.GetPriorityClassName()]; ok { @@ -163,11 +150,17 @@ func (sctx *SchedulingContext) String() string { return sctx.ReportString(0) } +// GetQueue is necessary to implement the fairness.QueueRepository interface. +func (sctx *SchedulingContext) GetQueue(queue string) (fairness.Queue, bool) { + qctx, ok := sctx.QueueSchedulingContexts[queue] + return qctx, ok +} + // TotalCost returns the sum of the costs across all queues. func (sctx *SchedulingContext) TotalCost() float64 { var rv float64 for _, qctx := range sctx.QueueSchedulingContexts { - rv += qctx.TotalCostForQueue() + rv += sctx.FairnessCostProvider.CostFromQueue(qctx) } return rv } @@ -197,7 +190,7 @@ func (sctx *SchedulingContext) ReportString(verbosity int32) string { fmt.Fprint(w, "Scheduled queues:\n") for queueName, qctx := range scheduled { fmt.Fprintf(w, "\t%s:\n", queueName) - fmt.Fprintf(w, indent.String("\t\t", qctx.ReportString(verbosity-2))) + fmt.Fprint(w, indent.String("\t\t", qctx.ReportString(verbosity-2))) } } preempted := armadamaps.Filter( @@ -212,7 +205,7 @@ func (sctx *SchedulingContext) ReportString(verbosity int32) string { fmt.Fprint(w, "Preempted queues:\n") for queueName, qctx := range preempted { fmt.Fprintf(w, "\t%s:\n", queueName) - fmt.Fprintf(w, indent.String("\t\t", qctx.ReportString(verbosity-2))) + fmt.Fprint(w, indent.String("\t\t", qctx.ReportString(verbosity-2))) } } w.Flush() @@ -371,6 +364,16 @@ func (qctx *QueueSchedulingContext) String() string { return qctx.ReportString(0) } +// GetAllocation is necessary to implement the fairness.Queue interface. +func (qctx *QueueSchedulingContext) GetAllocation() schedulerobjects.ResourceList { + return qctx.Allocated +} + +// GetWeight is necessary to implement the fairness.Queue interface. 
+func (qctx *QueueSchedulingContext) GetWeight() float64 { + return qctx.Weight +} + const maxJobIdsToPrint = 1 func (qctx *QueueSchedulingContext) ReportString(verbosity int32) string { @@ -517,50 +520,6 @@ func (qctx *QueueSchedulingContext) ClearJobSpecs() { } } -// TotalCostForQueue returns the total cost of this queue. -func (qctx *QueueSchedulingContext) TotalCostForQueue() float64 { - return qctx.TotalCostForQueueWithAllocation(qctx.Allocated) -} - -// TotalCostForQueueWithAllocation returns the total cost of this queue if its total allocation is given by allocated. -func (qctx *QueueSchedulingContext) TotalCostForQueueWithAllocation(allocated schedulerobjects.ResourceList) float64 { - switch qctx.SchedulingContext.FairnessModel { - case configuration.AssetFairness: - return qctx.assetFairnessCostWithAllocation(allocated) - case configuration.DominantResourceFairness: - return qctx.dominantResourceFairnessCostWithAllocation(allocated) - default: - panic(fmt.Sprintf("unknown fairness type: %s", qctx.SchedulingContext.FairnessModel)) - } -} - -func (qctx *QueueSchedulingContext) assetFairnessCostWithAllocation(allocated schedulerobjects.ResourceList) float64 { - if len(qctx.SchedulingContext.ResourceScarcity) == 0 { - panic("ResourceScarcity is not set") - } - return float64(allocated.AsWeightedMillis(qctx.SchedulingContext.ResourceScarcity)) / qctx.Weight -} - -func (qctx *QueueSchedulingContext) dominantResourceFairnessCostWithAllocation(allocated schedulerobjects.ResourceList) float64 { - if len(qctx.SchedulingContext.DominantResourceFairnessResourcesToConsider) == 0 { - panic("DominantResourceFairnessResourcesToConsider is not set") - } - var cost float64 - for _, t := range qctx.SchedulingContext.DominantResourceFairnessResourcesToConsider { - capacity := qctx.SchedulingContext.TotalResources.Get(t) - if capacity.Equal(resource.Quantity{}) { - // Ignore any resources with zero capacity. 
- continue - } - q := allocated.Get(t) - tcost := float64(q.MilliValue()) / float64(capacity.MilliValue()) - if tcost > cost { - cost = tcost - } - } - return cost / qctx.Weight -} - type GangSchedulingContext struct { Created time.Time Queue string diff --git a/internal/scheduler/context/context_test.go b/internal/scheduler/context/context_test.go index 05e17c24068..3b932f30540 100644 --- a/internal/scheduler/context/context_test.go +++ b/internal/scheduler/context/context_test.go @@ -9,6 +9,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" armadaslices "github.com/armadaproject/armada/internal/common/slices" + "github.com/armadaproject/armada/internal/scheduler/fairness" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" "github.com/armadaproject/armada/internal/scheduler/testfixtures" ) @@ -33,13 +34,16 @@ func TestNewGangSchedulingContext(t *testing.T) { } func TestSchedulingContextAccounting(t *testing.T) { + totalResources := schedulerobjects.ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}} + fairnessCostProvider, err := fairness.NewAssetFairness(map[string]float64{"cpu": 1}) + require.NoError(t, err) sctx := NewSchedulingContext( "executor", "pool", testfixtures.TestPriorityClasses, testfixtures.TestDefaultPriorityClass, - map[string]float64{"cpu": 1}, - schedulerobjects.ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, + fairnessCostProvider, + totalResources, ) priorityFactorByQueue := map[string]float64{"A": 1, "B": 1} allocatedByQueueAndPriorityClass := map[string]schedulerobjects.QuantityByTAndResourceType[string]{ @@ -55,7 +59,7 @@ func TestSchedulingContextAccounting(t *testing.T) { expected := sctx.AllocatedByQueueAndPriority() jctxs := testNSmallCpuJobSchedulingContext("A", testfixtures.TestDefaultPriorityClass, 2) gctx := NewGangSchedulingContext(jctxs) - _, err := sctx.AddGangSchedulingContext(gctx) + _, err = sctx.AddGangSchedulingContext(gctx) require.NoError(t, err) for _, jctx := range jctxs { _, err := sctx.EvictJob(jctx.Job) diff --git a/internal/scheduler/database/db.go b/internal/scheduler/database/db.go index a54b3b29d0a..5af3de156f4 100644 --- a/internal/scheduler/database/db.go +++ b/internal/scheduler/database/db.go @@ -7,8 +7,8 @@ package database import ( "context" - "github.com/jackc/pgconn" - "github.com/jackc/pgx/v4" + "github.com/jackc/pgx/v5/pgconn" + "github.com/jackc/pgx/v5" ) type DBTX interface { diff --git a/internal/scheduler/database/db_pruner.go b/internal/scheduler/database/db_pruner.go index b8603af5fc4..728c3c9b71b 100644 --- a/internal/scheduler/database/db_pruner.go +++ b/internal/scheduler/database/db_pruner.go @@ -4,7 +4,7 @@ import ( ctx "context" "time" - "github.com/jackc/pgx/v4" + "github.com/jackc/pgx/v5" "github.com/pkg/errors" log "github.com/sirupsen/logrus" "k8s.io/apimachinery/pkg/util/clock" @@ -56,7 +56,7 @@ func PruneDb(ctx ctx.Context, db *pgx.Conn, batchLimit int, keepAfterCompletion for keepGoing { batchStart := time.Now() batchSize := 0 - err = db.BeginTxFunc(ctx, pgx.TxOptions{ + err := pgx.BeginTxFunc(ctx, db, pgx.TxOptions{ IsoLevel: pgx.ReadCommitted, AccessMode: pgx.ReadWrite, DeferrableMode: pgx.Deferrable, @@ -85,14 +85,12 @@ func PruneDb(ctx ctx.Context, db *pgx.Conn, batchLimit int, keepAfterCompletion DELETE FROM job_run_errors WHERE job_id in (SELECT job_id from batch); DELETE FROM rows_to_delete WHERE job_id in (SELECT job_id from batch); TRUNCATE TABLE batch;`) - if err != nil { - return err - } - return nil + 
return err }) if err != nil { return errors.Wrapf(err, "Error deleting batch from postgres") } + taken := time.Now().Sub(batchStart) jobsDeleted += batchSize log.Infof("Deleted %d jobs in %s. Deleted %d jobs out of %d", batchSize, taken, jobsDeleted, totalJobsToDelete) diff --git a/internal/scheduler/database/db_pruner_test.go b/internal/scheduler/database/db_pruner_test.go index 950b4ce987f..bd1165ed2d3 100644 --- a/internal/scheduler/database/db_pruner_test.go +++ b/internal/scheduler/database/db_pruner_test.go @@ -7,7 +7,7 @@ import ( "time" "github.com/google/uuid" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "k8s.io/apimachinery/pkg/util/clock" diff --git a/internal/scheduler/database/executor_repository.go b/internal/scheduler/database/executor_repository.go index 00b9a9c0f1f..c2da2442e54 100644 --- a/internal/scheduler/database/executor_repository.go +++ b/internal/scheduler/database/executor_repository.go @@ -5,7 +5,7 @@ import ( "time" "github.com/gogo/protobuf/proto" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" "github.com/armadaproject/armada/internal/common/compress" diff --git a/internal/scheduler/database/executor_repository_test.go b/internal/scheduler/database/executor_repository_test.go index d1de56769a3..2d7bd206512 100644 --- a/internal/scheduler/database/executor_repository_test.go +++ b/internal/scheduler/database/executor_repository_test.go @@ -5,7 +5,7 @@ import ( "testing" "time" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/exp/slices" diff --git a/internal/scheduler/database/job_repository.go b/internal/scheduler/database/job_repository.go index 3d4252ef2d0..3592da8d3c0 100644 --- a/internal/scheduler/database/job_repository.go +++ b/internal/scheduler/database/job_repository.go @@ -5,8 +5,8 @@ import ( "fmt" "github.com/google/uuid" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" "github.com/armadaproject/armada/internal/common/compress" @@ -82,7 +82,7 @@ func (r *PostgresJobRepository) FetchJobRunErrors(ctx context.Context, runIds [] errorsByRunId := make(map[uuid.UUID]*armadaevents.Error, len(runIds)) decompressor := compress.NewZlibDecompressor() - err := r.db.BeginTxFunc(ctx, pgx.TxOptions{ + err := pgx.BeginTxFunc(ctx, r.db, pgx.TxOptions{ IsoLevel: pgx.ReadCommitted, AccessMode: pgx.ReadWrite, DeferrableMode: pgx.Deferrable, @@ -130,7 +130,7 @@ func (r *PostgresJobRepository) FetchJobUpdates(ctx context.Context, jobSerial i var updatedRuns []Run = nil // Use a RepeatableRead transaction here so that we get consistency between jobs and dbRuns - err := r.db.BeginTxFunc(ctx, pgx.TxOptions{ + err := pgx.BeginTxFunc(ctx, r.db, pgx.TxOptions{ IsoLevel: pgx.RepeatableRead, AccessMode: pgx.ReadOnly, DeferrableMode: pgx.Deferrable, @@ -182,7 +182,7 @@ func (r *PostgresJobRepository) FetchJobUpdates(ctx context.Context, jobSerial i // Runs are inactive if they don't exist or if they have succeeded, failed or been cancelled func (r *PostgresJobRepository) FindInactiveRuns(ctx context.Context, runIds []uuid.UUID) ([]uuid.UUID, error) { var inactiveRuns []uuid.UUID - err := r.db.BeginTxFunc(ctx, pgx.TxOptions{ + err := pgx.BeginTxFunc(ctx, r.db, pgx.TxOptions{ IsoLevel: pgx.ReadCommitted, AccessMode: 
pgx.ReadWrite, DeferrableMode: pgx.Deferrable, @@ -223,7 +223,7 @@ func (r *PostgresJobRepository) FindInactiveRuns(ctx context.Context, runIds []u // in excludedRunIds will be excluded func (r *PostgresJobRepository) FetchJobRunLeases(ctx context.Context, executor string, maxResults uint, excludedRunIds []uuid.UUID) ([]*JobRunLease, error) { var newRuns []*JobRunLease - err := r.db.BeginTxFunc(ctx, pgx.TxOptions{ + err := pgx.BeginTxFunc(ctx, r.db, pgx.TxOptions{ IsoLevel: pgx.ReadCommitted, AccessMode: pgx.ReadWrite, DeferrableMode: pgx.Deferrable, diff --git a/internal/scheduler/database/job_repository_test.go b/internal/scheduler/database/job_repository_test.go index b5d0c9a70e5..b236618185b 100644 --- a/internal/scheduler/database/job_repository_test.go +++ b/internal/scheduler/database/job_repository_test.go @@ -7,7 +7,7 @@ import ( "time" "github.com/google/uuid" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" diff --git a/internal/scheduler/database/sql.yaml b/internal/scheduler/database/sql.yaml index cb53606921b..685544b0b76 100644 --- a/internal/scheduler/database/sql.yaml +++ b/internal/scheduler/database/sql.yaml @@ -8,7 +8,7 @@ sql: go: out: "." package: "database" - sql_package: "pgx/v4" + sql_package: "pgx/v5" emit_prepared_queries: true emit_db_tags: true emit_interface: false \ No newline at end of file diff --git a/internal/scheduler/database/util.go b/internal/scheduler/database/util.go index 266efb53b08..d6539a2a743 100644 --- a/internal/scheduler/database/util.go +++ b/internal/scheduler/database/util.go @@ -6,8 +6,7 @@ import ( _ "embed" "time" - "github.com/jackc/pgtype/pgxtype" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5/pgxpool" log "github.com/sirupsen/logrus" "github.com/armadaproject/armada/internal/common/database" @@ -16,7 +15,7 @@ import ( //go:embed migrations/*.sql var fs embed.FS -func Migrate(ctx context.Context, db pgxtype.Querier) error { +func Migrate(ctx context.Context, db database.Querier) error { start := time.Now() migrations, err := database.ReadMigrations(fs, "migrations") if err != nil { diff --git a/internal/scheduler/fairness/fairness.go b/internal/scheduler/fairness/fairness.go new file mode 100644 index 00000000000..36509d6562b --- /dev/null +++ b/internal/scheduler/fairness/fairness.go @@ -0,0 +1,86 @@ +package fairness + +import ( + "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/api/resource" + + "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" +) + +// QueueRepository is a minimal representation of a queue repository used for computing fairness. +type QueueRepository interface { + GetQueue(name string) (Queue, bool) +} + +// Queue is a minimal representation of a queue used for computing fairness. +type Queue interface { + // GetAllocation returns the current allocation of the queue. + GetAllocation() schedulerobjects.ResourceList + GetWeight() float64 +} + +// FairnessCostProvider captures algorithms to compute the cost of an allocation. +type FairnessCostProvider interface { + CostFromQueue(queue Queue) float64 + CostFromAllocationAndWeight(allocation schedulerobjects.ResourceList, weight float64) float64 +} + +type AssetFairness struct { + // Weights used when computing asset fairness. 
+ resourceScarcity map[string]float64 +} + +func NewAssetFairness(resourceScarcity map[string]float64) (*AssetFairness, error) { + if len(resourceScarcity) == 0 { + return nil, errors.New("resourceScarcity is empty") + } + return &AssetFairness{ + resourceScarcity: resourceScarcity, + }, nil +} + +func (f *AssetFairness) CostFromQueue(queue Queue) float64 { + return f.CostFromAllocationAndWeight(queue.GetAllocation(), queue.GetWeight()) +} + +func (f *AssetFairness) CostFromAllocationAndWeight(allocation schedulerobjects.ResourceList, weight float64) float64 { + return float64(allocation.AsWeightedMillis(f.resourceScarcity)) / weight +} + +type DominantResourceFairness struct { + // Total resources across all nodes. + totalResources schedulerobjects.ResourceList + // Resources considered when computing DominantResourceFairness. + resourcesToConsider []string +} + +func NewDominantResourceFairness(totalResources schedulerobjects.ResourceList, resourcesToConsider []string) (*DominantResourceFairness, error) { + if len(resourcesToConsider) == 0 { + return nil, errors.New("resourcesToConsider is empty") + } + return &DominantResourceFairness{ + totalResources: totalResources, + resourcesToConsider: resourcesToConsider, + }, nil +} + +func (f *DominantResourceFairness) CostFromQueue(queue Queue) float64 { + return f.CostFromAllocationAndWeight(queue.GetAllocation(), queue.GetWeight()) +} + +func (f *DominantResourceFairness) CostFromAllocationAndWeight(allocation schedulerobjects.ResourceList, weight float64) float64 { + var cost float64 + for _, t := range f.resourcesToConsider { + capacity := f.totalResources.Get(t) + if capacity.Equal(resource.Quantity{}) { + // Ignore any resources with zero capacity. + continue + } + q := allocation.Get(t) + tcost := float64(q.MilliValue()) / float64(capacity.MilliValue()) + if tcost > cost { + cost = tcost + } + } + return cost / weight +} diff --git a/internal/scheduler/fairness/fairness_test.go b/internal/scheduler/fairness/fairness_test.go new file mode 100644 index 00000000000..28791870aa8 --- /dev/null +++ b/internal/scheduler/fairness/fairness_test.go @@ -0,0 +1,273 @@ +package fairness + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "k8s.io/apimachinery/pkg/api/resource" + + "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" +) + +type MinimalQueue struct { + allocation schedulerobjects.ResourceList + weight float64 +} + +func (q MinimalQueue) GetAllocation() schedulerobjects.ResourceList { + return q.allocation +} + +func (q MinimalQueue) GetWeight() float64 { + return q.weight +} + +func TestNewAssetFairness(t *testing.T) { + _, err := NewAssetFairness(map[string]float64{}) + require.Error(t, err) +} + +func TestAssetFairness(t *testing.T) { + tests := map[string]struct { + resourceScarcity map[string]float64 + allocation schedulerobjects.ResourceList + weight float64 + expectedCost float64 + }{ + "single resource 1": { + resourceScarcity: map[string]float64{ + "foo": 1, + "bar": 2, + "baz": 3, + }, + allocation: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("0.5"), + }, + }, + weight: 1.0, + expectedCost: 0.5, + }, + "single resource 2": { + resourceScarcity: map[string]float64{ + "foo": 1, + "bar": 2, + "baz": 3, + }, + allocation: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "bar": resource.MustParse("0.5"), + }, + }, + weight: 1.0, + expectedCost: 1.0, + }, + "multiple resources": 
{ + resourceScarcity: map[string]float64{ + "foo": 1, + "bar": 2, + "baz": 3, + }, + allocation: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("0.5"), + "bar": resource.MustParse("1"), + }, + }, + weight: 1.0, + expectedCost: 2.5, + }, + "considered resources": { + resourceScarcity: map[string]float64{ + "foo": 1, + "bar": 2, + "baz": 3, + }, + allocation: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("0.5"), + "bar": resource.MustParse("1"), + "doesNotExist": resource.MustParse("1"), + }, + }, + weight: 1.0, + expectedCost: 2.5, + }, + "weight": { + resourceScarcity: map[string]float64{ + "foo": 1, + "bar": 2, + "baz": 3, + }, + allocation: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("0.5"), + "baz": resource.MustParse("2"), + }, + }, + weight: 2.0, + expectedCost: 6.5 / 2, + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + f, err := NewAssetFairness(tc.resourceScarcity) + require.NoError(t, err) + assert.Equal( + t, + 1000*tc.expectedCost, // Convert to millis. + f.CostFromAllocationAndWeight(tc.allocation, tc.weight), + ) + assert.Equal( + t, + f.CostFromAllocationAndWeight(tc.allocation, tc.weight), + f.CostFromQueue(MinimalQueue{allocation: tc.allocation, weight: tc.weight}), + ) + }) + } +} + +func TestNewDominantResourceFairness(t *testing.T) { + _, err := NewDominantResourceFairness( + schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("1"), + }, + }, + []string{}, + ) + require.Error(t, err) +} + +func TestDominantResourceFairness(t *testing.T) { + tests := map[string]struct { + totalResources schedulerobjects.ResourceList + resourcesToConsider []string + allocation schedulerobjects.ResourceList + weight float64 + expectedCost float64 + }{ + "single resource 1": { + totalResources: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("1"), + "bar": resource.MustParse("2"), + "baz": resource.MustParse("3"), + }, + }, + resourcesToConsider: []string{"foo", "bar"}, + allocation: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("0.5"), + }, + }, + weight: 1.0, + expectedCost: 0.5, + }, + "single resource 2": { + totalResources: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("1"), + "bar": resource.MustParse("2"), + "baz": resource.MustParse("3"), + }, + }, + resourcesToConsider: []string{"foo", "bar"}, + allocation: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "bar": resource.MustParse("0.5"), + }, + }, + weight: 1.0, + expectedCost: 0.25, + }, + "multiple resources": { + totalResources: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("1"), + "bar": resource.MustParse("2"), + "baz": resource.MustParse("3"), + }, + }, + resourcesToConsider: []string{"foo", "bar"}, + allocation: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("0.5"), + "bar": resource.MustParse("1.1"), + }, + }, + weight: 1.0, + expectedCost: 1.1 / 2, + }, + "considered resources": { + totalResources: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("1"), + "bar": resource.MustParse("2"), + "baz": resource.MustParse("3"), + }, + }, + 
resourcesToConsider: []string{"foo", "bar"}, + allocation: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("0.5"), + "baz": resource.MustParse("3"), + }, + }, + weight: 1.0, + expectedCost: 0.5, + }, + "zero available resource": { + totalResources: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("1"), + "bar": resource.MustParse("0"), + "baz": resource.MustParse("3"), + }, + }, + resourcesToConsider: []string{"foo", "bar"}, + allocation: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("0.5"), + "bar": resource.MustParse("2.0"), + }, + }, + weight: 1.0, + expectedCost: 0.5, + }, + "weight": { + totalResources: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("1"), + "bar": resource.MustParse("2"), + "baz": resource.MustParse("3"), + }, + }, + resourcesToConsider: []string{"foo", "bar"}, + allocation: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("0.5"), + }, + }, + weight: 2.0, + expectedCost: 0.25, + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + f, err := NewDominantResourceFairness(tc.totalResources, tc.resourcesToConsider) + require.NoError(t, err) + assert.Equal( + t, + tc.expectedCost, + f.CostFromAllocationAndWeight(tc.allocation, tc.weight), + ) + assert.Equal( + t, + f.CostFromAllocationAndWeight(tc.allocation, tc.weight), + f.CostFromQueue(MinimalQueue{allocation: tc.allocation, weight: tc.weight}), + ) + }) + } +} diff --git a/internal/scheduler/gang_scheduler_test.go b/internal/scheduler/gang_scheduler_test.go index ea7153b077a..bab895a0421 100644 --- a/internal/scheduler/gang_scheduler_test.go +++ b/internal/scheduler/gang_scheduler_test.go @@ -12,6 +12,7 @@ import ( armadaslices "github.com/armadaproject/armada/internal/common/slices" schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints" schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" + "github.com/armadaproject/armada/internal/scheduler/fairness" "github.com/armadaproject/armada/internal/scheduler/jobdb" "github.com/armadaproject/armada/internal/scheduler/nodedb" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" @@ -327,12 +328,18 @@ func TestGangScheduler(t *testing.T) { priorityFactorByQueue[job.GetQueue()] = 1 } } + + fairnessCostProvider, err := fairness.NewDominantResourceFairness( + tc.TotalResources, + tc.SchedulingConfig.DominantResourceFairnessResourcesToConsider, + ) + require.NoError(t, err) sctx := schedulercontext.NewSchedulingContext( "executor", "pool", tc.SchedulingConfig.Preemption.PriorityClasses, tc.SchedulingConfig.Preemption.DefaultPriorityClass, - tc.SchedulingConfig.ResourceScarcity, + fairnessCostProvider, tc.TotalResources, ) for queue, priorityFactor := range priorityFactorByQueue { diff --git a/internal/scheduler/jobdb/job.go b/internal/scheduler/jobdb/job.go index c773aa6514f..f75d0dab3a7 100644 --- a/internal/scheduler/jobdb/job.go +++ b/internal/scheduler/jobdb/job.go @@ -3,6 +3,7 @@ package jobdb import ( "time" + "github.com/gogo/protobuf/proto" "github.com/google/uuid" "golang.org/x/exp/maps" v1 "k8s.io/api/core/v1" @@ -431,6 +432,21 @@ func (job *Job) WithJobSchedulingInfo(jobSchedulingInfo *schedulerobjects.JobSch return j } +func (job *Job) DeepCopy() *Job { + copiedSchedulingInfo := 
proto.Clone(job.JobSchedulingInfo()).(*schedulerobjects.JobSchedulingInfo) + j := job.WithJobSchedulingInfo(copiedSchedulingInfo) + + j.runsById = maps.Clone(j.runsById) + for key, run := range j.runsById { + j.runsById[key] = run.DeepCopy() + } + if j.activeRun != nil { + j.activeRun = job.activeRun.DeepCopy() + } + + return j +} + // copyJob makes a copy of the job func copyJob(j Job) *Job { return &j @@ -441,20 +457,16 @@ type JobPriorityComparer struct{} // Compare jobs first by priority then by created and finally by id. // returns -1 if a should come before b, 1 if a should come after b and 0 if the two jobs are equal func (j JobPriorityComparer) Compare(a, b *Job) int { - if a == b { - return 0 - } - - // Compare the jobs by priority + // Compare the jobs by priority. if a.priority != b.priority { - if a.priority > b.priority { + if a.priority < b.priority { return -1 } else { return 1 } } - // If the jobs have the same priority, compare them by created timestamp + // If the jobs have the same priority, compare them by created timestamp. if a.created != b.created { if a.created < b.created { return -1 @@ -463,7 +475,7 @@ func (j JobPriorityComparer) Compare(a, b *Job) int { } } - // If the jobs have the same priority and created timestamp, compare them by ID + // If the jobs have the same priority and created timestamp, compare them by id. if a.id != b.id { if a.id < b.id { return -1 @@ -472,6 +484,6 @@ func (j JobPriorityComparer) Compare(a, b *Job) int { } } - // If the jobs have the same ID, return 0 + // Jobs are equal; return 0. return 0 } diff --git a/internal/scheduler/jobdb/job_test.go b/internal/scheduler/jobdb/job_test.go index 89f403481f9..2b3df48fb95 100644 --- a/internal/scheduler/jobdb/job_test.go +++ b/internal/scheduler/jobdb/job_test.go @@ -290,6 +290,26 @@ func TestJob_TestWithCreated(t *testing.T) { assert.Equal(t, int64(456), newJob.Created()) } +func TestJob_DeepCopy(t *testing.T) { + original := NewJob("test-job", "test-jobset", "test-queue", 2, schedulingInfo, true, 0, false, false, false, 3) + original = original.WithUpdatedRun(baseJobRun.DeepCopy()) + expected := NewJob("test-job", "test-jobset", "test-queue", 2, schedulingInfo, true, 0, false, false, false, 3) + expected = expected.WithUpdatedRun(baseJobRun.DeepCopy()) + + result := original.DeepCopy() + assert.Equal(t, expected, result) + assert.Equal(t, expected, original) + + // Modify and confirm original hasn't changed + result.activeRun.nodeName = "test" + result.runsById[baseJobRun.id].nodeName = "test" + result.queue = "test" + result.jobSchedulingInfo.Priority = 1 + + assert.NotEqual(t, expected, result) + assert.Equal(t, expected, original) +} + func TestJob_TestWithJobSchedulingInfo(t *testing.T) { newSchedInfo := &schedulerobjects.JobSchedulingInfo{ ObjectRequirements: []*schedulerobjects.ObjectRequirements{ @@ -346,8 +366,8 @@ func TestJobPriorityComparer(t *testing.T) { comparer := JobPriorityComparer{} assert.Equal(t, 0, comparer.Compare(job1, job1)) - assert.Equal(t, -1, comparer.Compare(job1, job1.WithPriority(9))) + assert.Equal(t, 1, comparer.Compare(job1, job1.WithPriority(9))) assert.Equal(t, -1, comparer.Compare(job1, job1.WithCreated(6))) - assert.Equal(t, 1, comparer.Compare(job1, job1.WithPriority(11))) + assert.Equal(t, -1, comparer.Compare(job1, job1.WithPriority(11))) assert.Equal(t, 1, comparer.Compare(job1, job1.WithCreated(4))) } diff --git a/internal/scheduler/jobdb/jobdb.go b/internal/scheduler/jobdb/jobdb.go index 2fd1e0c9f87..700627c67a6 100644 --- 
a/internal/scheduler/jobdb/jobdb.go +++ b/internal/scheduler/jobdb/jobdb.go @@ -12,8 +12,8 @@ import ( var emptyList = immutable.NewSortedSet[*Job](JobPriorityComparer{}) type JobDb struct { - jobsById map[string]*Job - jobsByRunId map[uuid.UUID]string + jobsById *immutable.Map[string, *Job] + jobsByRunId *immutable.Map[uuid.UUID, string] jobsByQueue map[string]immutable.SortedSet[*Job] copyMutex sync.Mutex writerMutex sync.Mutex @@ -21,8 +21,8 @@ type JobDb struct { func NewJobDb() *JobDb { return &JobDb{ - jobsById: map[string]*Job{}, - jobsByRunId: map[uuid.UUID]string{}, + jobsById: immutable.NewMap[string, *Job](nil), + jobsByRunId: immutable.NewMap[uuid.UUID, string](&UUIDHasher{}), jobsByQueue: map[string]immutable.SortedSet[*Job]{}, copyMutex: sync.Mutex{}, } @@ -33,41 +33,92 @@ func (jobDb *JobDb) Upsert(txn *Txn, jobs []*Job) error { if err := jobDb.checkWritableTransaction(txn); err != nil { return err } - for _, job := range jobs { - existingJob := txn.jobsById[job.id] - if existingJob != nil { - existingQueue, ok := txn.jobsByQueue[existingJob.queue] + + hasJobs := txn.jobsById.Len() > 0 + + // First we need to delete the state of any queued jobs + if hasJobs { + for _, job := range jobs { + existingJob, ok := txn.jobsById.Get(job.id) if ok { - txn.jobsByQueue[existingJob.queue] = existingQueue.Delete(existingJob) + existingQueue, ok := txn.jobsByQueue[existingJob.queue] + if ok { + txn.jobsByQueue[existingJob.queue] = existingQueue.Delete(existingJob) + } } } - txn.jobsById[job.id] = job - for _, run := range job.runsById { - txn.jobsByRunId[run.id] = job.id + } + + // Now need to insert jobs, runs and queuedJobs. This can be done in parallel + wg := sync.WaitGroup{} + wg.Add(3) + + // jobs + go func() { + defer wg.Done() + if hasJobs { + for _, job := range jobs { + txn.jobsById = txn.jobsById.Set(job.id, job) + } + } else { + jobsById := immutable.NewMapBuilder[string, *Job](nil) + for _, job := range jobs { + jobsById.Set(job.id, job) + } + txn.jobsById = jobsById.Map() } - if job.Queued() { - newQueue, ok := txn.jobsByQueue[job.queue] - if !ok { - q := emptyList - newQueue = q + }() + + // runs + go func() { + defer wg.Done() + if hasJobs { + for _, job := range jobs { + for _, run := range job.runsById { + txn.jobsByRunId = txn.jobsByRunId.Set(run.id, job.id) + } + } + } else { + jobsByRunId := immutable.NewMapBuilder[uuid.UUID, string](&UUIDHasher{}) + for _, job := range jobs { + for _, run := range job.runsById { + jobsByRunId.Set(run.id, job.id) + } } - newQueue = newQueue.Add(job) - txn.jobsByQueue[job.queue] = newQueue + txn.jobsByRunId = jobsByRunId.Map() } - } + }() + + // queued Jobs + go func() { + defer wg.Done() + for _, job := range jobs { + if job.Queued() { + newQueue, ok := txn.jobsByQueue[job.queue] + if !ok { + q := emptyList + newQueue = q + } + newQueue = newQueue.Add(job) + txn.jobsByQueue[job.queue] = newQueue + } + } + }() + wg.Wait() return nil } // GetById returns the job with the given Id or nil if no such job exists // The Job returned by this function *must not* be subsequently modified func (jobDb *JobDb) GetById(txn *Txn, id string) *Job { - return txn.jobsById[id] + j, _ := txn.jobsById.Get(id) + return j } // GetByRunId returns the job with the given run id or nil if no such job exists // The Job returned by this function *must not* be subsequently modified func (jobDb *JobDb) GetByRunId(txn *Txn, runId uuid.UUID) *Job { - jobId := txn.jobsByRunId[runId] + jobId, _ := txn.jobsByRunId.Get(runId) return jobDb.GetById(txn, jobId) } @@ -93,21 
+144,27 @@ func (jobDb *JobDb) QueuedJobs(txn *Txn, queue string) *immutable.SortedSetItera // GetAll returns all jobs in the database. // The Jobs returned by this function *must not* be subsequently modified func (jobDb *JobDb) GetAll(txn *Txn) []*Job { - return maps.Values(txn.jobsById) + allJobs := make([]*Job, 0, txn.jobsById.Len()) + iter := txn.jobsById.Iterator() + for !iter.Done() { + _, job, _ := iter.Next() + allJobs = append(allJobs, job) + } + return allJobs } -// BatchDelete removes the jobs with the given ids from the database. Any ids that are not in the database will be -// ignored +// BatchDelete deletes the jobs with the given ids from the database. +// Any ids not in the database are ignored. func (jobDb *JobDb) BatchDelete(txn *Txn, ids []string) error { if err := jobDb.checkWritableTransaction(txn); err != nil { return err } for _, id := range ids { - job, present := txn.jobsById[id] + job, present := txn.jobsById.Get(id) if present { - delete(txn.jobsById, id) + txn.jobsById = txn.jobsById.Delete(id) for _, run := range job.runsById { - delete(txn.jobsByRunId, run.id) + txn.jobsByRunId = txn.jobsByRunId.Delete(run.id) } queue, ok := txn.jobsByQueue[job.queue] if ok { @@ -153,8 +210,8 @@ func (jobDb *JobDb) WriteTxn() *Txn { defer jobDb.copyMutex.Unlock() return &Txn{ readOnly: false, - jobsById: maps.Clone(jobDb.jobsById), - jobsByRunId: maps.Clone(jobDb.jobsByRunId), + jobsById: jobDb.jobsById, + jobsByRunId: jobDb.jobsByRunId, jobsByQueue: maps.Clone(jobDb.jobsByQueue), active: true, jobDb: jobDb, @@ -167,8 +224,8 @@ func (jobDb *JobDb) WriteTxn() *Txn { // until the transaction is committed. type Txn struct { readOnly bool - jobsById map[string]*Job - jobsByRunId map[uuid.UUID]string + jobsById *immutable.Map[string, *Job] + jobsByRunId *immutable.Map[uuid.UUID, string] jobsByQueue map[string]immutable.SortedSet[*Job] jobDb *JobDb active bool diff --git a/internal/scheduler/jobdb/jobdb_test.go b/internal/scheduler/jobdb/jobdb_test.go index e55fb63f18c..3d73d0bdd1f 100644 --- a/internal/scheduler/jobdb/jobdb_test.go +++ b/internal/scheduler/jobdb/jobdb_test.go @@ -91,7 +91,8 @@ func TestJobDb_TestQueuedJobs(t *testing.T) { jobs := make([]*Job, 10) for i := 0; i < len(jobs); i++ { jobs[i] = newJob().WithQueued(true) - jobs[i].created = int64(i) // forces an order + jobs[i].priority = 1000 + jobs[i].created = int64(i) // Ensures jobs are ordered. 
} shuffledJobs := slices.Clone(jobs) rand.Shuffle(len(shuffledJobs), func(i, j int) { shuffledJobs[i], shuffledJobs[j] = shuffledJobs[j], jobs[i] }) @@ -122,7 +123,7 @@ func TestJobDb_TestQueuedJobs(t *testing.T) { assert.Equal(t, []*Job{jobs[0], jobs[2], jobs[6], jobs[8], jobs[9]}, collect()) // change the priority of a job to put it to the front of the queue - updatedJob := jobs[8].WithPriority(100) + updatedJob := jobs[8].WithPriority(0) err = jobDb.Upsert(txn, []*Job{updatedJob}) require.NoError(t, err) assert.Equal(t, []*Job{updatedJob, jobs[0], jobs[2], jobs[6], jobs[9]}, collect()) @@ -175,7 +176,7 @@ func TestJobDb_TestTransactions(t *testing.T) { txn3 := jobDb.ReadTxn() assert.NotNil(t, jobDb.GetById(txn3, job.id)) - assert.Error(t, jobDb.Upsert(txn1, []*Job{job})) // should be error as you can't insert after commmiting + assert.Error(t, jobDb.Upsert(txn1, []*Job{job})) // should be error as you can't insert after committing } func TestJobDb_TestBatchDelete(t *testing.T) { diff --git a/internal/scheduler/jobdb/uuid_hasher.go b/internal/scheduler/jobdb/uuid_hasher.go new file mode 100644 index 00000000000..81d9216e4e0 --- /dev/null +++ b/internal/scheduler/jobdb/uuid_hasher.go @@ -0,0 +1,24 @@ +package jobdb + +import ( + "bytes" + + "github.com/google/uuid" +) + +// UUIDHasher is an implementation of Hasher for UUID. +type UUIDHasher struct{} + +// Hash computes a hash for a UUID. +func (h UUIDHasher) Hash(key uuid.UUID) uint32 { + var hash uint32 + for _, b := range key { + hash = hash*31 + uint32(b) + } + return hash +} + +// Equal checks if two UUIDs are equal. +func (h UUIDHasher) Equal(a, b uuid.UUID) bool { + return bytes.Equal(a[:], b[:]) +} diff --git a/internal/scheduler/jobdb/uuid_hasher_test.go b/internal/scheduler/jobdb/uuid_hasher_test.go new file mode 100644 index 00000000000..6b3e7dcad2d --- /dev/null +++ b/internal/scheduler/jobdb/uuid_hasher_test.go @@ -0,0 +1,74 @@ +package jobdb + +import ( + "testing" + + "github.com/google/uuid" +) + +func TestUUIDHasher_Hash(t *testing.T) { + hasher := UUIDHasher{} + + tests := []struct { + name string + key uuid.UUID + }{ + { + name: "Test with zero UUID", + key: uuid.UUID{}, + }, + { + name: "Test with random UUID", + key: uuid.New(), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := hasher.Hash(tt.key) + + // Assert the hash value is non-negative (a simple check) + if got < 0 { + t.Errorf("Expected non-negative hash, but got %v", got) + } + }) + } +} + +func TestUUIDHasher_Equal(t *testing.T) { + hasher := UUIDHasher{} + + tests := []struct { + name string + a, b uuid.UUID + want bool + }{ + { + name: "Test with two zero UUIDs", + a: uuid.UUID{}, + b: uuid.UUID{}, + want: true, + }, + { + name: "Test with two different UUIDs", + a: uuid.New(), + b: uuid.New(), + want: false, + }, + { + name: "Test with two same UUIDs", + a: uuid.New(), + b: uuid.MustParse("f47ac10b-58cc-4372-a567-0e02b2c3d479"), // Example UUID, replace with any fixed UUID + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := hasher.Equal(tt.a, tt.b) + if got != tt.want { + t.Errorf("Expected %v, but got %v", tt.want, got) + } + }) + } +} diff --git a/internal/scheduler/metrics.go b/internal/scheduler/metrics.go index a972e25de76..15da0d6c478 100644 --- a/internal/scheduler/metrics.go +++ b/internal/scheduler/metrics.go @@ -223,6 +223,8 @@ func (c *MetricsCollector) updateClusterMetrics(ctx context.Context) ([]promethe usedResourceByQueue := 
map[queueMetricKey]schedulerobjects.ResourceList{} availableResourceByCluster := map[clusterMetricKey]schedulerobjects.ResourceList{} totalResourceByCluster := map[clusterMetricKey]schedulerobjects.ResourceList{} + schedulableNodeCountByCluster := map[clusterMetricKey]int{} + totalNodeCountByCluster := map[clusterMetricKey]int{} txn := c.jobDb.ReadTxn() for _, executor := range executors { @@ -234,8 +236,10 @@ func (c *MetricsCollector) updateClusterMetrics(ctx context.Context) ([]promethe } if !node.Unschedulable { addToResourceListMap(availableResourceByCluster, clusterKey, node.AvailableArmadaResource()) + schedulableNodeCountByCluster[clusterKey]++ } addToResourceListMap(totalResourceByCluster, clusterKey, node.TotalResources) + totalNodeCountByCluster[clusterKey]++ for queueName, resourceUsage := range node.ResourceUsageByQueue { queueKey := queueMetricKey{ @@ -300,6 +304,12 @@ func (c *MetricsCollector) updateClusterMetrics(ctx context.Context) ([]promethe clusterMetrics = append(clusterMetrics, commonmetrics.NewClusterTotalCapacity(resource.QuantityAsFloat64(resourceValue), k.cluster, k.pool, resourceKey, k.nodeType)) } } + for k, v := range schedulableNodeCountByCluster { + clusterMetrics = append(clusterMetrics, commonmetrics.NewClusterAvailableCapacity(float64(v), k.cluster, k.pool, "nodes", k.nodeType)) + } + for k, v := range totalNodeCountByCluster { + clusterMetrics = append(clusterMetrics, commonmetrics.NewClusterTotalCapacity(float64(v), k.cluster, k.pool, "nodes", k.nodeType)) + } return clusterMetrics, nil } diff --git a/internal/scheduler/metrics_test.go b/internal/scheduler/metrics_test.go index 8a6f597284a..52c89eb6641 100644 --- a/internal/scheduler/metrics_test.go +++ b/internal/scheduler/metrics_test.go @@ -163,8 +163,10 @@ func TestMetricsCollector_TestCollect_ClusterMetrics(t *testing.T) { expected: []prometheus.Metric{ commonmetrics.NewClusterAvailableCapacity(64, "cluster-1", testfixtures.TestPool, "cpu", "type-1"), commonmetrics.NewClusterAvailableCapacity(512*1024*1024*1024, "cluster-1", testfixtures.TestPool, "memory", "type-1"), + commonmetrics.NewClusterAvailableCapacity(2, "cluster-1", testfixtures.TestPool, "nodes", "type-1"), commonmetrics.NewClusterTotalCapacity(64, "cluster-1", testfixtures.TestPool, "cpu", "type-1"), commonmetrics.NewClusterTotalCapacity(512*1024*1024*1024, "cluster-1", testfixtures.TestPool, "memory", "type-1"), + commonmetrics.NewClusterTotalCapacity(2, "cluster-1", testfixtures.TestPool, "nodes", "type-1"), }, }, "empty cluster multi node type": { @@ -173,12 +175,16 @@ func TestMetricsCollector_TestCollect_ClusterMetrics(t *testing.T) { expected: []prometheus.Metric{ commonmetrics.NewClusterAvailableCapacity(32, "cluster-1", testfixtures.TestPool, "cpu", "type-1"), commonmetrics.NewClusterAvailableCapacity(256*1024*1024*1024, "cluster-1", testfixtures.TestPool, "memory", "type-1"), + commonmetrics.NewClusterAvailableCapacity(1, "cluster-1", testfixtures.TestPool, "nodes", "type-1"), commonmetrics.NewClusterAvailableCapacity(32, "cluster-1", testfixtures.TestPool, "cpu", "type-2"), commonmetrics.NewClusterAvailableCapacity(256*1024*1024*1024, "cluster-1", testfixtures.TestPool, "memory", "type-2"), + commonmetrics.NewClusterAvailableCapacity(1, "cluster-1", testfixtures.TestPool, "nodes", "type-2"), commonmetrics.NewClusterTotalCapacity(32, "cluster-1", testfixtures.TestPool, "cpu", "type-1"), commonmetrics.NewClusterTotalCapacity(256*1024*1024*1024, "cluster-1", testfixtures.TestPool, "memory", "type-1"), + 
commonmetrics.NewClusterTotalCapacity(1, "cluster-1", testfixtures.TestPool, "nodes", "type-1"), commonmetrics.NewClusterTotalCapacity(32, "cluster-1", testfixtures.TestPool, "cpu", "type-2"), commonmetrics.NewClusterTotalCapacity(256*1024*1024*1024, "cluster-1", testfixtures.TestPool, "memory", "type-2"), + commonmetrics.NewClusterTotalCapacity(1, "cluster-1", testfixtures.TestPool, "nodes", "type-2"), }, }, "empty cluster with unschedulable node": { @@ -187,8 +193,10 @@ func TestMetricsCollector_TestCollect_ClusterMetrics(t *testing.T) { expected: []prometheus.Metric{ commonmetrics.NewClusterAvailableCapacity(32, "cluster-1", testfixtures.TestPool, "cpu", "type-1"), commonmetrics.NewClusterAvailableCapacity(256*1024*1024*1024, "cluster-1", testfixtures.TestPool, "memory", "type-1"), + commonmetrics.NewClusterAvailableCapacity(1, "cluster-1", testfixtures.TestPool, "nodes", "type-1"), commonmetrics.NewClusterTotalCapacity(64, "cluster-1", testfixtures.TestPool, "cpu", "type-1"), commonmetrics.NewClusterTotalCapacity(512*1024*1024*1024, "cluster-1", testfixtures.TestPool, "memory", "type-1"), + commonmetrics.NewClusterTotalCapacity(2, "cluster-1", testfixtures.TestPool, "nodes", "type-1"), }, }, "cluster with jobs": { @@ -203,8 +211,10 @@ func TestMetricsCollector_TestCollect_ClusterMetrics(t *testing.T) { commonmetrics.NewQueueUsed(1*1024*1024*1024, testfixtures.TestQueue, "cluster-1", testfixtures.TestPool, "memory", "type-1"), commonmetrics.NewClusterAvailableCapacity(32, "cluster-1", testfixtures.TestPool, "cpu", "type-1"), commonmetrics.NewClusterAvailableCapacity(256*1024*1024*1024, "cluster-1", testfixtures.TestPool, "memory", "type-1"), + commonmetrics.NewClusterAvailableCapacity(1, "cluster-1", testfixtures.TestPool, "nodes", "type-1"), commonmetrics.NewClusterTotalCapacity(32, "cluster-1", testfixtures.TestPool, "cpu", "type-1"), commonmetrics.NewClusterTotalCapacity(256*1024*1024*1024, "cluster-1", testfixtures.TestPool, "memory", "type-1"), + commonmetrics.NewClusterTotalCapacity(1, "cluster-1", testfixtures.TestPool, "nodes", "type-1"), }, }, "jobs missing from jobDb": { @@ -215,8 +225,10 @@ func TestMetricsCollector_TestCollect_ClusterMetrics(t *testing.T) { commonmetrics.NewQueueUsed(1*1024*1024*1024, testfixtures.TestQueue, "cluster-1", testfixtures.TestPool, "memory", "type-1"), commonmetrics.NewClusterAvailableCapacity(32, "cluster-1", testfixtures.TestPool, "cpu", "type-1"), commonmetrics.NewClusterAvailableCapacity(256*1024*1024*1024, "cluster-1", testfixtures.TestPool, "memory", "type-1"), + commonmetrics.NewClusterAvailableCapacity(1, "cluster-1", testfixtures.TestPool, "nodes", "type-1"), commonmetrics.NewClusterTotalCapacity(32, "cluster-1", testfixtures.TestPool, "cpu", "type-1"), commonmetrics.NewClusterTotalCapacity(256*1024*1024*1024, "cluster-1", testfixtures.TestPool, "memory", "type-1"), + commonmetrics.NewClusterTotalCapacity(1, "cluster-1", testfixtures.TestPool, "nodes", "type-1"), }, }, } diff --git a/internal/scheduler/nodedb/encoding.go b/internal/scheduler/nodedb/encoding.go index b5ea044b880..a5d88716e80 100644 --- a/internal/scheduler/nodedb/encoding.go +++ b/internal/scheduler/nodedb/encoding.go @@ -25,7 +25,7 @@ func NodeIndexKey(out []byte, nodeTypeId uint64, resources []resource.Quantity) return out } -// RoundedNodeIndexKeyFromResourceList works like NodeIndexKey, except that prior to constructing a the key +// RoundedNodeIndexKeyFromResourceList works like NodeIndexKey, except that prior to constructing the key // the i-th resource is rounded 
down to the closest multiple of resourceResolutionMillis[i]. // It also takes as arguments a list of resource names and a resourceList, instead of a list of resources. func RoundedNodeIndexKeyFromResourceList(out []byte, nodeTypeId uint64, resourceNames []string, resourceResolutionMillis []int64, rl schedulerobjects.ResourceList) []byte { diff --git a/internal/scheduler/nodedb/nodedb.go b/internal/scheduler/nodedb/nodedb.go index 87aab622a57..a2351d1e75f 100644 --- a/internal/scheduler/nodedb/nodedb.go +++ b/internal/scheduler/nodedb/nodedb.go @@ -18,6 +18,7 @@ import ( "github.com/armadaproject/armada/internal/armada/configuration" "github.com/armadaproject/armada/internal/common/armadaerrors" armadamaps "github.com/armadaproject/armada/internal/common/maps" + armadaslices "github.com/armadaproject/armada/internal/common/slices" "github.com/armadaproject/armada/internal/common/types" "github.com/armadaproject/armada/internal/common/util" schedulerconfig "github.com/armadaproject/armada/internal/scheduler/configuration" @@ -199,6 +200,20 @@ func (nodeDb *NodeDb) CreateAndInsertWithJobDbJobsWithTxn(txn *memdb.Txn, jobs [ return nil } +// EvictedJobSchedulingContext represents an evicted job. +// NodeDb may track these to ensure preemptions are fair. +type EvictedJobSchedulingContext struct { + // Id of the evicted job. + JobId string + // Each evicted job is assigned a unique integer indicating the order in which it is re-scheduled. + // I.e., index establishes a global order among all evicted jobs. + // + // When choosing on which node to schedule a job that would prevent re-scheduling evicted jobs, + // nodeDb choses the node that would prevent re-scheduling jobs with as a large an index as possible. + Index int + JobSchedulingContext *schedulercontext.JobSchedulingContext +} + // NodeDb is the scheduler-internal system used to efficiently find nodes on which a pod could be scheduled. type NodeDb struct { // In-memory database storing *Node. @@ -215,12 +230,13 @@ type NodeDb struct { // Because the number of database indices scales linearly with the number of distinct priorities, // the efficiency of the NodeDb relies on the number of distinct priorities being small. priorityClasses map[string]types.PriorityClass - // Priorities, in increasing order, to try to schedule pods at. - // In particular, if a pod has priority class priority p, try to schedule that pod at priority - // prioritiesToTryAssigningAt[0], ..., prioritiesToTryAssigningAt[i], - // for all i such that prioritiesToTryAssigningAt[i] <= the priority of the pod. - // We do this to, when possible, avoid preempting running jobs. Includes evictedPriority. - prioritiesToTryAssigningAt []int32 + // Prioritiy class priorities in increasing order. + priorityClassPriorities []int32 + // Job priorities supported by the NodeDb. Composed of priority class priorities and nodeDb-internal priorities. + // In particular, if a job has priority class priority p, nodeDb tries to schedule that job at priority + // nodeDbPriorities[0], ..., nodeDbPriorities[i], + // for all i such that nodeDbPriorities[i] <= the priority of the job. + nodeDbPriorities []int32 // Resources, e.g., "cpu", "memory", and "nvidia.com/gpu", // for which indexes are created to enable efficient lookup. indexedResources []string @@ -266,6 +282,9 @@ type NodeDb struct { // Map from podRequirementsNotMetReason Sum64() to the string representation of that reason. // Used to avoid allocs. 
podRequirementsNotMetReasonStringCache map[uint64]string + + // If true, use experimental preemption strategy. + enableNewPreemptionStrategy bool } func NewNodeDb( @@ -275,13 +294,6 @@ func NewNodeDb( indexedTaints, indexedNodeLabels []string, ) (*NodeDb, error) { - allowedPriorities := map[int32]bool{evictedPriority: true} - for _, pc := range priorityClasses { - allowedPriorities[pc.Priority] = true - } - prioritiesToTryAssigningAt := maps.Keys(allowedPriorities) - slices.Sort(prioritiesToTryAssigningAt) - if len(indexedResources) == 0 { return nil, errors.WithStack(&armadaerrors.ErrInvalidArgument{ Name: "indexedResources", @@ -290,7 +302,14 @@ func NewNodeDb( }) } indexedResourceNames := util.Map(indexedResources, func(v configuration.IndexedResource) string { return v.Name }) - schema, indexNameByPriority := nodeDbSchema(prioritiesToTryAssigningAt, indexedResourceNames) + allowedPriorities := make(map[int32]bool, len(priorityClasses)) + for _, pc := range priorityClasses { + allowedPriorities[pc.Priority] = true + } + priorityClassPriorities := maps.Keys(allowedPriorities) + slices.Sort(priorityClassPriorities) + nodeDbPriorities := armadaslices.Concatenate([]int32{evictedPriority}, priorityClassPriorities) + schema, indexNameByPriority := nodeDbSchema(nodeDbPriorities, indexedResourceNames) db, err := memdb.NewMemDB(schema) if err != nil { return nil, errors.WithStack(err) @@ -323,11 +342,12 @@ func NewNodeDb( } return &NodeDb{ - priorityClasses: priorityClasses, - prioritiesToTryAssigningAt: prioritiesToTryAssigningAt, - maxExtraNodesToConsider: maxExtraNodesToConsider, - indexedResources: indexedResourceNames, - indexedResourcesSet: mapFromSlice(indexedResourceNames), + priorityClasses: priorityClasses, + priorityClassPriorities: priorityClassPriorities, + nodeDbPriorities: nodeDbPriorities, + maxExtraNodesToConsider: maxExtraNodesToConsider, + indexedResources: indexedResourceNames, + indexedResourcesSet: mapFromSlice(indexedResourceNames), indexedResourceResolutionMillis: util.Map( indexedResources, func(v configuration.IndexedResource) int64 { return v.Resolution.MilliValue() }, @@ -345,6 +365,28 @@ func NewNodeDb( }, nil } +// Reset clears out data specific to one scheduling round to prepare for a new scheduling round. +// Only necessary when nodeDb.enableNewPreemptionStrategy is true. +func (nodeDb *NodeDb) Reset() error { + txn := nodeDb.Txn(true) + defer txn.Abort() + it, err := txn.LowerBound("evictedJobs", "id", "") + if err != nil { + return errors.WithStack(err) + } + for obj := it.Next(); obj != nil; obj = it.Next() { + if err := txn.Delete("evictedJobs", obj); err != nil { + return errors.WithStack(err) + } + } + txn.Commit() + return nil +} + +func (nodeDb *NodeDb) EnableNewPreemptionStrategy() { + nodeDb.enableNewPreemptionStrategy = true +} + func (nodeDb *NodeDb) String() string { var sb strings.Builder w := tabwriter.NewWriter(&sb, 1, 1, 1, ' ', 0) @@ -472,6 +514,7 @@ func (nodeDb *NodeDb) ScheduleManyWithTxn(txn *memdb.Txn, jctxs []*schedulercont if err != nil { return false, err } + // If we found a node for this pod, bind it and continue to the next pod. if node != nil { if node, err := bindJobToNode(nodeDb.priorityClasses, jctx.Job, node); err != nil { @@ -484,10 +527,27 @@ func (nodeDb *NodeDb) ScheduleManyWithTxn(txn *memdb.Txn, jctxs []*schedulercont } else { return false, nil } + + // Once a job is scheduled, it should no longer be considered for preemption. 
+ if nodeDb.enableNewPreemptionStrategy { + if err := deleteEvictedJobSchedulingContextIfExistsWithTxn(txn, jctx.JobId); err != nil { + return false, err + } + } } return true, nil } +func deleteEvictedJobSchedulingContextIfExistsWithTxn(txn *memdb.Txn, jobId string) error { + if err := txn.Delete("evictedJobs", &EvictedJobSchedulingContext{JobId: jobId}); err == memdb.ErrNotFound { + return nil + } else if err != nil { + return errors.WithStack(err) + } else { + return nil + } +} + // SelectNodeForJobWithTxn selects a node on which the job can be scheduled. func (nodeDb *NodeDb) SelectNodeForJobWithTxn(txn *memdb.Txn, jctx *schedulercontext.JobSchedulingContext) (*Node, error) { req := jctx.PodRequirements @@ -500,9 +560,10 @@ func (nodeDb *NodeDb) SelectNodeForJobWithTxn(txn *memdb.Txn, jctx *schedulercon // Create a pctx to be returned to the caller. pctx := &schedulercontext.PodSchedulingContext{ - Created: time.Now(), - MatchingNodeTypes: matchingNodeTypes, - NumNodes: nodeDb.numNodes, + Created: time.Now(), + MatchingNodeTypes: matchingNodeTypes, + NumNodes: nodeDb.numNodes, + // TODO: This clone looks unnecessary. NumExcludedNodesByReason: maps.Clone(numExcludedNodesByReason), } jctx.PodSchedulingContext = pctx @@ -522,8 +583,7 @@ func (nodeDb *NodeDb) SelectNodeForJobWithTxn(txn *memdb.Txn, jctx *schedulercon } }() - // If the targetNodeIdAnnocation is set, consider only that node, - // and schedule onto that node even if it requires preempting other jobs. + // If the targetNodeIdAnnocation is set, consider only that node. if nodeId, ok := req.NodeSelector[schedulerconfig.NodeIdLabel]; ok { if it, err := txn.Get("nodes", "id", nodeId); err != nil { return nil, errors.WithStack(err) @@ -536,9 +596,81 @@ func (nodeDb *NodeDb) SelectNodeForJobWithTxn(txn *memdb.Txn, jctx *schedulercon } } - // Try to schedule this pod normally. - // To avoid preempting running jobs, try scheduling at each available priority from lowest to highest. - for _, priority := range nodeDb.prioritiesToTryAssigningAt { + // Try scheduling at evictedPriority. If this succeeds, no preemption is necessary. + pctx.NumExcludedNodesByReason = maps.Clone(numExcludedNodesByReason) + if node, err := nodeDb.selectNodeForPodAtPriority(txn, pctx, evictedPriority, jctx.PodRequirements); err != nil { + return nil, err + } else if err := assertPodSchedulingContextNode(pctx, node); err != nil { + return nil, err + } else if node != nil { + return node, nil + } + + // Try scheduling at the job priority. If this fails, scheduling is impossible and we return. + // This is an optimisation to avoid looking for preemption targets for unschedulable jobs. + pctx.NumExcludedNodesByReason = maps.Clone(numExcludedNodesByReason) + if node, err := nodeDb.selectNodeForPodAtPriority(txn, pctx, jctx.PodRequirements.Priority, jctx.PodRequirements); err != nil { + return nil, err + } else if err := assertPodSchedulingContextNode(pctx, node); err != nil { + return nil, err + } else if node == nil { + return nil, nil + } + pctx.NodeId = "" + pctx.Score = 0 + pctx.ScheduledAtPriority = 0 + + // Schedule by preventing evicted jobs from being re-scheduled. + // This method respect fairness by preventing from re-scheduling jobs that appear as far back in the total order as possible. 
+ if nodeDb.enableNewPreemptionStrategy { + if node, err := nodeDb.selectNodeForJobWithFairPreemption(txn, jctx); err != nil { + return nil, err + } else if err := assertPodSchedulingContextNode(pctx, node); err != nil { + return nil, err + } else if node != nil { + return node, nil + } + } + pctx.NodeId = "" + pctx.Score = 0 + pctx.ScheduledAtPriority = 0 + + // Schedule by kicking off jobs currently bound to a node. + // This method does not respect fairness when choosing on which node to schedule the job. + if node, err := nodeDb.selectNodeForJobWithUrgencyPreemption(txn, jctx); err != nil { + return nil, err + } else if err := assertPodSchedulingContextNode(pctx, node); err != nil { + return nil, err + } else if node != nil { + return node, nil + } + + return nil, nil +} + +func assertPodSchedulingContextNode(pctx *schedulercontext.PodSchedulingContext, node *Node) error { + if node != nil { + if pctx.NodeId == "" { + return errors.New("pctx.NodeId not set") + } + if node.Id != pctx.NodeId { + return errors.Errorf("pctx.NodeId %s does not match node.Id %s", pctx.NodeId, node.Id) + } + } else if pctx.NodeId != "" { + return errors.New("pctx.NodeId is set, but no node was returned") + } + return nil +} + +func (nodeDb *NodeDb) selectNodeForJobWithUrgencyPreemption( + txn *memdb.Txn, + jctx *schedulercontext.JobSchedulingContext, +) (*Node, error) { + pctx := jctx.PodSchedulingContext + req := jctx.PodRequirements + numExcludedNodesByReason := pctx.NumExcludedNodesByReason + // TODO: This doesn't need to include the evictedPriority now. + for _, priority := range nodeDb.priorityClassPriorities { if priority > req.Priority { break } @@ -548,22 +680,13 @@ func (nodeDb *NodeDb) SelectNodeForJobWithTxn(txn *memdb.Txn, jctx *schedulercon pctx.NumExcludedNodesByReason = maps.Clone(numExcludedNodesByReason) // Try to find a node at this priority. - node, err := nodeDb.selectNodeForPodAtPriority(txn, pctx, priority, req) - if err != nil { + if node, err := nodeDb.selectNodeForPodAtPriority(txn, pctx, priority, req); err != nil { return nil, err - } - if node != nil { - if pctx.NodeId == "" { - return nil, errors.New("pctx.NodeId not set") - } - if node.Id != pctx.NodeId { - return nil, errors.New("pctx.NodeId does not match that of the returned node") - } + } else if err := assertPodSchedulingContextNode(pctx, node); err != nil { + return nil, err + } else if node != nil { return node, nil } - if pctx.NodeId != "" { - return nil, errors.New("pctx.NodeId is set, but no node was returned") - } } return nil, nil } @@ -667,6 +790,72 @@ func (nodeDb *NodeDb) selectNodeForPodWithIt( return selectedNode, nil } +// selectNodeForJobWithFairPreemption returns a node onto which the provided job could be scheduled, or nil if none can be found. +// Specifically, it returns the node for which scheduling would result in the most "fair" preemptions. +// +// It does this by considering all evicted jobs in the reverse order they would be scheduled in and preventing +// from being re-scheduled the jobs that would be scheduled last. 
+func (nodeDb *NodeDb) selectNodeForJobWithFairPreemption(txn *memdb.Txn, jctx *schedulercontext.JobSchedulingContext) (*Node, error) { + pctx := jctx.PodSchedulingContext + var selectedNode *Node + nodesById := make(map[string]*Node) + evictedJobSchedulingContextsByNodeId := make(map[string][]*EvictedJobSchedulingContext) + it, err := txn.ReverseLowerBound("evictedJobs", "index", math.MaxInt) + if err != nil { + return nil, errors.WithStack(err) + } + for obj := it.Next(); obj != nil && selectedNode == nil; obj = it.Next() { + evictedJobSchedulingContext := obj.(*EvictedJobSchedulingContext) + evictedJctx := evictedJobSchedulingContext.JobSchedulingContext + evictedReq := evictedJctx.PodRequirements + + nodeId, ok := evictedReq.NodeSelector[schedulerconfig.NodeIdLabel] + if !ok { + return nil, errors.Errorf("evicted job %s does not have a nodeIdLabel", evictedJctx.JobId) + } + node, ok := nodesById[nodeId] + if !ok { + node, err = nodeDb.GetNodeWithTxn(txn, nodeId) + if err != nil { + return nil, errors.WithStack(err) + } + } + node, err = UnbindJobFromNode(nodeDb.priorityClasses, evictedJctx.Job, node) + if err != nil { + return nil, err + } + nodesById[nodeId] = node + evictedJobSchedulingContextsByNodeId[nodeId] = append(evictedJobSchedulingContextsByNodeId[nodeId], evictedJobSchedulingContext) + + matches, _, reason, err := schedulerobjects.PodRequirementsMet( + node.Taints, + node.Labels, + node.TotalResources, + node.AllocatableByPriority[evictedPriority], + jctx.PodRequirements, + ) + if err != nil { + return nil, err + } + if matches { + selectedNode = node + } else { + s := nodeDb.stringFromPodRequirementsNotMetReason(reason) + pctx.NumExcludedNodesByReason[s] += 1 + } + } + if selectedNode != nil { + pctx.NodeId = selectedNode.Id + pctx.ScheduledAtPriority = jctx.PodRequirements.Priority + for _, evictedJobSchedulingContext := range evictedJobSchedulingContextsByNodeId[selectedNode.Id] { + if err := txn.Delete("evictedJobs", evictedJobSchedulingContext); err != nil { + return nil, errors.WithStack(err) + } + } + } + return selectedNode, nil +} + // bindJobToNode returns a copy of node with job bound to it. func bindJobToNode(priorityClasses map[string]types.PriorityClass, job interfaces.LegacySchedulerJob, node *Node) (*Node, error) { node = node.UnsafeCopy() @@ -801,7 +990,8 @@ func unbindJobFromNodeInPlace(priorityClasses map[string]types.PriorityClass, jo delete(node.EvictedJobRunIds, jobId) if _, ok := node.AllocatedByJobId[jobId]; !ok { - return errors.Errorf("job %s has no resources allocated on node %s", jobId, node.Id) + // Job already unbound; nothing more to do. 
+ return nil } else { delete(node.AllocatedByJobId, jobId) } @@ -878,8 +1068,8 @@ func (nodeDb *NodeDb) Upsert(node *Node) error { } func (nodeDb *NodeDb) UpsertWithTxn(txn *memdb.Txn, node *Node) error { - keys := make([][]byte, len(nodeDb.prioritiesToTryAssigningAt)) - for i, p := range nodeDb.prioritiesToTryAssigningAt { + keys := make([][]byte, len(nodeDb.nodeDbPriorities)) + for i, p := range nodeDb.nodeDbPriorities { keys[i] = nodeDb.nodeDbKey(keys[i], node.NodeTypeId, node.AllocatableByPriority[p]) } node.Keys = keys @@ -902,7 +1092,7 @@ func (nodeDb *NodeDb) ClearAllocated() error { for node := it.NextNode(); node != nil; node = it.NextNode() { node = node.UnsafeCopy() node.AllocatableByPriority = schedulerobjects.NewAllocatableByPriorityAndResourceType( - nodeDb.prioritiesToTryAssigningAt, + nodeDb.nodeDbPriorities, node.TotalResources, ) newNodes = append(newNodes, node) @@ -914,8 +1104,31 @@ func (nodeDb *NodeDb) ClearAllocated() error { return nil } +func (nodeDb *NodeDb) AddEvictedJobSchedulingContextWithTxn(txn *memdb.Txn, index int, jctx *schedulercontext.JobSchedulingContext) error { + if it, err := txn.Get("evictedJobs", "id", jctx.JobId); err != nil { + return errors.WithStack(err) + } else if obj := it.Next(); obj != nil { + return errors.Errorf("tried to insert evicted job %s with duplicate index %d", jctx.JobId, index) + } + if err := txn.Insert("evictedJobs", &EvictedJobSchedulingContext{JobId: jctx.JobId, Index: index, JobSchedulingContext: jctx}); err != nil { + return errors.WithStack(err) + } + return nil +} + func nodeDbSchema(priorities []int32, resources []string) (*memdb.DBSchema, map[int32]string) { - indexes := make(map[string]*memdb.IndexSchema) + nodesTable, indexNameByPriority := nodesTableSchema(priorities, resources) + evictionsTable := evictionsTableSchema() + return &memdb.DBSchema{ + Tables: map[string]*memdb.TableSchema{ + nodesTable.Name: nodesTable, + evictionsTable.Name: evictionsTable, + }, + }, indexNameByPriority +} + +func nodesTableSchema(priorities []int32, resources []string) (*memdb.TableSchema, map[int32]string) { + indexes := make(map[string]*memdb.IndexSchema, len(priorities)+1) indexes["id"] = &memdb.IndexSchema{ Name: "id", Unique: true, @@ -931,14 +1144,28 @@ func nodeDbSchema(priorities []int32, resources []string) (*memdb.DBSchema, map[ Indexer: &NodeIndex{KeyIndex: i}, } } - return &memdb.DBSchema{ - Tables: map[string]*memdb.TableSchema{ - "nodes": { - Name: "nodes", - Indexes: indexes, + return &memdb.TableSchema{ + Name: "nodes", + Indexes: indexes, + }, indexNameByPriority +} + +func evictionsTableSchema() *memdb.TableSchema { + return &memdb.TableSchema{ + Name: "evictedJobs", + Indexes: map[string]*memdb.IndexSchema{ + "id": { + Name: "id", + Unique: true, + Indexer: &memdb.StringFieldIndex{Field: "JobId"}, + }, + "index": { + Name: "index", + Unique: true, + Indexer: &memdb.IntFieldIndex{Field: "Index"}, }, }, - }, indexNameByPriority + } } func nodeIndexName(keyIndex int) string { diff --git a/internal/scheduler/nodedb/nodedb_test.go b/internal/scheduler/nodedb/nodedb_test.go index 78e0198e9d8..50f7f9c5a9c 100644 --- a/internal/scheduler/nodedb/nodedb_test.go +++ b/internal/scheduler/nodedb/nodedb_test.go @@ -152,7 +152,7 @@ func TestNodeBindingEvictionUnbinding(t *testing.T) { require.Error(t, err) _, err = UnbindJobFromNode(testfixtures.TestPriorityClasses, job, entry) - require.Error(t, err) + require.NoError(t, err) _, err = bindJobToNode(testfixtures.TestPriorityClasses, job, boundNode) require.Error(t, err) diff 
--git a/internal/scheduler/nodedb/nodeiteration_test.go b/internal/scheduler/nodedb/nodeiteration_test.go index 215f1719c40..54447741808 100644 --- a/internal/scheduler/nodedb/nodeiteration_test.go +++ b/internal/scheduler/nodedb/nodeiteration_test.go @@ -432,7 +432,7 @@ func TestNodeTypeIterator(t *testing.T) { indexedResourceRequests[i] = tc.resourceRequests.Get(t) } keyIndex := -1 - for i, p := range nodeDb.prioritiesToTryAssigningAt { + for i, p := range nodeDb.nodeDbPriorities { if p == tc.priority { keyIndex = i } diff --git a/internal/scheduler/preempting_queue_scheduler.go b/internal/scheduler/preempting_queue_scheduler.go index 8e50409e449..ba753af0368 100644 --- a/internal/scheduler/preempting_queue_scheduler.go +++ b/internal/scheduler/preempting_queue_scheduler.go @@ -20,8 +20,11 @@ import ( schedulerconfig "github.com/armadaproject/armada/internal/scheduler/configuration" schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints" schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" + "github.com/armadaproject/armada/internal/scheduler/fairness" "github.com/armadaproject/armada/internal/scheduler/interfaces" + "github.com/armadaproject/armada/internal/scheduler/jobdb" "github.com/armadaproject/armada/internal/scheduler/nodedb" + "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" ) // PreemptingQueueScheduler is a scheduler that makes a unified decisions on which jobs to preempt and schedule. @@ -46,6 +49,8 @@ type PreemptingQueueScheduler struct { skipUnsuccessfulSchedulingKeyCheck bool // If true, asserts that the nodeDb state is consistent with expected changes. enableAssertions bool + // If true, a newer preemption strategy is used. + enableNewPreemptionStrategy bool } func NewPreemptingQueueScheduler( @@ -95,39 +100,23 @@ func (sch *PreemptingQueueScheduler) SkipUnsuccessfulSchedulingKeyCheck() { sch.skipUnsuccessfulSchedulingKeyCheck = true } +func (sch *PreemptingQueueScheduler) EnableNewPreemptionStrategy() { + sch.enableNewPreemptionStrategy = true + sch.nodeDb.EnableNewPreemptionStrategy() +} + // Schedule // - preempts jobs belonging to queues with total allocation above their fair share and // - schedules new jobs belonging to queues with total allocation less than their fair share. func (sch *PreemptingQueueScheduler) Schedule(ctx context.Context) (*SchedulerResult, error) { log := ctxlogrus.Extract(ctx) log = log.WithField("service", "PreemptingQueueScheduler") - if sch.schedulingContext.TotalResources.AsWeightedMillis(sch.schedulingContext.ResourceScarcity) == 0 { - // This refers to resources available across all clusters, i.e., - // it may include resources not currently considered for scheduling. - log.Infof( - "no resources with non-zero weight available for scheduling on any cluster: resource scarcity %v, total resources %v", - sch.schedulingContext.ResourceScarcity, sch.schedulingContext.TotalResources, - ) - return &SchedulerResult{}, nil - } - if rl := sch.nodeDb.TotalResources(); rl.AsWeightedMillis(sch.schedulingContext.ResourceScarcity) == 0 { - // This refers to the resources currently considered for scheduling. 
- log.Infof( - "no resources with non-zero weight available for scheduling in NodeDb: resource scarcity %v, total resources %v", - sch.schedulingContext.ResourceScarcity, sch.nodeDb.TotalResources(), - ) - return &SchedulerResult{}, nil - } defer func() { sch.schedulingContext.Finished = time.Now() }() preemptedJobsById := make(map[string]interfaces.LegacySchedulerJob) scheduledJobsById := make(map[string]interfaces.LegacySchedulerJob) - log.Infof( - "starting scheduling with total resources %s", - sch.schedulingContext.TotalResources.CompactString(), - ) // NodeDb snapshot prior to making any changes. // We compare against this snapshot after scheduling to detect changes. @@ -157,7 +146,7 @@ func (sch *PreemptingQueueScheduler) Schedule(ctx context.Context) (*SchedulerRe } if qctx, ok := sch.schedulingContext.QueueSchedulingContexts[job.GetQueue()]; ok { fairShare := qctx.Weight / sch.schedulingContext.WeightSum - actualShare := qctx.TotalCostForQueue() / totalCost + actualShare := sch.schedulingContext.FairnessCostProvider.CostFromQueue(qctx) / totalCost fractionOfFairShare := actualShare / fairShare if fractionOfFairShare <= sch.protectedFractionOfFairShare { return false @@ -286,9 +275,10 @@ func (sch *PreemptingQueueScheduler) Schedule(ctx context.Context) (*SchedulerRe } } return &SchedulerResult{ - PreemptedJobs: preemptedJobs, - ScheduledJobs: scheduledJobs, - NodeIdByJobId: sch.nodeIdByJobId, + PreemptedJobs: preemptedJobs, + ScheduledJobs: scheduledJobs, + NodeIdByJobId: sch.nodeIdByJobId, + SchedulingContexts: []*schedulercontext.SchedulingContext{sch.schedulingContext}, }, nil } @@ -296,7 +286,6 @@ func (sch *PreemptingQueueScheduler) evict(ctx context.Context, evictor *Evictor if evictor == nil { return &EvictorResult{}, NewInMemoryJobRepository(sch.schedulingContext.PriorityClasses), nil } - log := ctxlogrus.Extract(ctx) txn := sch.nodeDb.Txn(true) defer txn.Abort() @@ -342,12 +331,18 @@ func (sch *PreemptingQueueScheduler) evict(ctx context.Context, evictor *Evictor if err := sch.evictionAssertions(result.EvictedJobsById, result.AffectedNodesById); err != nil { return nil, nil, err } - if s := JobsSummary(evictedJobs); s != "" { - log.Infof("evicted %d jobs on nodes %v; %s", len(evictedJobs), maps.Keys(result.AffectedNodesById), s) - } inMemoryJobRepo := NewInMemoryJobRepository(sch.schedulingContext.PriorityClasses) inMemoryJobRepo.EnqueueMany(evictedJobs) txn.Commit() + + if sch.enableNewPreemptionStrategy { + if err := sch.nodeDb.Reset(); err != nil { + return nil, nil, err + } + if err := addEvictedJobsToNodeDb(ctx, sch.schedulingContext, sch.nodeDb, inMemoryJobRepo); err != nil { + return nil, nil, err + } + } return result, inMemoryJobRepo, nil } @@ -485,8 +480,79 @@ func (sch *PreemptingQueueScheduler) evictionAssertions(evictedJobsById map[stri return nil } +type MinimalQueueRepository struct { + queues map[string]MinimalQueue +} + +func (qr *MinimalQueueRepository) GetQueue(name string) (fairness.Queue, bool) { + queue, ok := qr.queues[name] + return queue, ok +} + +func NewMinimalQueueRepositoryFromSchedulingContext(sctx *schedulercontext.SchedulingContext) *MinimalQueueRepository { + queues := make(map[string]MinimalQueue, len(sctx.QueueSchedulingContexts)) + for name, qctx := range sctx.QueueSchedulingContexts { + queues[name] = MinimalQueue{allocation: qctx.Allocated.DeepCopy(), weight: qctx.Weight} + } + return &MinimalQueueRepository{queues: queues} +} + +type MinimalQueue struct { + allocation schedulerobjects.ResourceList + weight float64 +} + +func (q 
MinimalQueue) GetAllocation() schedulerobjects.ResourceList { + return q.allocation +} + +func (q MinimalQueue) GetWeight() float64 { + return q.weight +} + +// addEvictedJobsToNodeDb adds evicted jobs to the NodeDb. +// Needed to enable the nodeDb accounting for these when preempting. +func addEvictedJobsToNodeDb(ctx context.Context, sctx *schedulercontext.SchedulingContext, nodeDb *nodedb.NodeDb, inMemoryJobRepo *InMemoryJobRepository) error { + gangItByQueue := make(map[string]*QueuedGangIterator) + for _, qctx := range sctx.QueueSchedulingContexts { + jobIt, err := inMemoryJobRepo.GetJobIterator(ctx, qctx.Queue) + if err != nil { + return err + } + gangItByQueue[qctx.Queue] = NewQueuedGangIterator(sctx, jobIt, 0) + } + qr := NewMinimalQueueRepositoryFromSchedulingContext(sctx) + candidateGangIterator, err := NewCandidateGangIterator(qr, sctx.FairnessCostProvider, gangItByQueue) + if err != nil { + return err + } + txn := nodeDb.Txn(true) + defer txn.Abort() + i := 0 + for { + if gctx, err := candidateGangIterator.Peek(); err != nil { + return err + } else if gctx == nil { + break + } else { + for _, jctx := range gctx.JobSchedulingContexts { + if err := nodeDb.AddEvictedJobSchedulingContextWithTxn(txn, i, jctx); err != nil { + return err + } + i++ + } + q := qr.queues[gctx.Queue] + q.allocation.Add(gctx.TotalResourceRequests) + } + if err := candidateGangIterator.Clear(); err != nil { + return err + } + } + txn.Commit() + return nil +} + func (sch *PreemptingQueueScheduler) schedule(ctx context.Context, inMemoryJobRepo *InMemoryJobRepository, jobRepo JobRepository) (*SchedulerResult, error) { - log := ctxlogrus.Extract(ctx) jobIteratorByQueue := make(map[string]JobIterator) for _, qctx := range sch.schedulingContext.QueueSchedulingContexts { evictedIt, err := inMemoryJobRepo.GetJobIterator(ctx, qctx.Queue) @@ -525,9 +591,6 @@ func (sch *PreemptingQueueScheduler) schedule(ctx context.Context, inMemoryJobRe if err := sch.updateGangAccounting(nil, result.ScheduledJobs); err != nil { return nil, err } - if s := JobsSummary(result.ScheduledJobs); s != "" { - log.Infof("re-scheduled %d jobs; %s", len(result.ScheduledJobs), s) - } return result, nil } @@ -807,6 +870,13 @@ func (evi *Evictor) Evict(ctx context.Context, it nodedb.NodeIterator) (*Evictor if err != nil { return nil, err } + + for i, evictedJob := range evictedJobs { + if dbJob, ok := evictedJob.(*jobdb.Job); ok { + evictedJobs[i] = dbJob.DeepCopy() + } + } + for _, job := range evictedJobs { evictedJobsById[job.GetId()] = job nodeIdByJobId[job.GetId()] = node.Id diff --git a/internal/scheduler/preempting_queue_scheduler_test.go b/internal/scheduler/preempting_queue_scheduler_test.go index 59a82e899de..923d77296bc 100644 --- a/internal/scheduler/preempting_queue_scheduler_test.go +++ b/internal/scheduler/preempting_queue_scheduler_test.go @@ -19,6 +19,7 @@ import ( armadaslices "github.com/armadaproject/armada/internal/common/slices" schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints" schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" + "github.com/armadaproject/armada/internal/scheduler/fairness" "github.com/armadaproject/armada/internal/scheduler/interfaces" "github.com/armadaproject/armada/internal/scheduler/jobdb" "github.com/armadaproject/armada/internal/scheduler/nodedb" @@ -1355,17 +1356,19 @@ func TestPreemptingQueueScheduler(t *testing.T) { tc.TotalResources = nodeDb.TotalResources() } + fairnessCostProvider, err := fairness.NewDominantResourceFairness( + 
nodeDb.TotalResources(), + tc.SchedulingConfig.DominantResourceFairnessResourcesToConsider, + ) + require.NoError(t, err) sctx := schedulercontext.NewSchedulingContext( "executor", "pool", tc.SchedulingConfig.Preemption.PriorityClasses, tc.SchedulingConfig.Preemption.DefaultPriorityClass, - tc.SchedulingConfig.ResourceScarcity, + fairnessCostProvider, tc.TotalResources, ) - if tc.SchedulingConfig.FairnessModel == configuration.DominantResourceFairness { - sctx.EnableDominantResourceFairness(tc.SchedulingConfig.DominantResourceFairnessResourcesToConsider) - } for queue, priorityFactor := range tc.PriorityFactorByQueue { weight := 1 / priorityFactor err := sctx.AddQueueSchedulingContext(queue, weight, allocatedByQueueAndPriorityClass[queue]) @@ -1390,6 +1393,9 @@ func TestPreemptingQueueScheduler(t *testing.T) { gangIdByJobId, ) sch.EnableAssertions() + if tc.SchedulingConfig.EnableNewPreemptionStrategy { + sch.EnableNewPreemptionStrategy() + } result, err := sch.Schedule(ctxlogrus.ToContext(context.Background(), log)) require.NoError(t, err) jobIdsByGangId = sch.jobIdsByGangId @@ -1628,12 +1634,17 @@ func BenchmarkPreemptingQueueScheduler(b *testing.B) { } jobRepo.EnqueueMany(jobs) + fairnessCostProvider, err := fairness.NewDominantResourceFairness( + nodeDb.TotalResources(), + tc.SchedulingConfig.DominantResourceFairnessResourcesToConsider, + ) + require.NoError(b, err) sctx := schedulercontext.NewSchedulingContext( "executor", "pool", tc.SchedulingConfig.Preemption.PriorityClasses, tc.SchedulingConfig.Preemption.DefaultPriorityClass, - tc.SchedulingConfig.ResourceScarcity, + fairnessCostProvider, nodeDb.TotalResources(), ) for queue, priorityFactor := range priorityFactorByQueue { @@ -1694,7 +1705,7 @@ func BenchmarkPreemptingQueueScheduler(b *testing.B) { "pool", tc.SchedulingConfig.Preemption.PriorityClasses, tc.SchedulingConfig.Preemption.DefaultPriorityClass, - tc.SchedulingConfig.ResourceScarcity, + fairnessCostProvider, nodeDb.TotalResources(), ) for queue, priorityFactor := range priorityFactorByQueue { diff --git a/internal/scheduler/queue_scheduler.go b/internal/scheduler/queue_scheduler.go index 956232b38f4..ba6c223f49a 100644 --- a/internal/scheduler/queue_scheduler.go +++ b/internal/scheduler/queue_scheduler.go @@ -6,13 +6,13 @@ import ( "reflect" "time" - "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus" "github.com/pkg/errors" "github.com/sirupsen/logrus" "github.com/armadaproject/armada/internal/common/logging" schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints" schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" + "github.com/armadaproject/armada/internal/scheduler/fairness" "github.com/armadaproject/armada/internal/scheduler/interfaces" "github.com/armadaproject/armada/internal/scheduler/nodedb" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" @@ -45,7 +45,7 @@ func NewQueueScheduler( for queue, it := range jobIteratorByQueue { gangIteratorsByQueue[queue] = NewQueuedGangIterator(sctx, it, constraints.MaxQueueLookback) } - candidateGangIterator, err := NewCandidateGangIterator(sctx, gangIteratorsByQueue) + candidateGangIterator, err := NewCandidateGangIterator(sctx, sctx.FairnessCostProvider, gangIteratorsByQueue) if err != nil { return nil, err } @@ -61,24 +61,6 @@ func (sch *QueueScheduler) SkipUnsuccessfulSchedulingKeyCheck() { } func (sch *QueueScheduler) Schedule(ctx context.Context) (*SchedulerResult, error) { - log := ctxlogrus.Extract(ctx) - if 
sch.schedulingContext.TotalResources.AsWeightedMillis(sch.schedulingContext.ResourceScarcity) == 0 { - // This refers to resources available across all clusters, i.e., - // it may include resources not currently considered for scheduling. - log.Infof( - "no resources with non-zero weight available for scheduling on any cluster: resource scarcity %v, total resources %v", - sch.schedulingContext.ResourceScarcity, sch.schedulingContext.TotalResources, - ) - return &SchedulerResult{}, nil - } - if rl := sch.gangScheduler.nodeDb.TotalResources(); rl.AsWeightedMillis(sch.schedulingContext.ResourceScarcity) == 0 { - // This refers to the resources currently considered for scheduling. - log.Infof( - "no resources with non-zero weight available for scheduling in NodeDb: resource scarcity %v, total resources %v", - sch.schedulingContext.ResourceScarcity, sch.gangScheduler.nodeDb.TotalResources(), - ) - return &SchedulerResult{}, nil - } nodeIdByJobId := make(map[string]string) scheduledJobs := make([]interfaces.LegacySchedulerJob, 0) for { @@ -134,9 +116,10 @@ func (sch *QueueScheduler) Schedule(ctx context.Context) (*SchedulerResult, erro return nil, errors.Errorf("only %d out of %d jobs mapped to a node", len(nodeIdByJobId), len(scheduledJobs)) } return &SchedulerResult{ - PreemptedJobs: nil, - ScheduledJobs: scheduledJobs, - NodeIdByJobId: nodeIdByJobId, + PreemptedJobs: nil, + ScheduledJobs: scheduledJobs, + NodeIdByJobId: nodeIdByJobId, + SchedulingContexts: []*schedulercontext.SchedulingContext{sch.schedulingContext}, }, nil } @@ -271,7 +254,8 @@ func (it *QueuedGangIterator) hitLookbackLimit() bool { // Specifically, it yields the next gang in the queue with smallest fraction of its fair share, // where the fraction of fair share computation includes the yielded gang. type CandidateGangIterator struct { - SchedulingContext *schedulercontext.SchedulingContext + queueProvier fairness.QueueRepository + fairnessCostProvider fairness.FairnessCostProvider // If true, this iterator only yields gangs where all jobs are evicted. onlyYieldEvicted bool // Reusable buffer to avoid allocations. 
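CandidateGangIterator now depends on the fairness package instead of reading costs off the SchedulingContext directly. The interface shapes below are inferred purely from the call sites in this diff (GetQueue, GetAllocation, GetWeight, CostFromQueue, CostFromAllocationAndWeight); the real definitions live in internal/scheduler/fairness and may differ, and the float64 return types are assumptions based on how the results are compared:

package fairnesssketch

import (
	schedulercontext "github.com/armadaproject/armada/internal/scheduler/context"
	"github.com/armadaproject/armada/internal/scheduler/schedulerobjects"
)

// Queue is the minimal view of a queue needed for cost computations
// (methods inferred from the GetAllocation/GetWeight call sites in this diff).
type Queue interface {
	GetAllocation() schedulerobjects.ResourceList
	GetWeight() float64
}

// QueueRepository resolves a queue by name, as MinimalQueueRepository does above.
type QueueRepository interface {
	GetQueue(name string) (Queue, bool)
}

// FairnessCostProvider turns an allocation (plus queue weight) into a scalar cost
// that CandidateGangIterator uses to order queues.
type FairnessCostProvider interface {
	CostFromQueue(qctx *schedulercontext.QueueSchedulingContext) float64
	CostFromAllocationAndWeight(allocation schedulerobjects.ResourceList, weight float64) float64
}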
@@ -282,13 +266,15 @@ type CandidateGangIterator struct { } func NewCandidateGangIterator( - sctx *schedulercontext.SchedulingContext, + queueProvier fairness.QueueRepository, + fairnessCostProvider fairness.FairnessCostProvider, iteratorsByQueue map[string]*QueuedGangIterator, ) (*CandidateGangIterator, error) { it := &CandidateGangIterator{ - SchedulingContext: sctx, - buffer: schedulerobjects.NewResourceListWithDefaultSize(), - pq: make(QueueCandidateGangIteratorPQ, 0, len(iteratorsByQueue)), + queueProvier: queueProvier, + fairnessCostProvider: fairnessCostProvider, + buffer: schedulerobjects.NewResourceListWithDefaultSize(), + pq: make(QueueCandidateGangIteratorPQ, 0, len(iteratorsByQueue)), } for queue, queueIt := range iteratorsByQueue { if _, err := it.updateAndPushPQItem(it.newPQItem(queue, queueIt)); err != nil { @@ -327,7 +313,7 @@ func (it *CandidateGangIterator) updateAndPushPQItem(item *QueueCandidateGangIte func (it *CandidateGangIterator) updatePQItem(item *QueueCandidateGangIteratorItem) error { item.gctx = nil - item.fractionOfFairShare = 0 + item.queueCost = 0 gctx, err := item.it.Peek() if err != nil { return err @@ -339,17 +325,24 @@ func (it *CandidateGangIterator) updatePQItem(item *QueueCandidateGangIteratorIt return errors.Errorf("mismatched queue %s and %s for gctx", gctx.Queue, item.queue) } item.gctx = gctx - item.fractionOfFairShare = it.fractionOfFairShareWithGctx(gctx) + cost, err := it.queueCostWithGctx(gctx) + if err != nil { + return err + } + item.queueCost = cost return nil } -// fractionOfFairShareWithGctx returns the fraction of its fair share this queue would have if the jobs in gctx were scheduled. -func (it *CandidateGangIterator) fractionOfFairShareWithGctx(gctx *schedulercontext.GangSchedulingContext) float64 { - qctx := it.SchedulingContext.QueueSchedulingContexts[gctx.Queue] +// queueCostWithGctx returns the cost associated with a queue if gctx were to be scheduled. +func (it *CandidateGangIterator) queueCostWithGctx(gctx *schedulercontext.GangSchedulingContext) (float64, error) { + queue, ok := it.queueProvier.GetQueue(gctx.Queue) + if !ok { + return 0, errors.Errorf("unknown queue %s", gctx.Queue) + } it.buffer.Zero() - it.buffer.Add(qctx.Allocated) + it.buffer.Add(queue.GetAllocation()) it.buffer.Add(gctx.TotalResourceRequests) - return qctx.TotalCostForQueueWithAllocation(it.buffer) + return it.fairnessCostProvider.CostFromAllocationAndWeight(it.buffer, queue.GetWeight()), nil } // Clear removes the first item in the iterator. @@ -390,9 +383,9 @@ type QueueCandidateGangIteratorItem struct { // Most recent value produced by the iterator. // Cached here to avoid repeating scheduling checks unnecessarily. gctx *schedulercontext.GangSchedulingContext - // Fraction of its fair share this queue would have - // if its next schedulable job were to be scheduled. - fractionOfFairShare float64 + // Cost associated with the queue if the topmost gang in the queue were to be scheduled. + // Used to order queues fairly. + queueCost float64 // The index of the item in the heap. // maintained by the heap.Interface methods. index int @@ -402,10 +395,10 @@ func (pq QueueCandidateGangIteratorPQ) Len() int { return len(pq) } func (pq QueueCandidateGangIteratorPQ) Less(i, j int) bool { // Tie-break by queue name. 
- if pq[i].fractionOfFairShare == pq[j].fractionOfFairShare { + if pq[i].queueCost == pq[j].queueCost { return pq[i].queue < pq[j].queue } - return pq[i].fractionOfFairShare < pq[j].fractionOfFairShare + return pq[i].queueCost < pq[j].queueCost } func (pq QueueCandidateGangIteratorPQ) Swap(i, j int) { diff --git a/internal/scheduler/queue_scheduler_test.go b/internal/scheduler/queue_scheduler_test.go index 84d42dd3e36..02ea3929aaa 100644 --- a/internal/scheduler/queue_scheduler_test.go +++ b/internal/scheduler/queue_scheduler_test.go @@ -16,6 +16,7 @@ import ( "github.com/armadaproject/armada/internal/common/util" schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints" schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" + "github.com/armadaproject/armada/internal/scheduler/fairness" "github.com/armadaproject/armada/internal/scheduler/interfaces" "github.com/armadaproject/armada/internal/scheduler/jobdb" "github.com/armadaproject/armada/internal/scheduler/nodedb" @@ -421,6 +422,17 @@ func TestQueueScheduler(t *testing.T) { PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{1}, }, + "job priority": { + SchedulingConfig: testfixtures.TestSchedulingConfig(), + Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + Jobs: armadaslices.Concatenate( + testfixtures.WithPriorityJobs(10, testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1)), + testfixtures.WithPriorityJobs(1, testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1)), + testfixtures.WithPriorityJobs(20, testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1)), + ), + PriorityFactorByQueue: map[string]float64{"A": 1}, + ExpectedScheduledIndices: []int{1}, + }, } for name, tc := range tests { t.Run(name, func(t *testing.T) { @@ -450,12 +462,17 @@ func TestQueueScheduler(t *testing.T) { jobRepo := NewInMemoryJobRepository(tc.SchedulingConfig.Preemption.PriorityClasses) jobRepo.EnqueueMany(legacySchedulerJobs) + fairnessCostProvider, err := fairness.NewDominantResourceFairness( + tc.TotalResources, + tc.SchedulingConfig.DominantResourceFairnessResourcesToConsider, + ) + require.NoError(t, err) sctx := schedulercontext.NewSchedulingContext( "executor", "pool", tc.SchedulingConfig.Preemption.PriorityClasses, tc.SchedulingConfig.Preemption.DefaultPriorityClass, - tc.SchedulingConfig.ResourceScarcity, + fairnessCostProvider, tc.TotalResources, ) for queue, priorityFactor := range tc.PriorityFactorByQueue { diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index c316d3aa3cb..fc1c629dc6f 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -13,6 +13,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/clock" + "github.com/armadaproject/armada/internal/armada/configuration" "github.com/armadaproject/armada/internal/common/logging" "github.com/armadaproject/armada/internal/common/stringinterner" "github.com/armadaproject/armada/internal/scheduler/database" @@ -72,6 +73,8 @@ type Scheduler struct { runsSerial int64 // Function that is called every time a cycle is completed. Useful for testing. onCycleCompleted func() + // metrics set for the scheduler. 
+ metrics *SchedulerMetrics } func NewScheduler( @@ -87,6 +90,7 @@ func NewScheduler( executorTimeout time.Duration, maxAttemptedRuns uint, nodeIdLabel string, + schedulerMetrics *SchedulerMetrics, ) (*Scheduler, error) { jobDb := jobdb.NewJobDb() return &Scheduler{ @@ -107,6 +111,7 @@ func NewScheduler( nodeIdLabel: nodeIdLabel, jobsSerial: -1, runsSerial: -1, + metrics: schedulerMetrics, }, nil } @@ -159,11 +164,25 @@ func (s *Scheduler) Run(ctx context.Context) error { // and we must invalidate the held leader token to trigger flushing Pulsar at the next cycle. // // TODO: Once the Pulsar client supports transactions, we can guarantee consistency even in case of errors. - if err := s.cycle(ctx, fullUpdate, leaderToken); err != nil { + + shouldSchedule := s.clock.Now().Sub(s.previousSchedulingRoundEnd) > s.schedulePeriod + + if err := s.cycle(ctx, fullUpdate, leaderToken, shouldSchedule); err != nil { logging.WithStacktrace(log, err).Error("scheduling cycle failure") leaderToken = InvalidLeaderToken() } - log.Infof("scheduling cycle completed in %s", s.clock.Since(start)) + + cycleTime := s.clock.Since(start) + + if shouldSchedule && leaderToken.leader { + // Only the leader token does real scheduling rounds. + s.metrics.ReportScheduleCycleTime(cycleTime) + log.Infof("scheduling cycle completed in %s", cycleTime) + } else { + s.metrics.ReportReconcileCycleTime(cycleTime) + log.Infof("reconciliation cycle completed in %s", cycleTime) + } + prevLeaderToken = leaderToken if s.onCycleCompleted != nil { s.onCycleCompleted() @@ -175,7 +194,7 @@ func (s *Scheduler) Run(ctx context.Context) error { // cycle is a single iteration of the main scheduling loop. // If updateAll is true, we generate events from all jobs in the jobDb. // Otherwise, we only generate events from jobs updated since the last cycle. -func (s *Scheduler) cycle(ctx context.Context, updateAll bool, leaderToken LeaderToken) error { +func (s *Scheduler) cycle(ctx context.Context, updateAll bool, leaderToken LeaderToken, shouldSchedule bool) error { log := ctxlogrus.Extract(ctx) log = log.WithField("function", "cycle") // Update job state. @@ -211,12 +230,21 @@ func (s *Scheduler) cycle(ctx context.Context, updateAll bool, leaderToken Leade events = append(events, expirationEvents...) // Schedule jobs. - if s.clock.Now().Sub(s.previousSchedulingRoundEnd) > s.schedulePeriod { + if shouldSchedule { overallSchedulerResult, err := s.schedulingAlgo.Schedule(ctx, txn, s.jobDb) if err != nil { return err } + // This check feels redundant. It feels like we shouldn't have got here without + // a leader token. + if leaderToken.leader { + // Report various metrics computed from the scheduling cycle. + // TODO: preemptible jobs, possibly other metrics + // TODO: Return this information and deal with metrics after the cycle? 
+ s.metrics.ReportSchedulerResult(overallSchedulerResult) + } + resultEvents, err := s.eventsFromSchedulerResult(txn, overallSchedulerResult) if err != nil { return err @@ -229,9 +257,11 @@ func (s *Scheduler) cycle(ctx context.Context, updateAll bool, leaderToken Leade isLeader := func() bool { return s.leaderController.ValidateToken(leaderToken) } + start := s.clock.Now() if err := s.publisher.PublishMessages(ctx, events, isLeader); err != nil { return err } + log.Infof("published %d events to pulsar in %s", len(events), s.clock.Since(start)) txn.Commit() return nil } @@ -241,11 +271,12 @@ func (s *Scheduler) syncState(ctx context.Context) ([]*jobdb.Job, error) { log := ctxlogrus.Extract(ctx) log = log.WithField("function", "syncState") + start := s.clock.Now() updatedJobs, updatedRuns, err := s.jobRepository.FetchJobUpdates(ctx, s.jobsSerial, s.runsSerial) if err != nil { return nil, err } - log.Infof("received %d updated jobs and %d updated job runs", len(updatedJobs), len(updatedRuns)) + log.Infof("received %d updated jobs and %d updated job runs in %s", len(updatedJobs), len(updatedRuns), s.clock.Since(start)) txn := s.jobDb.WriteTxn() defer txn.Abort() @@ -306,11 +337,11 @@ func (s *Scheduler) syncState(ctx context.Context) ([]*jobdb.Job, error) { } jobsToUpdate := maps.Values(jobsToUpdateById) - err = s.jobDb.BatchDelete(txn, jobsToDelete) + err = s.jobDb.Upsert(txn, jobsToUpdate) if err != nil { return nil, err } - err = s.jobDb.Upsert(txn, jobsToUpdate) + err = s.jobDb.BatchDelete(txn, jobsToDelete) if err != nil { return nil, err } @@ -360,8 +391,25 @@ func (s *Scheduler) addNodeAntiAffinitiesForAttemptedRunsIfSchedulable(job *jobd // eventsFromSchedulerResult generates necessary EventSequences from the provided SchedulerResult. func (s *Scheduler) eventsFromSchedulerResult(txn *jobdb.Txn, result *SchedulerResult) ([]*armadaevents.EventSequence, error) { - events := make([]*armadaevents.EventSequence, 0, len(result.PreemptedJobs)+len(result.ScheduledJobs)) - for _, job := range PreemptedJobsFromSchedulerResult[*jobdb.Job](result) { + return EventsFromSchedulerResult(result, s.clock.Now()) +} + +// EventsFromSchedulerResult generates necessary EventSequences from the provided SchedulerResult. 
+func EventsFromSchedulerResult(result *SchedulerResult, time time.Time) ([]*armadaevents.EventSequence, error) { + eventSequences := make([]*armadaevents.EventSequence, 0, len(result.PreemptedJobs)+len(result.ScheduledJobs)) + eventSequences, err := AppendEventSequencesFromPreemptedJobs(eventSequences, PreemptedJobsFromSchedulerResult[*jobdb.Job](result), time) + if err != nil { + return nil, err + } + eventSequences, err = AppendEventSequencesFromScheduledJobs(eventSequences, ScheduledJobsFromSchedulerResult[*jobdb.Job](result), time) + if err != nil { + return nil, err + } + return eventSequences, nil +} + +func AppendEventSequencesFromPreemptedJobs(eventSequences []*armadaevents.EventSequence, jobs []*jobdb.Job, time time.Time) ([]*armadaevents.EventSequence, error) { + for _, job := range jobs { jobId, err := armadaevents.ProtoUuidFromUlidString(job.Id()) if err != nil { return nil, err @@ -370,12 +418,12 @@ func (s *Scheduler) eventsFromSchedulerResult(txn *jobdb.Txn, result *SchedulerR if run == nil { return nil, errors.Errorf("attempting to generate preempted events for job %s with no associated runs", job.Id()) } - es := &armadaevents.EventSequence{ + eventSequences = append(eventSequences, &armadaevents.EventSequence{ Queue: job.Queue(), JobSetName: job.Jobset(), Events: []*armadaevents.EventSequence_Event{ { - Created: s.now(), + Created: &time, Event: &armadaevents.EventSequence_Event_JobRunPreempted{ JobRunPreempted: &armadaevents.JobRunPreempted{ PreemptedRunId: armadaevents.ProtoUuidFromUuid(run.Id()), @@ -384,7 +432,7 @@ func (s *Scheduler) eventsFromSchedulerResult(txn *jobdb.Txn, result *SchedulerR }, }, { - Created: s.now(), + Created: &time, Event: &armadaevents.EventSequence_Event_JobRunErrors{ JobRunErrors: &armadaevents.JobRunErrors{ RunId: armadaevents.ProtoUuidFromUuid(run.Id()), @@ -401,7 +449,7 @@ func (s *Scheduler) eventsFromSchedulerResult(txn *jobdb.Txn, result *SchedulerR }, }, { - Created: s.now(), + Created: &time, Event: &armadaevents.EventSequence_Event_JobErrors{ JobErrors: &armadaevents.JobErrors{ JobId: jobId, @@ -417,46 +465,43 @@ func (s *Scheduler) eventsFromSchedulerResult(txn *jobdb.Txn, result *SchedulerR }, }, }, - } - events = append(events, es) + }) } - for _, job := range ScheduledJobsFromSchedulerResult[*jobdb.Job](result) { + return eventSequences, nil +} + +func AppendEventSequencesFromScheduledJobs(eventSequences []*armadaevents.EventSequence, jobs []*jobdb.Job, time time.Time) ([]*armadaevents.EventSequence, error) { + for _, job := range jobs { jobId, err := armadaevents.ProtoUuidFromUlidString(job.Id()) if err != nil { return nil, err } - job = job.WithQueuedVersion(job.QueuedVersion() + 1) - job = job.WithQueued(false) - err = s.jobDb.Upsert(txn, []*jobdb.Job{job}) - if err != nil { - return nil, err + run := job.LatestRun() + if run == nil { + return nil, errors.Errorf("attempting to generate lease events for job %s with no associated runs", job.Id()) } - events = append( - events, - &armadaevents.EventSequence{ - Queue: job.Queue(), - JobSetName: job.Jobset(), // TODO: Rename to JobSet. - Events: []*armadaevents.EventSequence_Event{ - { - Created: s.now(), - Event: &armadaevents.EventSequence_Event_JobRunLeased{ - JobRunLeased: &armadaevents.JobRunLeased{ - RunId: armadaevents.ProtoUuidFromUuid(job.LatestRun().Id()), - JobId: jobId, - ExecutorId: job.LatestRun().Executor(), - // NodeId here refers to the unique identifier of the node in an executor cluster, - // which is referred to as the NodeName within the scheduler. 
- NodeId: job.LatestRun().NodeName(), - UpdateSequenceNumber: job.QueuedVersion(), - }, + eventSequences = append(eventSequences, &armadaevents.EventSequence{ + Queue: job.Queue(), + JobSetName: job.Jobset(), // TODO: Rename to JobSet. + Events: []*armadaevents.EventSequence_Event{ + { + Created: &time, + Event: &armadaevents.EventSequence_Event_JobRunLeased{ + JobRunLeased: &armadaevents.JobRunLeased{ + RunId: armadaevents.ProtoUuidFromUuid(run.Id()), + JobId: jobId, + ExecutorId: run.Executor(), + // NodeId here refers to the unique identifier of the node in an executor cluster, + // which is referred to as the NodeName within the scheduler. + NodeId: run.NodeName(), + UpdateSequenceNumber: job.QueuedVersion(), }, }, }, }, - ) + }) } - - return events, nil + return eventSequences, nil } // generateUpdateMessages generates EventSequences representing the state changes on updated jobs @@ -549,7 +594,8 @@ func (s *Scheduler) generateUpdateMessagesFromJob(job *jobdb.Job, jobRunErrors m } events = append(events, jobSucceeded) } else if lastRun.Failed() && !job.Queued() { - requeueJob := lastRun.Returned() && job.NumAttempts() < s.maxAttemptedRuns + failFast := job.GetAnnotations()[configuration.FailFastAnnotation] == "true" + requeueJob := !failFast && lastRun.Returned() && job.NumAttempts() < s.maxAttemptedRuns if requeueJob && lastRun.RunAttempted() { jobWithAntiAffinity, schedulable, err := s.addNodeAntiAffinitiesForAttemptedRunsIfSchedulable(job) @@ -589,6 +635,9 @@ func (s *Scheduler) generateUpdateMessagesFromJob(job *jobdb.Job, jobRunErrors m if job.NumAttempts() < s.maxAttemptedRuns { errorMessage = fmt.Sprintf("Job was attempted %d times, and has been tried once on all nodes it can run on - this job will no longer be retried", job.NumAttempts()) } + if failFast { + errorMessage = fmt.Sprintf("Job has fail fast flag set - this job will no longer be retried") + } runError = &armadaevents.Error{ Terminal: true, Reason: &armadaevents.Error_MaxRunsExceeded{ @@ -598,6 +647,12 @@ func (s *Scheduler) generateUpdateMessagesFromJob(job *jobdb.Job, jobRunErrors m }, } } + if runError == nil { + panic( + fmt.Sprintf("No run error found for run %s (job id = %s), this must mean we're out of sync with the database", + lastRun.Id().String(), job.Id()), + ) + } jobErrors := &armadaevents.EventSequence_Event{ Created: s.now(), Event: &armadaevents.EventSequence_Event_JobErrors{ diff --git a/internal/scheduler/scheduler_metrics.go b/internal/scheduler/scheduler_metrics.go new file mode 100644 index 00000000000..b6f79e66612 --- /dev/null +++ b/internal/scheduler/scheduler_metrics.go @@ -0,0 +1,150 @@ +package scheduler + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + log "github.com/sirupsen/logrus" + + "github.com/armadaproject/armada/internal/armada/configuration" + "github.com/armadaproject/armada/internal/scheduler/interfaces" +) + +const ( + NAMESPACE = "armada" + SUBSYSTEM = "scheduler" +) + +type SchedulerMetrics struct { + // Cycle time when scheduling, as leader. + scheduleCycleTime prometheus.Histogram + // Cycle time when reconciling, as leader or follower. + reconcileCycleTime prometheus.Histogram + // Number of jobs scheduled per queue. + scheduledJobsPerQueue prometheus.GaugeVec + // Number of jobs preempted per queue. 
+ preemptedJobsPerQueue prometheus.GaugeVec +} + +func NewSchedulerMetrics(config configuration.SchedulerMetricsConfig) *SchedulerMetrics { + scheduleCycleTime := prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: NAMESPACE, + Subsystem: SUBSYSTEM, + Name: "schedule_cycle_times", + Help: "Cycle time when in a scheduling round.", + Buckets: prometheus.ExponentialBuckets( + config.ScheduleCycleTimeHistogramSettings.Start, + config.ScheduleCycleTimeHistogramSettings.Factor, + config.ScheduleCycleTimeHistogramSettings.Count), + }, + ) + + reconcileCycleTime := prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: NAMESPACE, + Subsystem: SUBSYSTEM, + Name: "reconcile_cycle_times", + Help: "Cycle time when outside of a scheduling round.", + Buckets: prometheus.ExponentialBuckets( + config.ReconcileCycleTimeHistogramSettings.Start, + config.ReconcileCycleTimeHistogramSettings.Factor, + config.ReconcileCycleTimeHistogramSettings.Count), + }, + ) + + scheduledJobs := prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: NAMESPACE, + Subsystem: SUBSYSTEM, + Name: "scheduled_jobs", + Help: "Number of jobs scheduled each round.", + }, + []string{ + "queue", + "priority_class", + }, + ) + + preemptedJobs := prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: NAMESPACE, + Subsystem: SUBSYSTEM, + Name: "preempted_jobs", + Help: "Number of jobs preempted each round.", + }, + []string{ + "queue", + "priority_class", + }, + ) + + prometheus.MustRegister(scheduleCycleTime) + prometheus.MustRegister(reconcileCycleTime) + prometheus.MustRegister(scheduledJobs) + prometheus.MustRegister(preemptedJobs) + + return &SchedulerMetrics{ + scheduleCycleTime: scheduleCycleTime, + reconcileCycleTime: reconcileCycleTime, + scheduledJobsPerQueue: *scheduledJobs, + preemptedJobsPerQueue: *preemptedJobs, + } +} + +func (metrics *SchedulerMetrics) ReportScheduleCycleTime(cycleTime time.Duration) { + metrics.scheduleCycleTime.Observe(float64(cycleTime.Milliseconds())) +} + +func (metrics *SchedulerMetrics) ReportReconcileCycleTime(cycleTime time.Duration) { + metrics.reconcileCycleTime.Observe(float64(cycleTime.Milliseconds())) +} + +func (metrics *SchedulerMetrics) ReportSchedulerResult(result *SchedulerResult) { + metrics.reportScheduledJobs(result.ScheduledJobs) + metrics.reportPreemptedJobs(result.PreemptedJobs) +} + +func (metrics *SchedulerMetrics) reportScheduledJobs(scheduledJobs []interfaces.LegacySchedulerJob) { + jobAggregates := aggregateJobs(scheduledJobs) + observeJobAggregates(metrics.scheduledJobsPerQueue, jobAggregates) +} + +func (metrics *SchedulerMetrics) reportPreemptedJobs(preemptedJobs []interfaces.LegacySchedulerJob) { + jobAggregates := aggregateJobs(preemptedJobs) + observeJobAggregates(metrics.preemptedJobsPerQueue, jobAggregates) +} + +type collectionKey struct { + queue string + priorityClass string +} + +// aggregateJobs takes a list of jobs and counts how many there are of each queue, priorityClass pair. +func aggregateJobs[S ~[]E, E interfaces.LegacySchedulerJob](scheduledJobs S) map[collectionKey]int { + groups := make(map[collectionKey]int) + + for _, job := range scheduledJobs { + key := collectionKey{queue: job.GetQueue(), priorityClass: job.GetPriorityClassName()} + groups[key] += 1 + } + + return groups +} + +// observeJobAggregates reports a set of job aggregates to a given HistogramVec by queue and priorityClass. 
+func observeJobAggregates(metric prometheus.GaugeVec, jobAggregates map[collectionKey]int) { + for key, count := range jobAggregates { + queue := key.queue + priorityClassName := key.priorityClass + + observer, err := metric.GetMetricWithLabelValues(queue, priorityClassName) + + if err != nil { + // A metric failure isn't reason to kill the programme. + log.Error(err) + } else { + observer.Add(float64(count)) + } + } +} diff --git a/internal/scheduler/scheduler_metrics_test.go b/internal/scheduler/scheduler_metrics_test.go new file mode 100644 index 00000000000..e64075db86c --- /dev/null +++ b/internal/scheduler/scheduler_metrics_test.go @@ -0,0 +1,33 @@ +package scheduler + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/armadaproject/armada/internal/scheduler/jobdb" + "github.com/armadaproject/armada/internal/scheduler/testfixtures" +) + +func TestAggregateJobs(t *testing.T) { + testJobs := []*jobdb.Job{ + testfixtures.Test1Cpu4GiJob("queue_a", testfixtures.PriorityClass0), + testfixtures.Test1Cpu4GiJob("queue_b", testfixtures.PriorityClass0), + testfixtures.Test1Cpu4GiJob("queue_a", testfixtures.PriorityClass0), + testfixtures.Test1Cpu4GiJob("queue_a", testfixtures.PriorityClass1), + testfixtures.Test1Cpu4GiJob("queue_a", testfixtures.PriorityClass0), + testfixtures.Test1Cpu4GiJob("queue_b", testfixtures.PriorityClass1), + testfixtures.Test1Cpu4GiJob("queue_a", testfixtures.PriorityClass0), + } + + actual := aggregateJobs(testJobs) + + expected := map[collectionKey]int{ + {queue: "queue_a", priorityClass: testfixtures.PriorityClass0}: 4, + {queue: "queue_a", priorityClass: testfixtures.PriorityClass1}: 1, + {queue: "queue_b", priorityClass: testfixtures.PriorityClass0}: 1, + {queue: "queue_b", priorityClass: testfixtures.PriorityClass1}: 1, + } + + assert.Equal(t, expected, actual) +} diff --git a/internal/scheduler/scheduler_test.go b/internal/scheduler/scheduler_test.go index da9a292240d..df889db79f6 100644 --- a/internal/scheduler/scheduler_test.go +++ b/internal/scheduler/scheduler_test.go @@ -14,6 +14,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/clock" + "github.com/armadaproject/armada/internal/armada/configuration" protoutil "github.com/armadaproject/armada/internal/common/proto" "github.com/armadaproject/armada/internal/common/stringinterner" "github.com/armadaproject/armada/internal/common/util" @@ -32,6 +33,21 @@ const ( ) var ( + failFastSchedulingInfo = &schedulerobjects.JobSchedulingInfo{ + AtMostOnce: true, + ObjectRequirements: []*schedulerobjects.ObjectRequirements{ + { + Requirements: &schedulerobjects.ObjectRequirements_PodRequirements{ + PodRequirements: &schedulerobjects.PodRequirements{ + Annotations: map[string]string{ + configuration.FailFastAnnotation: "true", + }, + }, + }, + }, + }, + Version: 1, + } schedulingInfo = &schedulerobjects.JobSchedulingInfo{ AtMostOnce: true, ObjectRequirements: []*schedulerobjects.ObjectRequirements{ @@ -60,6 +76,18 @@ var ( Version: 2, } updatedSchedulingInfoBytes = protoutil.MustMarshall(updatedSchedulingInfo) + schedulerMetrics = NewSchedulerMetrics(configuration.SchedulerMetricsConfig{ + ScheduleCycleTimeHistogramSettings: configuration.HistogramConfig{ + Start: 1, + Factor: 1.1, + Count: 100, + }, + ReconcileCycleTimeHistogramSettings: configuration.HistogramConfig{ + Start: 1, + Factor: 1.1, + Count: 100, + }, + }) ) var queuedJob = jobdb.NewJob( @@ -88,6 +116,28 @@ var leasedJob = jobdb.NewJob( false, 1).WithQueued(false).WithNewRun("testExecutor", "test-node", "node") 
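The metrics code above follows an aggregate-then-observe pattern: count jobs per (queue, priority class) key, then add each count to a labelled gauge. A self-contained sketch of that pattern under a hypothetical metric name, using only Prometheus client calls that also appear above:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// key mirrors collectionKey above: jobs are aggregated per (queue, priority class) pair.
type key struct{ queue, priorityClass string }

func main() {
	// A hypothetical gauge; the scheduler's real metrics are defined in NewSchedulerMetrics above.
	gauge := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{Name: "example_scheduled_jobs", Help: "Jobs scheduled, by queue and priority class."},
		[]string{"queue", "priority_class"},
	)
	prometheus.MustRegister(gauge)

	// Aggregate: count jobs per key, as aggregateJobs does.
	jobs := []key{{"queue_a", "pc0"}, {"queue_a", "pc0"}, {"queue_b", "pc1"}}
	counts := make(map[key]int)
	for _, j := range jobs {
		counts[j]++
	}

	// Observe: add each count to the labelled gauge, as observeJobAggregates does.
	for k, n := range counts {
		g, err := gauge.GetMetricWithLabelValues(k.queue, k.priorityClass)
		if err != nil {
			fmt.Println(err) // Metric failures are logged rather than treated as fatal.
			continue
		}
		g.Add(float64(n))
	}
}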
+var defaultJobRunError = &armadaevents.Error{ + Terminal: true, + Reason: &armadaevents.Error_PodError{ + PodError: &armadaevents.PodError{ + Message: "generic pod error", + }, + }, +} + +var leasedFailFastJob = jobdb.NewJob( + util.NewULID(), + "testJobset", + "testQueue", + uint32(10), + failFastSchedulingInfo, + false, + 2, + false, + false, + false, + 1).WithQueued(false).WithNewRun("testExecutor", "test-node", "node") + var ( requeuedJobId = util.NewULID() requeuedJob = jobdb.NewJob( @@ -122,29 +172,30 @@ var ( // Test a single scheduler cycle func TestScheduler_TestCycle(t *testing.T) { tests := map[string]struct { - initialJobs []*jobdb.Job // jobs in the jobdb at the start of the cycle - jobUpdates []database.Job // job updates from the database - runUpdates []database.Run // run updates from the database - staleExecutor bool // if true then the executorRepository will report the executor as stale - fetchError bool // if true then the jobRepository will throw an error - scheduleError bool // if true then the schedulingalgo will throw an error - publishError bool // if true the publisher will throw an error - submitCheckerFailure bool // if true the submit checker will say the job is unschedulable - expectedJobRunLeased []string // ids of jobs we expect to have produced leased messages - expectedJobRunErrors []string // ids of jobs we expect to have produced jobRunErrors messages - expectedJobErrors []string // ids of jobs we expect to have produced jobErrors messages - expectedJobRunPreempted []string // ids of jobs we expect to have produced jobRunPreempted messages - expectedJobCancelled []string // ids of jobs we expect to have produced cancelled messages - expectedJobReprioritised []string // ids of jobs we expect to have produced reprioritised messages - expectedQueued []string // ids of jobs we expect to have produced requeued messages - expectedJobSucceeded []string // ids of jobs we expect to have produced succeeeded messages - expectedLeased []string // ids of jobs we expected to be leased in jobdb at the end of the cycle - expectedRequeued []string // ids of jobs we expected to be requeued in jobdb at the end of the cycle - expectedTerminal []string // ids of jobs we expected to be terminal in jobdb at the end of the cycle - expectedJobPriority map[string]uint32 // expected priority of jobs at the end of the cycle - expectedNodeAntiAffinities []string // list of nodes there is expected to be anti affinities for on job scheduling info - expectedJobSchedulingInfoVersion int // expected scheduling info version of jobs at the end of the cycle - expectedQueuedVersion int32 // expected queued version of jobs atthe end of the cycle + initialJobs []*jobdb.Job // jobs in the jobdb at the start of the cycle + jobUpdates []database.Job // job updates from the database + runUpdates []database.Run // run updates from the database + jobRunErrors map[uuid.UUID]*armadaevents.Error // job run errors in the database + staleExecutor bool // if true then the executorRepository will report the executor as stale + fetchError bool // if true then the jobRepository will throw an error + scheduleError bool // if true then the scheduling algo will throw an error + publishError bool // if true the publisher will throw an error + submitCheckerFailure bool // if true the submit checker will say the job is unschedulable + expectedJobRunLeased []string // ids of jobs we expect to have produced leased messages + expectedJobRunErrors []string // ids of jobs we expect to have produced jobRunErrors 
messages + expectedJobErrors []string // ids of jobs we expect to have produced jobErrors messages + expectedJobRunPreempted []string // ids of jobs we expect to have produced jobRunPreempted messages + expectedJobCancelled []string // ids of jobs we expect to have produced cancelled messages + expectedJobReprioritised []string // ids of jobs we expect to have produced reprioritised messages + expectedQueued []string // ids of jobs we expect to have produced requeued messages + expectedJobSucceeded []string // ids of jobs we expect to have produced succeeeded messages + expectedLeased []string // ids of jobs we expected to be leased in jobdb at the end of the cycle + expectedRequeued []string // ids of jobs we expected to be requeued in jobdb at the end of the cycle + expectedTerminal []string // ids of jobs we expected to be terminal in jobdb at the end of the cycle + expectedJobPriority map[string]uint32 // expected priority of jobs at the end of the cycle + expectedNodeAntiAffinities []string // list of nodes there is expected to be anti affinities for on job scheduling info + expectedJobSchedulingInfoVersion int // expected scheduling info version of jobs at the end of the cycle + expectedQueuedVersion int32 // expected queued version of jobs at the end of the cycle }{ "Lease a single job already in the db": { initialJobs: []*jobdb.Job{queuedJob}, @@ -301,6 +352,25 @@ func TestScheduler_TestCycle(t *testing.T) { expectedTerminal: []string{leasedJob.Id()}, expectedQueuedVersion: leasedJob.QueuedVersion(), }, + "Lease returned for fail fast job": { + initialJobs: []*jobdb.Job{leasedFailFastJob}, + // Fail fast should mean there is only ever 1 attempted run + runUpdates: []database.Run{ + { + RunID: leasedFailFastJob.LatestRun().Id(), + JobID: leasedFailFastJob.Id(), + JobSet: "testJobSet", + Executor: "testExecutor", + Failed: true, + Returned: true, + RunAttempted: false, + Serial: 1, + }, + }, + expectedJobErrors: []string{leasedFailFastJob.Id()}, + expectedTerminal: []string{leasedFailFastJob.Id()}, + expectedQueuedVersion: leasedFailFastJob.QueuedVersion(), + }, "Job cancelled": { initialJobs: []*jobdb.Job{leasedJob}, jobUpdates: []database.Job{ @@ -352,6 +422,9 @@ func TestScheduler_TestCycle(t *testing.T) { Serial: 1, }, }, + jobRunErrors: map[uuid.UUID]*armadaevents.Error{ + leasedJob.LatestRun().Id(): defaultJobRunError, + }, expectedJobErrors: []string{leasedJob.Id()}, expectedTerminal: []string{leasedJob.Id()}, expectedQueuedVersion: leasedJob.QueuedVersion(), @@ -407,6 +480,7 @@ func TestScheduler_TestCycle(t *testing.T) { jobRepo := &testJobRepository{ updatedJobs: tc.jobUpdates, updatedRuns: tc.runUpdates, + errors: tc.jobRunErrors, shouldError: tc.fetchError, } testClock := clock.NewFakeClock(time.Now()) @@ -440,6 +514,7 @@ func TestScheduler_TestCycle(t *testing.T) { clusterTimeout, maxNumberOfAttempts, nodeIdLabel, + schedulerMetrics, ) require.NoError(t, err) @@ -453,7 +528,7 @@ func TestScheduler_TestCycle(t *testing.T) { // run a scheduler cycle ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - err = sched.cycle(ctx, false, sched.leaderController.GetToken()) + err = sched.cycle(ctx, false, sched.leaderController.GetToken(), true) if tc.fetchError || tc.publishError || tc.scheduleError { assert.Error(t, err) } else { @@ -516,7 +591,7 @@ func TestScheduler_TestCycle(t *testing.T) { delete(remainingLeased, job.Id()) } if expectedPriority, ok := tc.expectedJobPriority[job.Id()]; ok { - assert.Equal(t, job.Priority(), expectedPriority) + 
assert.Equal(t, expectedPriority, job.Priority()) } if len(tc.expectedNodeAntiAffinities) > 0 { assert.Len(t, job.JobSchedulingInfo().ObjectRequirements, 1) @@ -532,12 +607,12 @@ func TestScheduler_TestCycle(t *testing.T) { if tc.expectedQueuedVersion != 0 { expectedQueuedVersion = tc.expectedQueuedVersion } - assert.Equal(t, job.QueuedVersion(), expectedQueuedVersion) + assert.Equal(t, expectedQueuedVersion, job.QueuedVersion()) expectedSchedulingInfoVersion := 1 if tc.expectedJobSchedulingInfoVersion != 0 { expectedSchedulingInfoVersion = tc.expectedJobSchedulingInfoVersion } - assert.Equal(t, job.JobSchedulingInfo().Version, uint32(expectedSchedulingInfoVersion)) + assert.Equal(t, uint32(expectedSchedulingInfoVersion), job.JobSchedulingInfo().Version) } assert.Equal(t, 0, len(remainingLeased)) assert.Equal(t, 0, len(remainingQueued)) @@ -603,7 +678,8 @@ func TestRun(t *testing.T) { 15*time.Second, 1*time.Hour, maxNumberOfAttempts, - nodeIdLabel) + nodeIdLabel, + schedulerMetrics) require.NoError(t, err) sched.clock = testClock @@ -734,20 +810,28 @@ func TestScheduler_TestSyncState(t *testing.T) { expectedJobDbIds: []string{queuedJob.Id()}, }, "job succeeded": { - initialJobs: []*jobdb.Job{queuedJob}, + initialJobs: []*jobdb.Job{leasedJob}, jobUpdates: []database.Job{ { - JobID: queuedJob.Id(), - JobSet: queuedJob.Jobset(), - Queue: queuedJob.Queue(), - Submitted: queuedJob.Created(), - Priority: int64(queuedJob.Priority()), + JobID: leasedJob.Id(), + JobSet: leasedJob.Jobset(), + Queue: leasedJob.Queue(), + Submitted: leasedJob.Created(), + Priority: int64(leasedJob.Priority()), SchedulingInfo: schedulingInfoBytes, Succeeded: true, Serial: 1, }, }, - expectedUpdatedJobs: []*jobdb.Job{}, + runUpdates: []database.Run{ + { + RunID: leasedJob.LatestRun().Id(), + JobID: leasedJob.LatestRun().JobId(), + JobSet: leasedJob.GetJobSet(), + Succeeded: true, + }, + }, + expectedUpdatedJobs: []*jobdb.Job{leasedJob.WithUpdatedRun(leasedJob.LatestRun().WithSucceeded(true))}, expectedJobDbIds: []string{}, }, "job requeued": { @@ -804,7 +888,8 @@ func TestScheduler_TestSyncState(t *testing.T) { 5*time.Second, 1*time.Hour, maxNumberOfAttempts, - nodeIdLabel) + nodeIdLabel, + schedulerMetrics) require.NoError(t, err) // insert initial jobs @@ -947,7 +1032,7 @@ func (t *testSchedulingAlgo) Schedule(ctx context.Context, txn *jobdb.Txn, jobDb if !job.Queued() { return nil, errors.Errorf("was asked to lease %s but job was already leased", job.Id()) } - job = job.WithQueued(false).WithNewRun("test-executor", "test-node", "node") + job = job.WithQueuedVersion(job.QueuedVersion()+1).WithQueued(false).WithNewRun("test-executor", "test-node", "node") scheduledJobs = append(scheduledJobs, job) } if err := jobDb.Upsert(txn, preemptedJobs); err != nil { diff --git a/internal/scheduler/schedulerapp.go b/internal/scheduler/schedulerapp.go index 83ed80abde5..fc38734e11d 100644 --- a/internal/scheduler/schedulerapp.go +++ b/internal/scheduler/schedulerapp.go @@ -138,7 +138,7 @@ func Run(config schedulerconfig.Configuration) error { executorRepository, legacyExecutorRepository, allowedPcs, - config.Scheduling.MaximumJobsToSchedule, + config.MaxJobsLeasedPerCall, config.Scheduling.Preemption.NodeIdLabel, config.Scheduling.Preemption.PriorityClassNameOverride, config.Pulsar.MaxAllowedMessageSize, @@ -206,6 +206,7 @@ func Run(config schedulerconfig.Configuration) error { config.ExecutorTimeout, config.Scheduling.MaxRetries+1, config.Scheduling.Preemption.NodeIdLabel, + NewSchedulerMetrics(config.Metrics.Metrics), ) if err != 
nil { return errors.WithMessage(err, "error creating scheduler") diff --git a/internal/scheduler/schedulerobjects/schedulerobjects.proto b/internal/scheduler/schedulerobjects/schedulerobjects.proto index fd189f081e8..3bcff305487 100644 --- a/internal/scheduler/schedulerobjects/schedulerobjects.proto +++ b/internal/scheduler/schedulerobjects/schedulerobjects.proto @@ -7,8 +7,6 @@ import "k8s.io/api/core/v1/generated.proto"; import "k8s.io/apimachinery/pkg/api/resource/generated.proto"; import "github.com/gogo/protobuf/gogoproto/gogo.proto"; -// option (gogoproto.stable_marshaler) = true; - // Executor represents an executor running on a worker cluster message Executor { // Name of the executor. @@ -176,4 +174,4 @@ message PulsarSchedulerJobDetails { string JobId = 1; string Queue = 2; string JobSet = 3; -} +} \ No newline at end of file diff --git a/internal/scheduler/scheduling_algo.go b/internal/scheduler/scheduling_algo.go index 6bcb1f3b0b5..2c925d465a5 100644 --- a/internal/scheduler/scheduling_algo.go +++ b/internal/scheduler/scheduling_algo.go @@ -21,6 +21,7 @@ import ( schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints" schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" "github.com/armadaproject/armada/internal/scheduler/database" + "github.com/armadaproject/armada/internal/scheduler/fairness" "github.com/armadaproject/armada/internal/scheduler/interfaces" "github.com/armadaproject/armada/internal/scheduler/jobdb" "github.com/armadaproject/armada/internal/scheduler/nodedb" @@ -84,6 +85,18 @@ func (l *FairSchedulingAlgo) Schedule( jobDb *jobdb.JobDb, ) (*SchedulerResult, error) { log := ctxlogrus.Extract(ctx) + + overallSchedulerResult := &SchedulerResult{ + NodeIdByJobId: make(map[string]string), + SchedulingContexts: make([]*schedulercontext.SchedulingContext, 0, 0), + } + + // Exit immediately if scheduling is disabled. + if l.schedulingConfig.DisableScheduling { + log.Info("skipping scheduling - scheduling disabled") + return overallSchedulerResult, nil + } + ctxWithTimeout, cancel := context.WithTimeout(ctx, l.maxSchedulingDuration) defer cancel() @@ -91,9 +104,7 @@ func (l *FairSchedulingAlgo) Schedule( if err != nil { return nil, err } - overallSchedulerResult := &SchedulerResult{ - NodeIdByJobId: make(map[string]string), - } + executorGroups := l.groupExecutors(fsctx.executors) if len(l.executorGroupsToSchedule) == 0 { // Cycle over groups in a consistent order. @@ -123,7 +134,10 @@ func (l *FairSchedulingAlgo) Schedule( // Assume pool and minimumJobSize are consistent within the group. pool := executorGroup[0].Pool minimumJobSize := executorGroup[0].MinimumJobSize - log.Infof("scheduling on executor group %s", executorGroupLabel) + log.Infof( + "scheduling on executor group %s with capacity %s", + executorGroupLabel, fsctx.totalCapacityByPool[pool].CompactString(), + ) schedulerResult, sctx, err := l.scheduleOnExecutors( ctxWithTimeout, fsctx, @@ -160,6 +174,7 @@ func (l *FairSchedulingAlgo) Schedule( // Aggregate changes across executors. overallSchedulerResult.PreemptedJobs = append(overallSchedulerResult.PreemptedJobs, schedulerResult.PreemptedJobs...) overallSchedulerResult.ScheduledJobs = append(overallSchedulerResult.ScheduledJobs, schedulerResult.ScheduledJobs...) + overallSchedulerResult.SchedulingContexts = append(overallSchedulerResult.SchedulingContexts, schedulerResult.SchedulingContexts...) maps.Copy(overallSchedulerResult.NodeIdByJobId, schedulerResult.NodeIdByJobId) // Update fsctx. 
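// --- Illustrative sketch (not part of this diff) ---
// The Schedule loop above refills l.executorGroupsToSchedule when it is empty and then
// works through one group per iteration, so executor groups are visited in a consistent
// order even when a cycle is cut short by maxSchedulingDuration or skipped entirely via
// DisableScheduling. The standalone sketch below shows that refill-and-pop pattern with
// plain strings standing in for executor groups; the alphabetical refill order and the
// type names here are assumptions made for the example only.
package main

import (
	"fmt"
	"sort"
)

type groupRotator struct {
	// Groups still to be visited in the current pass; refilled once exhausted so every
	// group is eventually scheduled on, even across interrupted cycles.
	pending []string
}

// next returns the next group to schedule on, refilling from allGroups in a
// consistent (sorted) order whenever the pending list is empty.
func (r *groupRotator) next(allGroups []string) string {
	if len(r.pending) == 0 {
		r.pending = append(r.pending, allGroups...)
		sort.Strings(r.pending)
	}
	group := r.pending[0]
	r.pending = r.pending[1:]
	return group
}

func main() {
	r := &groupRotator{}
	groups := []string{"gpu-pool-0", "cpu-pool-0"}
	for i := 0; i < 5; i++ {
		fmt.Println(r.next(groups)) // cpu-pool-0, gpu-pool-0, cpu-pool-0, ...
	}
}
// --- end sketch ---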
@@ -335,17 +350,30 @@ func (l *FairSchedulingAlgo) scheduleOnExecutors( if len(executors) == 1 { executorId = executors[0].Id } + totalResources := fsctx.totalCapacityByPool[pool] + var fairnessCostProvider fairness.FairnessCostProvider + if l.schedulingConfig.FairnessModel == configuration.DominantResourceFairness { + fairnessCostProvider, err = fairness.NewDominantResourceFairness( + totalResources, + l.schedulingConfig.DominantResourceFairnessResourcesToConsider, + ) + if err != nil { + return nil, nil, err + } + } else { + fairnessCostProvider, err = fairness.NewAssetFairness(l.schedulingConfig.ResourceScarcity) + if err != nil { + return nil, nil, err + } + } sctx := schedulercontext.NewSchedulingContext( executorId, pool, l.schedulingConfig.Preemption.PriorityClasses, l.schedulingConfig.Preemption.DefaultPriorityClass, - l.schedulingConfig.ResourceScarcity, - fsctx.totalCapacityByPool[pool], + fairnessCostProvider, + totalResources, ) - if l.schedulingConfig.FairnessModel == configuration.DominantResourceFairness { - sctx.EnableDominantResourceFairness(l.schedulingConfig.DominantResourceFairnessResourcesToConsider) - } for queue, priorityFactor := range fsctx.priorityFactorByQueue { if !fsctx.isActiveByQueueName[queue] { // To ensure fair share is computed only from active queues, i.e., queues with jobs queued or running. @@ -375,10 +403,7 @@ func (l *FairSchedulingAlgo) scheduleOnExecutors( l.schedulingConfig.Preemption.NodeEvictionProbability, l.schedulingConfig.Preemption.NodeOversubscriptionEvictionProbability, l.schedulingConfig.Preemption.ProtectedFractionOfFairShare, - &schedulerJobRepositoryAdapter{ - txn: fsctx.txn, - db: fsctx.jobDb, - }, + NewSchedulerJobRepositoryAdapter(fsctx.jobDb, fsctx.txn), nodeDb, fsctx.nodeIdByJobId, fsctx.jobIdsByGangId, @@ -390,11 +415,13 @@ func (l *FairSchedulingAlgo) scheduleOnExecutors( if l.schedulingConfig.EnableAssertions { scheduler.EnableAssertions() } + if l.schedulingConfig.EnableNewPreemptionStrategy { + scheduler.EnableNewPreemptionStrategy() + } result, err := scheduler.Schedule(ctx) if err != nil { return nil, nil, err } - for i, job := range result.PreemptedJobs { jobDbJob := job.(*jobdb.Job) if run := jobDbJob.LatestRun(); run != nil { @@ -413,21 +440,30 @@ func (l *FairSchedulingAlgo) scheduleOnExecutors( if node, err := nodeDb.GetNode(nodeId); err != nil { return nil, nil, err } else { - result.ScheduledJobs[i] = jobDbJob.WithQueued(false).WithNewRun(node.Executor, node.Id, node.Name) + result.ScheduledJobs[i] = jobDbJob.WithQueuedVersion(jobDbJob.QueuedVersion()+1).WithQueued(false).WithNewRun(node.Executor, node.Id, node.Name) } } return result, sctx, nil } // Adapter to make jobDb implement the JobRepository interface. -type schedulerJobRepositoryAdapter struct { +// +// TODO: Pass JobDb into the scheduler instead of using this shim to convert to a JobRepo. +type SchedulerJobRepositoryAdapter struct { db *jobdb.JobDb txn *jobdb.Txn } +func NewSchedulerJobRepositoryAdapter(db *jobdb.JobDb, txn *jobdb.Txn) *SchedulerJobRepositoryAdapter { + return &SchedulerJobRepositoryAdapter{ + db: db, + txn: txn, + } +} + // GetQueueJobIds is necessary to implement the JobRepository interface, which we need while transitioning from the old // to new scheduler. 
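// --- Illustrative sketch (not part of this diff) ---
// scheduleOnExecutors above now chooses a fairness cost provider up front from
// l.schedulingConfig.FairnessModel and injects it into NewSchedulingContext, rather than
// constructing the context with ResourceScarcity and toggling dominant-resource fairness
// afterwards. The standalone sketch below shows the same select-and-inject shape with
// simplified stand-in types; CostProvider, assetFairness and dominantResourceFairness
// are not the real fairness package types.
package main

import "fmt"

// CostProvider reports how "expensive" a queue's allocation is; lower cost means the
// queue is further below its fair share.
type CostProvider interface {
	Cost(allocation map[string]float64) float64
}

// assetFairness: cost is a scarcity-weighted sum of the allocated resources.
type assetFairness struct{ scarcity map[string]float64 }

func (f assetFairness) Cost(allocation map[string]float64) float64 {
	total := 0.0
	for name, quantity := range allocation {
		total += f.scarcity[name] * quantity
	}
	return total
}

// dominantResourceFairness: cost is the largest share of any single resource relative
// to total capacity (the queue's dominant resource share).
type dominantResourceFairness struct{ capacity map[string]float64 }

func (f dominantResourceFairness) Cost(allocation map[string]float64) float64 {
	maxShare := 0.0
	for name, quantity := range allocation {
		capacity := f.capacity[name]
		if capacity <= 0 {
			continue
		}
		if share := quantity / capacity; share > maxShare {
			maxShare = share
		}
	}
	return maxShare
}

// newCostProvider picks an implementation from config, mirroring the branch on
// configuration.DominantResourceFairness in the hunk above.
func newCostProvider(model string, capacity, scarcity map[string]float64) CostProvider {
	if model == "DominantResourceFairness" {
		return dominantResourceFairness{capacity: capacity}
	}
	return assetFairness{scarcity: scarcity}
}

func main() {
	capacity := map[string]float64{"cpu": 1000, "memory": 4096}
	provider := newCostProvider("DominantResourceFairness", capacity, nil)
	fmt.Println(provider.Cost(map[string]float64{"cpu": 100, "memory": 2048})) // 0.5
}
// --- end sketch ---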
-func (repo *schedulerJobRepositoryAdapter) GetQueueJobIds(queue string) ([]string, error) { +func (repo *SchedulerJobRepositoryAdapter) GetQueueJobIds(queue string) ([]string, error) { rv := make([]string, 0) it := repo.db.QueuedJobs(repo.txn, queue) for v, _ := it.Next(); v != nil; v, _ = it.Next() { @@ -438,7 +474,7 @@ func (repo *schedulerJobRepositoryAdapter) GetQueueJobIds(queue string) ([]strin // GetExistingJobsByIds is necessary to implement the JobRepository interface which we need while transitioning from the // old to new scheduler. -func (repo *schedulerJobRepositoryAdapter) GetExistingJobsByIds(ids []string) ([]interfaces.LegacySchedulerJob, error) { +func (repo *SchedulerJobRepositoryAdapter) GetExistingJobsByIds(ids []string) ([]interfaces.LegacySchedulerJob, error) { rv := make([]interfaces.LegacySchedulerJob, 0, len(ids)) for _, id := range ids { if job := repo.db.GetById(repo.txn, id); job != nil { diff --git a/internal/scheduler/simulator/simulator.go b/internal/scheduler/simulator/simulator.go new file mode 100644 index 00000000000..97666a7a9fc --- /dev/null +++ b/internal/scheduler/simulator/simulator.go @@ -0,0 +1,869 @@ +package simulator + +import ( + "bytes" + "container/heap" + "context" + fmt "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/caarlos0/log" + "github.com/grpc-ecosystem/go-grpc-middleware/logging/logrus/ctxlogrus" + "github.com/mattn/go-zglob" + "github.com/oklog/ulid" + "github.com/pkg/errors" + "github.com/renstrom/shortuuid" + "github.com/sirupsen/logrus" + "github.com/spf13/viper" + "golang.org/x/exp/maps" + "golang.org/x/exp/slices" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/yaml" + + "github.com/armadaproject/armada/internal/armada/configuration" + commonconfig "github.com/armadaproject/armada/internal/common/config" + armadaslices "github.com/armadaproject/armada/internal/common/slices" + "github.com/armadaproject/armada/internal/common/util" + "github.com/armadaproject/armada/internal/scheduler" + schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints" + schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" + "github.com/armadaproject/armada/internal/scheduler/fairness" + "github.com/armadaproject/armada/internal/scheduler/jobdb" + "github.com/armadaproject/armada/internal/scheduler/nodedb" + schedulerobjects "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" + "github.com/armadaproject/armada/internal/scheduleringester" + "github.com/armadaproject/armada/pkg/armadaevents" +) + +// Simulator captures the parameters and state of the Armada simulator. +type Simulator struct { + testCase *TestCase + schedulingConfig configuration.SchedulingConfig + // Map from jobId to the jobTemplate from which the job was created. + jobTemplateByJobId map[string]*JobTemplate + // Map from job template ids to slices of templates depending on those ids. + jobTemplatesByDependencyIds map[string]map[string]*JobTemplate + // Map from job template id to jobTemplate for templates for which all jobs have yet to succeed. + activeJobTemplatesById map[string]*JobTemplate + // The JobDb stores all jobs that have yet to terminate. + jobDb *jobdb.JobDb + // Map from node id to the pool to which the node belongs. + poolByNodeId map[string]string + // Separate nodeDb per pool and executorGroup. + nodeDbByPoolAndExecutorGroup map[string][]*nodedb.NodeDb + // Map from executor name to the nodeDb to which it belongs. 
+ nodeDbByExecutorName map[string]*nodedb.NodeDb + // Allocation by pool for each queue and priority class. + // Stored across invocations of the scheduler. + allocationByPoolAndQueueAndPriorityClass map[string]map[string]schedulerobjects.QuantityByTAndResourceType[string] + // Total resources across all executorGroups for each pool. + totalResourcesByPool map[string]schedulerobjects.ResourceList + // Current simulated time. + time time.Time + // Sequence number of the next event to be published. + sequenceNumber int + // Events stored in a priority queue ordered by submit time. + eventLog EventLog + // Simulated events are emitted on this channel in order. + c chan *armadaevents.EventSequence +} + +func NewSimulator(testCase *TestCase, schedulingConfig configuration.SchedulingConfig) (*Simulator, error) { + initialiseTestCase(testCase) + if err := validateTestCase(testCase); err != nil { + return nil, err + } + + // Setup nodes. + nodeDbByPoolAndExecutorGroup := make(map[string][]*nodedb.NodeDb) + totalResourcesByPool := make(map[string]schedulerobjects.ResourceList) + poolByNodeId := make(map[string]string) + // executorGroupByExecutor := make(map[string]string) + nodeDbByExecutorName := make(map[string]*nodedb.NodeDb) + for _, pool := range testCase.Pools { + totalResourcesForPool := schedulerobjects.ResourceList{} + for executorGroupIndex, executorGroup := range pool.ExecutorGroups { + nodeDb, err := nodedb.NewNodeDb( + schedulingConfig.Preemption.PriorityClasses, + schedulingConfig.MaxExtraNodesToConsider, + schedulingConfig.IndexedResources, + schedulingConfig.IndexedTaints, + schedulingConfig.IndexedNodeLabels, + ) + if err != nil { + return nil, err + } + for executorIndex, executor := range executorGroup.Executors { + executorName := fmt.Sprintf("%s-%d-%d", pool.Name, executorGroupIndex, executorIndex) + nodeDbByExecutorName[executorName] = nodeDb + for nodeTemplateIndex, nodeTemplate := range executor.NodeTemplates { + for i := 0; i < int(nodeTemplate.Number); i++ { + nodeId := fmt.Sprintf("%s-%d-%d-%d-%d", pool.Name, executorGroupIndex, executorIndex, nodeTemplateIndex, i) + allocatableByPriorityAndResource := make(map[int32]schedulerobjects.ResourceList) + for _, priorityClass := range schedulingConfig.Preemption.PriorityClasses { + allocatableByPriorityAndResource[priorityClass.Priority] = nodeTemplate.TotalResources.DeepCopy() + } + node := &schedulerobjects.Node{ + Id: nodeId, + Name: nodeId, + Executor: executorName, + Taints: slices.Clone(nodeTemplate.Taints), + Labels: maps.Clone(nodeTemplate.Labels), + TotalResources: nodeTemplate.TotalResources.DeepCopy(), + AllocatableByPriorityAndResource: allocatableByPriorityAndResource, + } + txn := nodeDb.Txn(true) + if err := nodeDb.CreateAndInsertWithApiJobsWithTxn(txn, nil, node); err != nil { + txn.Abort() + return nil, err + } + txn.Commit() + poolByNodeId[nodeId] = pool.Name + } + } + } + nodeDbByPoolAndExecutorGroup[pool.Name] = append(nodeDbByPoolAndExecutorGroup[pool.Name], nodeDb) + totalResourcesForPool.Add(nodeDb.TotalResources()) + } + totalResourcesByPool[pool.Name] = totalResourcesForPool + } + s := &Simulator{ + testCase: testCase, + schedulingConfig: schedulingConfig, + jobTemplateByJobId: make(map[string]*JobTemplate), + jobTemplatesByDependencyIds: make(map[string]map[string]*JobTemplate), + activeJobTemplatesById: make(map[string]*JobTemplate), + jobDb: jobdb.NewJobDb(), + poolByNodeId: poolByNodeId, + nodeDbByPoolAndExecutorGroup: nodeDbByPoolAndExecutorGroup, + nodeDbByExecutorName: nodeDbByExecutorName, + 
allocationByPoolAndQueueAndPriorityClass: make(map[string]map[string]schedulerobjects.QuantityByTAndResourceType[string]), + totalResourcesByPool: totalResourcesByPool, + c: make(chan *armadaevents.EventSequence), + } + + // Mark all jobTemplates as active. + for _, queue := range testCase.Queues { + for _, jobTemplate := range queue.JobTemplates { + s.activeJobTemplatesById[jobTemplate.Id] = jobTemplate + } + } + + // Publish submitJob messages for all jobTemplates without dependencies. + for _, queue := range testCase.Queues { + for _, jobTemplate := range queue.JobTemplates { + if len(jobTemplate.Dependencies) > 0 { + continue + } + eventSequence := &armadaevents.EventSequence{ + Queue: queue.Name, + JobSetName: jobTemplate.JobSet, + } + for k := 0; k < int(jobTemplate.Number); k++ { + if len(jobTemplate.Dependencies) > 0 { + continue + } + jobId := util.ULID() + eventSequence.Events = append( + eventSequence.Events, + &armadaevents.EventSequence_Event{ + Created: pointer(maxTime(s.time, jobTemplate.MinSubmitTime)), + Event: &armadaevents.EventSequence_Event_SubmitJob{ + SubmitJob: submitJobFromJobTemplate(jobId, jobTemplate), + }, + }, + ) + s.jobTemplateByJobId[jobId.String()] = jobTemplate + } + if len(eventSequence.Events) > 0 { + s.pushEventSequence(eventSequence) + } + } + } + + // Setup the jobTemplate dependency map. + for _, queue := range testCase.Queues { + for _, jobTemplate := range queue.JobTemplates { + for _, dependencyJobTemplateId := range jobTemplate.Dependencies { + dependencyJobTemplate, ok := s.activeJobTemplatesById[dependencyJobTemplateId] + if !ok { + return nil, errors.Errorf( + "jobTemplate %s depends on jobTemplate %s, which does not exist", + jobTemplate.Id, dependencyJobTemplate.Id, + ) + } + m := s.jobTemplatesByDependencyIds[dependencyJobTemplateId] + if m == nil { + m = make(map[string]*JobTemplate) + s.jobTemplatesByDependencyIds[dependencyJobTemplateId] = m + } + m[jobTemplate.Id] = jobTemplate + } + } + } + + // Publish scheduleEvent. + s.pushScheduleEvent(s.time.Add(10 * time.Second)) + return s, nil +} + +func (s *Simulator) C() <-chan *armadaevents.EventSequence { + return s.c +} + +func validateTestCase(testCase *TestCase) error { + poolNames := util.Map(testCase.Pools, func(pool *Pool) string { return pool.Name }) + if !slices.Equal(poolNames, armadaslices.Unique(poolNames)) { + return errors.Errorf("duplicate pool name: %v", poolNames) + } + + executorNames := make([]string, 0) + for _, pool := range testCase.Pools { + for _, executorGroup := range pool.ExecutorGroups { + for _, executor := range executorGroup.Executors { + executorNames = append(executorNames, executor.Name) + } + } + } + if !slices.Equal(executorNames, armadaslices.Unique(executorNames)) { + return errors.Errorf("duplicate executor name: %v", executorNames) + } + + queueNames := util.Map(testCase.Queues, func(queue Queue) string { return queue.Name }) + if !slices.Equal(queueNames, armadaslices.Unique(queueNames)) { + return errors.Errorf("duplicate queue name: %v", queueNames) + } + return nil +} + +func initialiseTestCase(testCase *TestCase) { + // Assign names to executors with none specified. + for _, pool := range testCase.Pools { + for i, executorGroup := range pool.ExecutorGroups { + for j, executor := range executorGroup.Executors { + if executor.Name == "" { + executor.Name = fmt.Sprintf("%s-%d-%d", pool.Name, i, j) + } + } + } + } + + // Assign names to jobTemplates with none specified. 
+ for _, queue := range testCase.Queues { + for i, jobTemplate := range queue.JobTemplates { + if jobTemplate.Id == "" { + jobTemplate.Id = fmt.Sprintf("%s-%d", queue.Name, i) + } + jobTemplate.Queue = queue.Name + } + } +} + +func submitJobFromJobTemplate(jobId ulid.ULID, jobTemplate *JobTemplate) *armadaevents.SubmitJob { + return &armadaevents.SubmitJob{ + JobId: armadaevents.ProtoUuidFromUlid(jobId), + Priority: jobTemplate.QueuePriority, + MainObject: &armadaevents.KubernetesMainObject{ + ObjectMeta: &armadaevents.ObjectMeta{ + Annotations: jobTemplate.Requirements.Annotations, + }, + Object: &armadaevents.KubernetesMainObject_PodSpec{ + PodSpec: &armadaevents.PodSpecWithAvoidList{ + PodSpec: &v1.PodSpec{ + NodeSelector: jobTemplate.Requirements.NodeSelector, + Affinity: jobTemplate.Requirements.Affinity, + Tolerations: jobTemplate.Requirements.Tolerations, + PriorityClassName: jobTemplate.PriorityClassName, + Containers: []v1.Container{ + { + Resources: jobTemplate.Requirements.ResourceRequirements, + }, + }, + }, + }, + }, + }, + } +} + +func (s *Simulator) pushEventSequence(eventSequence *armadaevents.EventSequence) { + if len(eventSequence.Events) == 0 { + return + } + heap.Push( + &s.eventLog, + Event{ + time: *eventSequence.Events[0].Created, + sequenceNumber: s.sequenceNumber, + eventSequenceOrScheduleEvent: eventSequence, + }, + ) + s.sequenceNumber++ +} + +func (s *Simulator) pushScheduleEvent(time time.Time) { + heap.Push( + &s.eventLog, + Event{ + time: time, + sequenceNumber: s.sequenceNumber, + eventSequenceOrScheduleEvent: scheduleEvent{}, + }, + ) + s.sequenceNumber++ +} + +type EventLog []Event + +type Event struct { + // Time at which the event was submitted. + time time.Time + // Each event is assigned a sequence number. + // Events with equal time are ordered by their sequence number. + sequenceNumber int + // One of armadaevents.EventSequence or scheduleEvent.. + eventSequenceOrScheduleEvent any + // Maintained by the heap.Interface methods. + index int +} + +func (el EventLog) Len() int { return len(el) } + +func (el EventLog) Less(i, j int) bool { + if el[i].time == el[j].time { + return el[i].sequenceNumber < el[j].sequenceNumber + } + return el[j].time.After(el[i].time) +} + +func (el EventLog) Swap(i, j int) { + el[i], el[j] = el[j], el[i] + el[i].index = i + el[j].index = j +} + +func (el *EventLog) Push(x any) { + n := len(*el) + item := x.(Event) + item.index = n + *el = append(*el, item) +} + +func (el *EventLog) Pop() any { + old := *el + n := len(old) + item := old[n-1] + old[n-1] = Event{} // avoid memory leak + item.index = -1 // for safety + *el = old[0 : n-1] + return item +} + +// scheduleEvent is an event indicating the scheduler should be run. 
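+// --- Usage sketch (not part of this diff) ---
+// The heap.Interface methods above make EventLog a priority queue ordered primarily by
+// event time and secondarily by sequence number, so simultaneous events pop in
+// submission order. The sketch below assumes it compiles alongside the types in this
+// package; the timestamps are arbitrary.
+//
+//	package simulator
+//
+//	import (
+//		"container/heap"
+//		"fmt"
+//		"time"
+//	)
+//
+//	func demoEventLogOrdering() {
+//		var eventLog EventLog
+//		heap.Init(&eventLog)
+//		t0 := time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC)
+//
+//		// Two events at the same instant plus one earlier event, pushed out of order.
+//		heap.Push(&eventLog, Event{time: t0, sequenceNumber: 2})
+//		heap.Push(&eventLog, Event{time: t0, sequenceNumber: 1})
+//		heap.Push(&eventLog, Event{time: t0.Add(-time.Second), sequenceNumber: 3})
+//
+//		// Pops in time order; ties broken by sequence number:
+//		// the earlier event first, then sequence numbers 1 and 2.
+//		for eventLog.Len() > 0 {
+//			event := heap.Pop(&eventLog).(Event)
+//			fmt.Println(event.time.Format(time.RFC3339), event.sequenceNumber)
+//		}
+//	}
+// --- end sketch ---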
+type scheduleEvent struct{} + +func (s *Simulator) Run() error { + defer close(s.c) + for s.eventLog.Len() > 0 { + event := heap.Pop(&s.eventLog).(Event) + if err := s.handleSimulatorEvent(event); err != nil { + return err + } + } + return nil +} + +func (s *Simulator) handleSimulatorEvent(event Event) error { + s.time = event.time + switch e := event.eventSequenceOrScheduleEvent.(type) { + case *armadaevents.EventSequence: + if err := s.handleEventSequence(e); err != nil { + return err + } + case scheduleEvent: + if err := s.handleScheduleEvent(); err != nil { + return err + } + } + return nil +} + +func (s *Simulator) handleScheduleEvent() error { + var eventSequences []*armadaevents.EventSequence + txn := s.jobDb.WriteTxn() + defer txn.Abort() + for _, pool := range s.testCase.Pools { + for i := range pool.ExecutorGroups { + nodeDb := s.nodeDbByPoolAndExecutorGroup[pool.Name][i] + if err := nodeDb.Reset(); err != nil { + return err + } + totalResources := s.totalResourcesByPool[pool.Name] + fairnessCostProvider, err := fairness.NewDominantResourceFairness( + totalResources, + s.schedulingConfig.DominantResourceFairnessResourcesToConsider, + ) + if err != nil { + return err + } + sctx := schedulercontext.NewSchedulingContext( + fmt.Sprintf("%s-%d", pool.Name, i), + pool.Name, + s.schedulingConfig.Preemption.PriorityClasses, + s.schedulingConfig.Preemption.DefaultPriorityClass, + fairnessCostProvider, + totalResources, + ) + for _, queue := range s.testCase.Queues { + err := sctx.AddQueueSchedulingContext( + queue.Name, + queue.Weight, + s.allocationByPoolAndQueueAndPriorityClass[pool.Name][queue.Name], + ) + if err != nil { + return err + } + } + constraints := schedulerconstraints.SchedulingConstraintsFromSchedulingConfig( + pool.Name, + totalResources, + // Minimum job size not not used for simulation; use taints/tolerations instead. + schedulerobjects.ResourceList{}, + s.schedulingConfig, + ) + sch := scheduler.NewPreemptingQueueScheduler( + sctx, + constraints, + s.schedulingConfig.Preemption.NodeEvictionProbability, + s.schedulingConfig.Preemption.NodeOversubscriptionEvictionProbability, + s.schedulingConfig.Preemption.ProtectedFractionOfFairShare, + scheduler.NewSchedulerJobRepositoryAdapter(s.jobDb, txn), + nodeDb, + // TODO: Necessary to support partial eviction. + nil, + nil, + nil, + ) + if s.schedulingConfig.EnableNewPreemptionStrategy { + sch.EnableNewPreemptionStrategy() + } + ctx := ctxlogrus.ToContext(context.Background(), logrus.NewEntry(logrus.New())) + result, err := sch.Schedule(ctx) + if err != nil { + return err + } + + // Update jobDb to reflect the decisions by the scheduler. + // Sort jobs to ensure deterministic event ordering. 
+ preemptedJobs := scheduler.PreemptedJobsFromSchedulerResult[*jobdb.Job](result) + scheduledJobs := scheduler.ScheduledJobsFromSchedulerResult[*jobdb.Job](result) + less := func(a, b *jobdb.Job) bool { + if a.Queue() < b.Queue() { + return true + } else if a.Queue() > b.Queue() { + return false + } + if a.Id() < b.Id() { + return true + } else if a.Id() > b.Id() { + return false + } + return false + } + slices.SortFunc(preemptedJobs, less) + slices.SortFunc(scheduledJobs, less) + for i, job := range preemptedJobs { + if run := job.LatestRun(); run != nil { + job = job.WithUpdatedRun(run.WithFailed(true)) + } else { + return errors.Errorf("attempting to preempt job %s with no associated runs", job.Id()) + } + preemptedJobs[i] = job.WithQueued(false).WithFailed(true) + } + for i, job := range scheduledJobs { + nodeId := result.NodeIdByJobId[job.GetId()] + if nodeId == "" { + return errors.Errorf("job %s not mapped to any node", job.GetId()) + } + if node, err := nodeDb.GetNode(nodeId); err != nil { + return err + } else { + scheduledJobs[i] = job.WithQueued(false).WithNewRun(node.Executor, node.Id, node.Name) + } + } + if err := s.jobDb.Upsert(txn, preemptedJobs); err != nil { + return err + } + if err := s.jobDb.Upsert(txn, scheduledJobs); err != nil { + return err + } + + // Update allocation. + s.allocationByPoolAndQueueAndPriorityClass[pool.Name] = sctx.AllocatedByQueueAndPriority() + + // Generate eventSequences. + // TODO: Add time taken to run the scheduler to s.time. + eventSequences, err = scheduler.AppendEventSequencesFromPreemptedJobs(eventSequences, preemptedJobs, s.time) + if err != nil { + return err + } + eventSequences, err = scheduler.AppendEventSequencesFromScheduledJobs(eventSequences, scheduledJobs, s.time) + if err != nil { + return err + } + } + } + txn.Commit() + + // Publish simulator events. + for _, eventSequence := range eventSequences { + s.pushEventSequence(eventSequence) + } + + // Schedule the next run of the scheduler, unless there are no more active jobTemplates. + // TODO: Make timeout configurable. + if len(s.activeJobTemplatesById) > 0 { + s.pushScheduleEvent(s.time.Add(10 * time.Second)) + } + return nil +} + +// TODO: Write events to disk unless they should be discarded. 
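+// --- Usage sketch (not part of this diff) ---
+// End-to-end driving of the simulator: load a test case and scheduling config, run the
+// event loop in a goroutine, and consume the simulated event sequences from C() until
+// the channel is closed. Assumes this compiles within this package; the file paths are
+// placeholders and the config must define the priority classes the test case references.
+//
+//	package simulator
+//
+//	import "fmt"
+//
+//	func runSimulatorSketch() error {
+//		testCase, err := TestCaseFromFilePath("testdata/basic.yaml")
+//		if err != nil {
+//			return err
+//		}
+//		schedulingConfig, err := SchedulingConfigFromFilePath("testdata/schedulingConfig.yaml")
+//		if err != nil {
+//			return err
+//		}
+//		s, err := NewSimulator(testCase, schedulingConfig)
+//		if err != nil {
+//			return err
+//		}
+//
+//		// Run pops simulator events in time order and closes s.C() once all job
+//		// templates have completed, so ranging over C() terminates.
+//		errCh := make(chan error, 1)
+//		go func() { errCh <- s.Run() }()
+//		for eventSequence := range s.C() {
+//			fmt.Printf("queue %s, jobSet %s: %d events\n",
+//				eventSequence.Queue, eventSequence.JobSetName, len(eventSequence.Events))
+//		}
+//		return <-errCh
+//	}
+// --- end sketch ---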
+func (s *Simulator) handleEventSequence(es *armadaevents.EventSequence) error { + txn := s.jobDb.WriteTxn() + defer txn.Abort() + eventsToPublish := make([]*armadaevents.EventSequence_Event, 0, len(es.Events)) + for _, event := range es.Events { + var ok bool + var err error = nil + switch eventType := event.GetEvent().(type) { + case *armadaevents.EventSequence_Event_SubmitJob: + ok, err = s.handleSubmitJob(txn, event.GetSubmitJob(), *event.Created, es) + case *armadaevents.EventSequence_Event_JobRunLeased: + ok, err = s.handleJobRunLeased(txn, event.GetJobRunLeased()) + case *armadaevents.EventSequence_Event_JobSucceeded: + ok, err = s.handleJobSucceeded(txn, event.GetJobSucceeded()) + case *armadaevents.EventSequence_Event_JobRunPreempted: + ok, err = s.handleJobRunPreempted(txn, event.GetJobRunPreempted()) + case *armadaevents.EventSequence_Event_ReprioritisedJob, + *armadaevents.EventSequence_Event_JobDuplicateDetected, + *armadaevents.EventSequence_Event_ResourceUtilisation, + *armadaevents.EventSequence_Event_StandaloneIngressInfo, + *armadaevents.EventSequence_Event_JobRunAssigned, + *armadaevents.EventSequence_Event_JobRunRunning, + *armadaevents.EventSequence_Event_JobRunSucceeded, + *armadaevents.EventSequence_Event_JobRunErrors, + *armadaevents.EventSequence_Event_ReprioritiseJob, + *armadaevents.EventSequence_Event_ReprioritiseJobSet, + *armadaevents.EventSequence_Event_CancelledJob, + *armadaevents.EventSequence_Event_JobRequeued, + *armadaevents.EventSequence_Event_PartitionMarker, + *armadaevents.EventSequence_Event_JobErrors, + *armadaevents.EventSequence_Event_CancelJob, + *armadaevents.EventSequence_Event_CancelJobSet: + // These events can be safely ignored. + log.Debugf("Ignoring event type %T", event) + default: + // This is an event type we haven't consider; log a warning. + log.Warnf("Ignoring unknown event type %T", eventType) + } + if err != nil { + return err + } + if ok { + eventsToPublish = append(eventsToPublish, event) + } + } + txn.Commit() + es.Events = eventsToPublish + if len(es.Events) > 0 { + s.c <- es + } + return nil +} + +func (s *Simulator) handleSubmitJob(txn *jobdb.Txn, e *armadaevents.SubmitJob, time time.Time, eventSequence *armadaevents.EventSequence) (bool, error) { + schedulingInfo, err := scheduleringester.SchedulingInfoFromSubmitJob(e, time, s.schedulingConfig.Preemption.PriorityClasses) + if err != nil { + return false, err + } + job := jobdb.NewJob( + armadaevents.UlidFromProtoUuid(e.JobId).String(), + eventSequence.JobSetName, + eventSequence.Queue, + e.Priority, + schedulingInfo, + true, + 0, + false, + false, + false, + time.UnixNano(), + ) + if err := s.jobDb.Upsert(txn, []*jobdb.Job{job}); err != nil { + return false, err + } + return true, nil +} + +func (s *Simulator) handleJobRunLeased(txn *jobdb.Txn, e *armadaevents.JobRunLeased) (bool, error) { + jobId := armadaevents.UlidFromProtoUuid(e.JobId).String() + job := s.jobDb.GetById(txn, jobId) + // TODO: Randomise runtime. 
+ jobTemplate := s.jobTemplateByJobId[jobId] + if jobTemplate == nil { + return false, errors.Errorf("no jobTemplate associated with job %s", jobId) + } + jobSuccessTime := s.time.Add(time.Duration(jobTemplate.RuntimeMean) * time.Second) + s.pushEventSequence( + &armadaevents.EventSequence{ + Queue: job.Queue(), + JobSetName: job.Jobset(), + Events: []*armadaevents.EventSequence_Event{ + { + Created: &jobSuccessTime, + Event: &armadaevents.EventSequence_Event_JobSucceeded{ + JobSucceeded: &armadaevents.JobSucceeded{ + JobId: e.JobId, + }, + }, + }, + }, + }, + ) + return true, nil +} + +func (s *Simulator) handleJobSucceeded(txn *jobdb.Txn, e *armadaevents.JobSucceeded) (bool, error) { + jobId := armadaevents.UlidFromProtoUuid(e.JobId).String() + job := s.jobDb.GetById(txn, jobId) + if job == nil || job.InTerminalState() { + // Job already terminated; nothing more to do. + return false, nil + } + if err := s.jobDb.BatchDelete(txn, []string{jobId}); err != nil { + return false, err + } + + // Subtract the allocation of this job from the queue allocation. + run := job.LatestRun() + pool := s.poolByNodeId[run.NodeId()] + s.allocationByPoolAndQueueAndPriorityClass[pool][job.Queue()].SubV1ResourceList(job.GetPriorityClassName(), job.GetResourceRequirements().Requests) + + // Unbind the job from the node on which it was scheduled. + if err := s.unbindRunningJob(job); err != nil { + return false, errors.WithMessagef(err, "failed to unbind job %s", job.Id()) + } + + // Increase the successful job count for this jobTemplate. + // If all jobs created from this template have succeeded, update dependent templates + // and submit any templates for which this was the last dependency. + jobTemplate := s.jobTemplateByJobId[job.GetId()] + jobTemplate.NumberSuccessful++ + if jobTemplate.Number == jobTemplate.NumberSuccessful { + delete(s.activeJobTemplatesById, jobTemplate.Id) + for _, dependentJobTemplate := range s.jobTemplatesByDependencyIds[jobTemplate.Id] { + i := slices.Index(dependentJobTemplate.Dependencies, jobTemplate.Id) + dependentJobTemplate.Dependencies = slices.Delete(dependentJobTemplate.Dependencies, i, i+1) + if len(dependentJobTemplate.Dependencies) > 0 { + continue + } + eventSequence := &armadaevents.EventSequence{ + Queue: dependentJobTemplate.Queue, + JobSetName: dependentJobTemplate.JobSet, + } + for k := 0; k < int(dependentJobTemplate.Number); k++ { + jobId := util.ULID() + eventSequence.Events = append( + eventSequence.Events, + &armadaevents.EventSequence_Event{ + Created: pointer(maxTime(s.time, dependentJobTemplate.MinSubmitTime)), + Event: &armadaevents.EventSequence_Event_SubmitJob{ + SubmitJob: submitJobFromJobTemplate(jobId, dependentJobTemplate), + }, + }, + ) + s.jobTemplateByJobId[jobId.String()] = dependentJobTemplate + } + if len(eventSequence.Events) > 0 { + s.pushEventSequence(eventSequence) + } + } + delete(s.jobTemplatesByDependencyIds, jobTemplate.Id) + } + return true, nil +} + +func (s *Simulator) unbindRunningJob(job *jobdb.Job) error { + if job.InTerminalState() { + return errors.Errorf("job %s has terminated", job.Id()) + } + run := job.LatestRun() + if run == nil { + return errors.Errorf("job %s has no runs associated with it", job.Id()) + } + if run.Executor() == "" { + return errors.Errorf("empty executor for run %s of job %s", run.Id(), job.Id()) + } + if run.NodeId() == "" { + return errors.Errorf("empty nodeId for run %s of job %s", run.Id(), job.Id()) + } + nodeDb := s.nodeDbByExecutorName[run.Executor()] + node, err := nodeDb.GetNode(run.NodeId()) + 
if err != nil { + return err + } else if node == nil { + return errors.Errorf("node %s not found", run.NodeId()) + } + node, err = nodedb.UnbindJobFromNode(s.schedulingConfig.Preemption.PriorityClasses, job, node) + if err != nil { + return err + } + if err := nodeDb.Upsert(node); err != nil { + return err + } + return nil +} + +func (s *Simulator) handleJobRunPreempted(txn *jobdb.Txn, e *armadaevents.JobRunPreempted) (bool, error) { + jobId := armadaevents.UlidFromProtoUuid(e.PreemptedJobId).String() + job := s.jobDb.GetById(txn, jobId) + + // Submit a retry for this job. + jobTemplate := s.jobTemplateByJobId[job.GetId()] + retryJobId := util.ULID() + s.pushEventSequence( + &armadaevents.EventSequence{ + Queue: job.Queue(), + JobSetName: job.Jobset(), + Events: []*armadaevents.EventSequence_Event{ + { + Created: &s.time, + Event: &armadaevents.EventSequence_Event_SubmitJob{ + SubmitJob: submitJobFromJobTemplate(retryJobId, jobTemplate), + }, + }, + }, + }, + ) + s.jobTemplateByJobId[retryJobId.String()] = jobTemplate + return true, nil +} + +// func (a *App) TestPattern(ctx context.Context, pattern string) (*TestSuiteReport, error) { +// testSpecs, err := TestSpecsFromPattern(pattern) +// if err != nil { +// return nil, err +// } +// return a.RunTests(ctx, testSpecs) +// } + +func SchedulingConfigsFromPattern(pattern string) ([]configuration.SchedulingConfig, error) { + filePaths, err := zglob.Glob(pattern) + if err != nil { + return nil, errors.WithStack(err) + } + return SchedulingConfigsFromFilePaths(filePaths) +} + +func SchedulingConfigsFromFilePaths(filePaths []string) ([]configuration.SchedulingConfig, error) { + rv := make([]configuration.SchedulingConfig, len(filePaths)) + for i, filePath := range filePaths { + config, err := SchedulingConfigFromFilePath(filePath) + if err != nil { + return nil, err + } + rv[i] = config + } + return rv, nil +} + +func SchedulingConfigFromFilePath(filePath string) (configuration.SchedulingConfig, error) { + config := configuration.SchedulingConfig{} + v := viper.New() + v.SetConfigFile(filePath) + if err := v.ReadInConfig(); err != nil { + return config, errors.WithStack(err) + } + if err := v.Unmarshal(&config, commonconfig.CustomHooks...); err != nil { + return config, errors.WithStack(err) + } + return config, nil +} + +func TestCasesFromPattern(pattern string) ([]*TestCase, error) { + filePaths, err := zglob.Glob(pattern) + if err != nil { + return nil, errors.WithStack(err) + } + return TestCasesFromFilePaths(filePaths) +} + +func TestCasesFromFilePaths(filePaths []string) ([]*TestCase, error) { + rv := make([]*TestCase, len(filePaths)) + for i, filePath := range filePaths { + testCase, err := TestCaseFromFilePath(filePath) + if err != nil { + return nil, err + } + rv[i] = testCase + } + return rv, nil +} + +func TestCaseFromFilePath(filePath string) (*TestCase, error) { + yamlBytes, err := os.ReadFile(filePath) + if err != nil { + return nil, errors.WithStack(err) + } + if len(yamlBytes) == 0 { + return nil, errors.Errorf("%s does not exist or is empty", filePath) + } + testCase, err := TestCaseFromBytes(yamlBytes) + if err != nil { + return nil, err + } + + // If no test name is provided, set it to be the filename. + if testCase.Name == "" { + fileName := filepath.Base(filePath) + fileName = strings.TrimSuffix(fileName, filepath.Ext(fileName)) + testCase.Name = fileName + } + + // Generate random ids for any job templates without an explicitly set id. 
+ for i, queue := range testCase.Queues { + for j, jobTemplate := range queue.JobTemplates { + if jobTemplate.Id == "" { + jobTemplate.Id = shortuuid.New() + } + queue.JobTemplates[j] = jobTemplate + } + testCase.Queues[i] = queue + } + + return testCase, nil +} + +// TestCaseFromBytes unmarshalls bytes into a TestCase. +func TestCaseFromBytes(yamlBytes []byte) (*TestCase, error) { + var testCase TestCase + if err := yaml.NewYAMLOrJSONDecoder(bytes.NewReader(yamlBytes), 128).Decode(&testCase); err != nil { + return nil, errors.WithStack(err) + } + return &testCase, nil +} + +func maxTime(a, b time.Time) time.Time { + if a.Before(b) { + return b + } + return a +} + +func pointer[T any](t T) *T { + return &t +} diff --git a/internal/scheduler/simulator/simulator.pb.go b/internal/scheduler/simulator/simulator.pb.go new file mode 100644 index 00000000000..fdf830cd63c --- /dev/null +++ b/internal/scheduler/simulator/simulator.pb.go @@ -0,0 +1,2546 @@ +// Code generated by protoc-gen-gogo. DO NOT EDIT. +// source: internal/scheduler/simulator/simulator.proto + +package simulator + +import ( + encoding_binary "encoding/binary" + fmt "fmt" + schedulerobjects "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" + _ "github.com/gogo/protobuf/gogoproto" + proto "github.com/gogo/protobuf/proto" + _ "github.com/gogo/protobuf/types" + github_com_gogo_protobuf_types "github.com/gogo/protobuf/types" + io "io" + v1 "k8s.io/api/core/v1" + math "math" + math_bits "math/bits" + time "time" +) + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf +var _ = time.Kitchen + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. +const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package + +// TODO: +// Runtime family. +// Workflow manager delay. +// Job pending delay. 
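+// --- Illustrative sketch (not part of this diff) ---
+// A minimal TestCase built directly from the generated types below, as an alternative to
+// loading one from YAML via TestCaseFromBytes. Resource names and quantities are
+// arbitrary; the priority class name and the layout of schedulerobjects.ResourceList
+// (a Resources map) are assumptions made for the example and must match the scheduling
+// config actually in use.
+//
+//	package simulator
+//
+//	import (
+//		v1 "k8s.io/api/core/v1"
+//		"k8s.io/apimachinery/pkg/api/resource"
+//
+//		schedulerobjects "github.com/armadaproject/armada/internal/scheduler/schedulerobjects"
+//	)
+//
+//	func minimalTestCase() *TestCase {
+//		nodeResources := schedulerobjects.ResourceList{
+//			Resources: map[string]resource.Quantity{
+//				"cpu":    resource.MustParse("32"),
+//				"memory": resource.MustParse("256Gi"),
+//			},
+//		}
+//		return &TestCase{
+//			Name: "one-pool-one-queue",
+//			Pools: []*Pool{{
+//				Name: "cpu",
+//				ExecutorGroups: []*ExecutorGroup{{
+//					Executors: []*Executor{{
+//						// Two identical 32-core nodes on a single (auto-named) executor.
+//						NodeTemplates: []*NodeTemplate{{
+//							Number:         2,
+//							TotalResources: nodeResources,
+//						}},
+//					}},
+//				}},
+//			}},
+//			Queues: []Queue{{
+//				Name:   "queue-a",
+//				Weight: 1,
+//				JobTemplates: []*JobTemplate{{
+//					Number:            10,
+//					JobSet:            "job-set-1",
+//					PriorityClassName: "armada-default",
+//					RuntimeMean:       60, // seconds
+//					Requirements: schedulerobjects.PodRequirements{
+//						ResourceRequirements: v1.ResourceRequirements{
+//							Requests: v1.ResourceList{
+//								"cpu":    resource.MustParse("1"),
+//								"memory": resource.MustParse("4Gi"),
+//							},
+//						},
+//					},
+//				}},
+//			}},
+//		}
+//	}
+// --- end sketch ---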
+type TestCase struct { + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + RandomSeed int64 `protobuf:"varint,2,opt,name=random_seed,json=randomSeed,proto3" json:"randomSeed,omitempty"` + Pools []*Pool `protobuf:"bytes,3,rep,name=pools,proto3" json:"pools,omitempty"` + Queues []Queue `protobuf:"bytes,4,rep,name=queues,proto3" json:"queues"` +} + +func (m *TestCase) Reset() { *m = TestCase{} } +func (m *TestCase) String() string { return proto.CompactTextString(m) } +func (*TestCase) ProtoMessage() {} +func (*TestCase) Descriptor() ([]byte, []int) { + return fileDescriptor_63baccdfe9127510, []int{0} +} +func (m *TestCase) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *TestCase) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_TestCase.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *TestCase) XXX_Merge(src proto.Message) { + xxx_messageInfo_TestCase.Merge(m, src) +} +func (m *TestCase) XXX_Size() int { + return m.Size() +} +func (m *TestCase) XXX_DiscardUnknown() { + xxx_messageInfo_TestCase.DiscardUnknown(m) +} + +var xxx_messageInfo_TestCase proto.InternalMessageInfo + +func (m *TestCase) GetName() string { + if m != nil { + return m.Name + } + return "" +} + +func (m *TestCase) GetRandomSeed() int64 { + if m != nil { + return m.RandomSeed + } + return 0 +} + +func (m *TestCase) GetPools() []*Pool { + if m != nil { + return m.Pools + } + return nil +} + +func (m *TestCase) GetQueues() []Queue { + if m != nil { + return m.Queues + } + return nil +} + +type Pool struct { + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + ExecutorGroups []*ExecutorGroup `protobuf:"bytes,2,rep,name=executor_groups,json=executorGroups,proto3" json:"executorGroups,omitempty"` +} + +func (m *Pool) Reset() { *m = Pool{} } +func (m *Pool) String() string { return proto.CompactTextString(m) } +func (*Pool) ProtoMessage() {} +func (*Pool) Descriptor() ([]byte, []int) { + return fileDescriptor_63baccdfe9127510, []int{1} +} +func (m *Pool) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *Pool) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_Pool.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *Pool) XXX_Merge(src proto.Message) { + xxx_messageInfo_Pool.Merge(m, src) +} +func (m *Pool) XXX_Size() int { + return m.Size() +} +func (m *Pool) XXX_DiscardUnknown() { + xxx_messageInfo_Pool.DiscardUnknown(m) +} + +var xxx_messageInfo_Pool proto.InternalMessageInfo + +func (m *Pool) GetName() string { + if m != nil { + return m.Name + } + return "" +} + +func (m *Pool) GetExecutorGroups() []*ExecutorGroup { + if m != nil { + return m.ExecutorGroups + } + return nil +} + +type ExecutorGroup struct { + Executors []*Executor `protobuf:"bytes,1,rep,name=executors,proto3" json:"executors,omitempty"` +} + +func (m *ExecutorGroup) Reset() { *m = ExecutorGroup{} } +func (m *ExecutorGroup) String() string { return proto.CompactTextString(m) } +func (*ExecutorGroup) ProtoMessage() {} +func (*ExecutorGroup) Descriptor() ([]byte, []int) { + return fileDescriptor_63baccdfe9127510, []int{2} +} +func (m *ExecutorGroup) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m 
*ExecutorGroup) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_ExecutorGroup.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *ExecutorGroup) XXX_Merge(src proto.Message) { + xxx_messageInfo_ExecutorGroup.Merge(m, src) +} +func (m *ExecutorGroup) XXX_Size() int { + return m.Size() +} +func (m *ExecutorGroup) XXX_DiscardUnknown() { + xxx_messageInfo_ExecutorGroup.DiscardUnknown(m) +} + +var xxx_messageInfo_ExecutorGroup proto.InternalMessageInfo + +func (m *ExecutorGroup) GetExecutors() []*Executor { + if m != nil { + return m.Executors + } + return nil +} + +type Executor struct { + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + NodeTemplates []*NodeTemplate `protobuf:"bytes,2,rep,name=node_templates,json=nodeTemplates,proto3" json:"nodeTemplates,omitempty"` +} + +func (m *Executor) Reset() { *m = Executor{} } +func (m *Executor) String() string { return proto.CompactTextString(m) } +func (*Executor) ProtoMessage() {} +func (*Executor) Descriptor() ([]byte, []int) { + return fileDescriptor_63baccdfe9127510, []int{3} +} +func (m *Executor) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *Executor) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_Executor.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *Executor) XXX_Merge(src proto.Message) { + xxx_messageInfo_Executor.Merge(m, src) +} +func (m *Executor) XXX_Size() int { + return m.Size() +} +func (m *Executor) XXX_DiscardUnknown() { + xxx_messageInfo_Executor.DiscardUnknown(m) +} + +var xxx_messageInfo_Executor proto.InternalMessageInfo + +func (m *Executor) GetName() string { + if m != nil { + return m.Name + } + return "" +} + +func (m *Executor) GetNodeTemplates() []*NodeTemplate { + if m != nil { + return m.NodeTemplates + } + return nil +} + +type NodeTemplate struct { + Number int64 `protobuf:"varint,1,opt,name=number,proto3" json:"number,omitempty"` + Taints []v1.Taint `protobuf:"bytes,2,rep,name=taints,proto3" json:"taints"` + Labels map[string]string `protobuf:"bytes,3,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + TotalResources schedulerobjects.ResourceList `protobuf:"bytes,4,opt,name=total_resources,json=totalResources,proto3" json:"totalResources"` +} + +func (m *NodeTemplate) Reset() { *m = NodeTemplate{} } +func (m *NodeTemplate) String() string { return proto.CompactTextString(m) } +func (*NodeTemplate) ProtoMessage() {} +func (*NodeTemplate) Descriptor() ([]byte, []int) { + return fileDescriptor_63baccdfe9127510, []int{4} +} +func (m *NodeTemplate) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *NodeTemplate) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_NodeTemplate.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *NodeTemplate) XXX_Merge(src proto.Message) { + xxx_messageInfo_NodeTemplate.Merge(m, src) +} +func (m *NodeTemplate) XXX_Size() int { + return m.Size() +} +func (m *NodeTemplate) XXX_DiscardUnknown() { + 
xxx_messageInfo_NodeTemplate.DiscardUnknown(m) +} + +var xxx_messageInfo_NodeTemplate proto.InternalMessageInfo + +func (m *NodeTemplate) GetNumber() int64 { + if m != nil { + return m.Number + } + return 0 +} + +func (m *NodeTemplate) GetTaints() []v1.Taint { + if m != nil { + return m.Taints + } + return nil +} + +func (m *NodeTemplate) GetLabels() map[string]string { + if m != nil { + return m.Labels + } + return nil +} + +func (m *NodeTemplate) GetTotalResources() schedulerobjects.ResourceList { + if m != nil { + return m.TotalResources + } + return schedulerobjects.ResourceList{} +} + +type Queue struct { + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + Weight float64 `protobuf:"fixed64,2,opt,name=weight,proto3" json:"weight,omitempty"` + JobTemplates []*JobTemplate `protobuf:"bytes,3,rep,name=job_templates,json=jobTemplates,proto3" json:"jobTemplates,omitempty"` +} + +func (m *Queue) Reset() { *m = Queue{} } +func (m *Queue) String() string { return proto.CompactTextString(m) } +func (*Queue) ProtoMessage() {} +func (*Queue) Descriptor() ([]byte, []int) { + return fileDescriptor_63baccdfe9127510, []int{5} +} +func (m *Queue) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *Queue) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_Queue.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *Queue) XXX_Merge(src proto.Message) { + xxx_messageInfo_Queue.Merge(m, src) +} +func (m *Queue) XXX_Size() int { + return m.Size() +} +func (m *Queue) XXX_DiscardUnknown() { + xxx_messageInfo_Queue.DiscardUnknown(m) +} + +var xxx_messageInfo_Queue proto.InternalMessageInfo + +func (m *Queue) GetName() string { + if m != nil { + return m.Name + } + return "" +} + +func (m *Queue) GetWeight() float64 { + if m != nil { + return m.Weight + } + return 0 +} + +func (m *Queue) GetJobTemplates() []*JobTemplate { + if m != nil { + return m.JobTemplates + } + return nil +} + +type JobTemplate struct { + // Number of jobs to create from this template. + Number int64 `protobuf:"varint,1,opt,name=number,proto3" json:"number,omitempty"` + // Number of jobs created from this template that have succeeded. + // Maintained by the simulator. + NumberSuccessful int64 `protobuf:"varint,2,opt,name=numberSuccessful,proto3" json:"numberSuccessful,omitempty"` + // Queue to which this template belongs. Populated automatically. + Queue string `protobuf:"bytes,3,opt,name=queue,proto3" json:"queue,omitempty"` + // Unique id for this template. An id is generated if empty. + Id string `protobuf:"bytes,4,opt,name=id,proto3" json:"id,omitempty"` + JobSet string `protobuf:"bytes,5,opt,name=job_set,json=jobSet,proto3" json:"jobSet,omitempty"` + QueuePriority uint32 `protobuf:"varint,6,opt,name=queue_priority,json=queuePriority,proto3" json:"queuePriority,omitempty"` + PriorityClassName string `protobuf:"bytes,7,opt,name=priority_class_name,json=priorityClassName,proto3" json:"priorityClassName,omitempty"` + Requirements schedulerobjects.PodRequirements `protobuf:"bytes,8,opt,name=requirements,proto3" json:"requirements"` + // List of template ids that must be completed before this template is submitted. + Dependencies []string `protobuf:"bytes,9,rep,name=dependencies,proto3" json:"dependencies,omitempty"` + // Minimum time from which jobs are created from this template. 
+ MinSubmitTime time.Time `protobuf:"bytes,10,opt,name=min_submit_time,json=minSubmitTime,proto3,stdtime" json:"minSubmitTime"` + // Job runtime mean in seconds. + RuntimeMean int64 `protobuf:"varint,11,opt,name=runtime_mean,json=runtimeMean,proto3" json:"runtimeMean,omitempty"` + // Job runtime variance in seconds squared. + // If zero, runtime is deterministic. + RuntimeVariance int64 `protobuf:"varint,12,opt,name=runtime_variance,json=runtimeVariance,proto3" json:"runtimeVariance,omitempty"` +} + +func (m *JobTemplate) Reset() { *m = JobTemplate{} } +func (m *JobTemplate) String() string { return proto.CompactTextString(m) } +func (*JobTemplate) ProtoMessage() {} +func (*JobTemplate) Descriptor() ([]byte, []int) { + return fileDescriptor_63baccdfe9127510, []int{6} +} +func (m *JobTemplate) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *JobTemplate) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_JobTemplate.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *JobTemplate) XXX_Merge(src proto.Message) { + xxx_messageInfo_JobTemplate.Merge(m, src) +} +func (m *JobTemplate) XXX_Size() int { + return m.Size() +} +func (m *JobTemplate) XXX_DiscardUnknown() { + xxx_messageInfo_JobTemplate.DiscardUnknown(m) +} + +var xxx_messageInfo_JobTemplate proto.InternalMessageInfo + +func (m *JobTemplate) GetNumber() int64 { + if m != nil { + return m.Number + } + return 0 +} + +func (m *JobTemplate) GetNumberSuccessful() int64 { + if m != nil { + return m.NumberSuccessful + } + return 0 +} + +func (m *JobTemplate) GetQueue() string { + if m != nil { + return m.Queue + } + return "" +} + +func (m *JobTemplate) GetId() string { + if m != nil { + return m.Id + } + return "" +} + +func (m *JobTemplate) GetJobSet() string { + if m != nil { + return m.JobSet + } + return "" +} + +func (m *JobTemplate) GetQueuePriority() uint32 { + if m != nil { + return m.QueuePriority + } + return 0 +} + +func (m *JobTemplate) GetPriorityClassName() string { + if m != nil { + return m.PriorityClassName + } + return "" +} + +func (m *JobTemplate) GetRequirements() schedulerobjects.PodRequirements { + if m != nil { + return m.Requirements + } + return schedulerobjects.PodRequirements{} +} + +func (m *JobTemplate) GetDependencies() []string { + if m != nil { + return m.Dependencies + } + return nil +} + +func (m *JobTemplate) GetMinSubmitTime() time.Time { + if m != nil { + return m.MinSubmitTime + } + return time.Time{} +} + +func (m *JobTemplate) GetRuntimeMean() int64 { + if m != nil { + return m.RuntimeMean + } + return 0 +} + +func (m *JobTemplate) GetRuntimeVariance() int64 { + if m != nil { + return m.RuntimeVariance + } + return 0 +} + +func init() { + proto.RegisterType((*TestCase)(nil), "simulator.TestCase") + proto.RegisterType((*Pool)(nil), "simulator.Pool") + proto.RegisterType((*ExecutorGroup)(nil), "simulator.ExecutorGroup") + proto.RegisterType((*Executor)(nil), "simulator.Executor") + proto.RegisterType((*NodeTemplate)(nil), "simulator.NodeTemplate") + proto.RegisterMapType((map[string]string)(nil), "simulator.NodeTemplate.LabelsEntry") + proto.RegisterType((*Queue)(nil), "simulator.Queue") + proto.RegisterType((*JobTemplate)(nil), "simulator.JobTemplate") +} + +func init() { + proto.RegisterFile("internal/scheduler/simulator/simulator.proto", fileDescriptor_63baccdfe9127510) +} + +var 
fileDescriptor_63baccdfe9127510 = []byte{ + // 1025 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x94, 0x56, 0x4f, 0x6f, 0xdb, 0x36, + 0x14, 0x8f, 0xe2, 0xc6, 0x8d, 0xe9, 0x3f, 0x49, 0x99, 0x2c, 0x51, 0xdc, 0xcd, 0xf2, 0x5c, 0x60, + 0xf0, 0x80, 0x54, 0x46, 0xbb, 0x4b, 0x16, 0x14, 0x03, 0xa6, 0xa2, 0xd8, 0x10, 0x74, 0x5d, 0xea, + 0x04, 0x1d, 0xb0, 0x60, 0x10, 0x68, 0xe9, 0xd5, 0x61, 0x22, 0x89, 0xaa, 0x48, 0x65, 0xcb, 0xa7, + 0x58, 0x4f, 0x3b, 0xed, 0x73, 0xec, 0x33, 0xf4, 0xd8, 0xe3, 0x4e, 0xda, 0x90, 0xdc, 0xf4, 0x29, + 0x06, 0x51, 0x54, 0x4c, 0x27, 0xdd, 0x90, 0x9d, 0x2c, 0xfe, 0xfe, 0x3c, 0x3e, 0x3e, 0xbf, 0x27, + 0x0a, 0x6d, 0xd3, 0x48, 0x40, 0x12, 0x91, 0x60, 0xc4, 0xbd, 0x63, 0xf0, 0xd3, 0x00, 0x92, 0x11, + 0xa7, 0x61, 0x1a, 0x10, 0xc1, 0xb4, 0x27, 0x3b, 0x4e, 0x98, 0x60, 0xb8, 0x71, 0x05, 0x74, 0xad, + 0x29, 0x63, 0xd3, 0x00, 0x46, 0x92, 0x98, 0xa4, 0xaf, 0x47, 0x82, 0x86, 0xc0, 0x05, 0x09, 0xe3, + 0x52, 0xdb, 0x1d, 0x9c, 0xee, 0x70, 0x9b, 0xb2, 0x11, 0x89, 0xe9, 0xc8, 0x63, 0x09, 0x8c, 0xce, + 0x1e, 0x8d, 0xa6, 0x10, 0x41, 0x42, 0x04, 0xf8, 0x4a, 0xf3, 0x70, 0x4a, 0xc5, 0x71, 0x3a, 0xb1, + 0x3d, 0x16, 0x8e, 0xa6, 0x6c, 0xca, 0x66, 0xd1, 0x8a, 0x95, 0x5c, 0xc8, 0x27, 0x25, 0xdf, 0xfd, + 0x50, 0xb2, 0xd5, 0x13, 0x9b, 0x9c, 0x80, 0x27, 0xf8, 0x0d, 0xa0, 0xf4, 0x0e, 0x2e, 0x0d, 0xb4, + 0x7c, 0x08, 0x5c, 0x3c, 0x25, 0x1c, 0xf0, 0x67, 0xe8, 0x4e, 0x44, 0x42, 0x30, 0x8d, 0xbe, 0x31, + 0x6c, 0x38, 0x38, 0xcf, 0xac, 0x4e, 0xb1, 0xde, 0x66, 0x21, 0x15, 0x10, 0xc6, 0xe2, 0x7c, 0x2c, + 0x79, 0xfc, 0x25, 0x6a, 0x26, 0x24, 0xf2, 0x59, 0xe8, 0x72, 0x00, 0xdf, 0x5c, 0xec, 0x1b, 0xc3, + 0x9a, 0x63, 0xe6, 0x99, 0xb5, 0x5e, 0xc2, 0x07, 0x00, 0xbe, 0x66, 0x42, 0x33, 0x14, 0xef, 0xa2, + 0xa5, 0x98, 0xb1, 0x80, 0x9b, 0xb5, 0x7e, 0x6d, 0xd8, 0x7c, 0xbc, 0x62, 0xcf, 0x6a, 0xb9, 0xcf, + 0x58, 0xe0, 0xac, 0xe5, 0x99, 0xb5, 0x22, 0x15, 0x5a, 0x80, 0xd2, 0x82, 0x77, 0x50, 0xfd, 0x4d, + 0x0a, 0x29, 0x70, 0xf3, 0x8e, 0x34, 0xaf, 0x6a, 0xe6, 0x97, 0x05, 0xe1, 0x74, 0xde, 0x65, 0xd6, + 0x42, 0x9e, 0x59, 0x4a, 0x37, 0x56, 0xbf, 0x83, 0x5f, 0x0d, 0x74, 0xa7, 0x08, 0x7f, 0xeb, 0x13, + 0xba, 0x68, 0x05, 0x7e, 0x01, 0x2f, 0x15, 0x2c, 0x71, 0xa7, 0x09, 0x4b, 0x63, 0x6e, 0x2e, 0xca, + 0x3d, 0x4d, 0x6d, 0xcf, 0x67, 0x4a, 0xf1, 0x4d, 0x21, 0x70, 0x3e, 0xce, 0x33, 0xcb, 0x04, 0x1d, + 0xd2, 0x8f, 0xd0, 0x99, 0x67, 0x06, 0x47, 0xa8, 0x3d, 0x67, 0xc7, 0x7b, 0xa8, 0x51, 0x49, 0xb8, + 0x69, 0xc8, 0xbd, 0xd6, 0x3e, 0xb0, 0x97, 0xb3, 0x99, 0x67, 0xd6, 0xda, 0x95, 0x52, 0xdb, 0x61, + 0x66, 0x2f, 0x8e, 0xbb, 0x5c, 0x19, 0x6e, 0x7d, 0xe4, 0x23, 0xd4, 0x89, 0x98, 0x0f, 0x6e, 0x01, + 0x06, 0x44, 0x40, 0x75, 0xe2, 0x4d, 0x2d, 0x8b, 0x17, 0xcc, 0x87, 0x43, 0xc5, 0x3b, 0xf7, 0xf3, + 0xcc, 0xda, 0x8c, 0x34, 0x44, 0xcf, 0xa6, 0x3d, 0x47, 0x0c, 0x7e, 0xab, 0xa1, 0x96, 0x6e, 0xc6, + 0xdb, 0xa8, 0x1e, 0xa5, 0xe1, 0x04, 0x12, 0x99, 0x57, 0xcd, 0x59, 0xcf, 0x33, 0x6b, 0xb5, 0x44, + 0xb4, 0x28, 0x4a, 0x83, 0xbf, 0x46, 0x75, 0x41, 0x68, 0x24, 0xaa, 0x9c, 0xb6, 0xec, 0x72, 0x8a, + 0x6c, 0x12, 0x53, 0xbb, 0x98, 0x22, 0xfb, 0xec, 0x91, 0x7d, 0x58, 0x28, 0x66, 0x2d, 0x50, 0x1a, + 0xc6, 0xea, 0x17, 0xbf, 0x44, 0xf5, 0x80, 0x4c, 0xe0, 0xaa, 0xf3, 0x1e, 0xfc, 0xcb, 0xb1, 0xec, + 0xe7, 0x52, 0xf5, 0x2c, 0x12, 0xc9, 0x79, 0x99, 0x55, 0x69, 0xd3, 0xb3, 0x2a, 0x91, 0xa2, 0x49, + 0x04, 0x13, 0x24, 0x70, 0x13, 0xe0, 0x2c, 0x4d, 0x3c, 0xd9, 0x98, 0xc6, 0xb0, 0xf9, 0xb8, 0x67, + 0xdf, 0x98, 0xb6, 0xb1, 0x92, 0x3c, 0xa7, 0x5c, 0x38, 0x1b, 0x2a, 0xc7, 0x8e, 0xb4, 0x57, 0x14, + 0x1f, 0x5f, 0x5b, 0x77, 0x09, 
0x6a, 0x6a, 0xd9, 0xe0, 0x07, 0xa8, 0x76, 0x0a, 0xe7, 0xea, 0x8f, + 0xbc, 0x97, 0x67, 0x56, 0xfb, 0x14, 0xce, 0xb5, 0xbc, 0x0a, 0x16, 0x7f, 0x8e, 0x96, 0xce, 0x48, + 0x90, 0x82, 0x9c, 0xca, 0x46, 0x39, 0x4f, 0x12, 0xd0, 0xe7, 0x49, 0x02, 0xbb, 0x8b, 0x3b, 0xc6, + 0xe0, 0x0f, 0x03, 0x2d, 0xc9, 0xd9, 0xb9, 0x75, 0x9f, 0x6c, 0xa3, 0xfa, 0xcf, 0x40, 0xa7, 0xc7, + 0x42, 0xee, 0x60, 0x94, 0x35, 0x2a, 0x11, 0xbd, 0x46, 0x25, 0x82, 0x7f, 0x40, 0xed, 0x13, 0x36, + 0xd1, 0x9a, 0xaa, 0xac, 0xfe, 0x86, 0x56, 0xfd, 0x3d, 0x36, 0xb9, 0xea, 0xa9, 0x6e, 0x9e, 0x59, + 0x1b, 0x27, 0x33, 0x40, 0x2f, 0x7b, 0x4b, 0xc7, 0x07, 0xbf, 0xd7, 0x51, 0x53, 0x73, 0xfe, 0xcf, + 0x86, 0xda, 0x43, 0x8a, 0x3b, 0x48, 0x3d, 0x0f, 0x38, 0x7f, 0x9d, 0x06, 0xea, 0x35, 0xd6, 0xcb, + 0x33, 0xab, 0x7b, 0x9d, 0xd3, 0x22, 0xdc, 0xf0, 0x15, 0x15, 0x97, 0xaf, 0x19, 0xb3, 0x36, 0xab, + 0xb8, 0x04, 0xf4, 0x8a, 0x4b, 0x00, 0xf7, 0xd1, 0x22, 0xf5, 0x65, 0x93, 0x34, 0x9c, 0xd5, 0x3c, + 0xb3, 0x5a, 0x54, 0x7f, 0x4f, 0x2e, 0x52, 0x1f, 0x3f, 0x44, 0x77, 0x8b, 0x7a, 0x71, 0x10, 0xe6, + 0x92, 0x94, 0xc9, 0x73, 0x9c, 0xb0, 0xc9, 0x01, 0xcc, 0x95, 0xb7, 0x44, 0xb0, 0x83, 0x3a, 0x32, + 0xb2, 0x1b, 0x27, 0x94, 0x25, 0x54, 0x9c, 0x9b, 0xf5, 0xbe, 0x31, 0x6c, 0x97, 0xb3, 0x29, 0x99, + 0x7d, 0x45, 0xe8, 0xb3, 0x39, 0x47, 0xe0, 0xef, 0xd1, 0x5a, 0xe5, 0x76, 0xbd, 0x80, 0x70, 0xee, + 0xca, 0x3e, 0xb8, 0x2b, 0xb7, 0xb7, 0xf2, 0xcc, 0xba, 0x5f, 0xd1, 0x4f, 0x0b, 0xf6, 0xc5, 0x7c, + 0x53, 0xdc, 0xbb, 0x41, 0xe2, 0x23, 0xd4, 0x4a, 0xe0, 0x4d, 0x4a, 0x13, 0x08, 0xa1, 0x98, 0xd9, + 0x65, 0x39, 0x14, 0x9f, 0xde, 0x1c, 0x8a, 0x7d, 0xe6, 0x8f, 0x35, 0xa1, 0xb3, 0xae, 0xe6, 0x62, + 0xce, 0x3e, 0x9e, 0x5b, 0xe1, 0xaf, 0x50, 0xcb, 0x87, 0x18, 0x22, 0x1f, 0x22, 0x8f, 0x02, 0x37, + 0x1b, 0xfd, 0xda, 0xb0, 0x51, 0xf6, 0x8d, 0x8e, 0xeb, 0x7d, 0xa3, 0xe3, 0xf8, 0x27, 0xb4, 0x12, + 0xd2, 0xc8, 0xe5, 0xe9, 0x24, 0xa4, 0xc2, 0x2d, 0x6e, 0x67, 0x13, 0xc9, 0xfc, 0xba, 0x76, 0x79, + 0x75, 0xdb, 0xd5, 0x65, 0x6b, 0x1f, 0x56, 0x57, 0xb7, 0xb3, 0xa5, 0x12, 0x6b, 0x87, 0x34, 0x3a, + 0x90, 0xce, 0x82, 0x7b, 0xfb, 0x97, 0x65, 0x8c, 0xe7, 0x21, 0xfc, 0x04, 0xb5, 0x92, 0x34, 0x2a, + 0xc2, 0xba, 0x21, 0x90, 0xc8, 0x6c, 0xca, 0xa6, 0xda, 0xca, 0x33, 0xeb, 0x23, 0x85, 0x7f, 0x07, + 0x24, 0xd2, 0xb2, 0x6b, 0x6a, 0x30, 0xfe, 0x16, 0xad, 0x56, 0xee, 0x33, 0x92, 0x50, 0x12, 0x79, + 0x60, 0xb6, 0x64, 0x84, 0x4f, 0xf2, 0xcc, 0xda, 0x52, 0xdc, 0x2b, 0x45, 0x69, 0x51, 0x56, 0xae, + 0x51, 0xce, 0xab, 0x77, 0x17, 0x3d, 0xe3, 0xfd, 0x45, 0xcf, 0xf8, 0xfb, 0xa2, 0x67, 0xbc, 0xbd, + 0xec, 0x2d, 0xbc, 0xbf, 0xec, 0x2d, 0xfc, 0x79, 0xd9, 0x5b, 0xf8, 0xf1, 0x89, 0xf6, 0x71, 0x41, + 0x92, 0x90, 0xf8, 0x24, 0x4e, 0x58, 0xf1, 0x7f, 0xa8, 0xd5, 0xe8, 0xbf, 0x3e, 0x7d, 0x26, 0x75, + 0x59, 0x9d, 0x2f, 0xfe, 0x09, 0x00, 0x00, 0xff, 0xff, 0x90, 0x45, 0x44, 0xb8, 0x21, 0x09, 0x00, + 0x00, +} + +func (m *TestCase) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *TestCase) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *TestCase) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if len(m.Queues) > 0 { + for iNdEx := len(m.Queues) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.Queues[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintSimulator(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 
0x22 + } + } + if len(m.Pools) > 0 { + for iNdEx := len(m.Pools) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.Pools[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintSimulator(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x1a + } + } + if m.RandomSeed != 0 { + i = encodeVarintSimulator(dAtA, i, uint64(m.RandomSeed)) + i-- + dAtA[i] = 0x10 + } + if len(m.Name) > 0 { + i -= len(m.Name) + copy(dAtA[i:], m.Name) + i = encodeVarintSimulator(dAtA, i, uint64(len(m.Name))) + i-- + dAtA[i] = 0xa + } + return len(dAtA) - i, nil +} + +func (m *Pool) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *Pool) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *Pool) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if len(m.ExecutorGroups) > 0 { + for iNdEx := len(m.ExecutorGroups) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.ExecutorGroups[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintSimulator(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x12 + } + } + if len(m.Name) > 0 { + i -= len(m.Name) + copy(dAtA[i:], m.Name) + i = encodeVarintSimulator(dAtA, i, uint64(len(m.Name))) + i-- + dAtA[i] = 0xa + } + return len(dAtA) - i, nil +} + +func (m *ExecutorGroup) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *ExecutorGroup) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *ExecutorGroup) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if len(m.Executors) > 0 { + for iNdEx := len(m.Executors) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.Executors[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintSimulator(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0xa + } + } + return len(dAtA) - i, nil +} + +func (m *Executor) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *Executor) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *Executor) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if len(m.NodeTemplates) > 0 { + for iNdEx := len(m.NodeTemplates) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.NodeTemplates[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintSimulator(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x12 + } + } + if len(m.Name) > 0 { + i -= len(m.Name) + copy(dAtA[i:], m.Name) + i = encodeVarintSimulator(dAtA, i, uint64(len(m.Name))) + i-- + dAtA[i] = 0xa + } + return len(dAtA) - i, nil +} + +func (m *NodeTemplate) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *NodeTemplate) MarshalTo(dAtA []byte) (int, 
error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *NodeTemplate) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + { + size, err := m.TotalResources.MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintSimulator(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x22 + if len(m.Labels) > 0 { + for k := range m.Labels { + v := m.Labels[k] + baseI := i + i -= len(v) + copy(dAtA[i:], v) + i = encodeVarintSimulator(dAtA, i, uint64(len(v))) + i-- + dAtA[i] = 0x12 + i -= len(k) + copy(dAtA[i:], k) + i = encodeVarintSimulator(dAtA, i, uint64(len(k))) + i-- + dAtA[i] = 0xa + i = encodeVarintSimulator(dAtA, i, uint64(baseI-i)) + i-- + dAtA[i] = 0x1a + } + } + if len(m.Taints) > 0 { + for iNdEx := len(m.Taints) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.Taints[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintSimulator(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x12 + } + } + if m.Number != 0 { + i = encodeVarintSimulator(dAtA, i, uint64(m.Number)) + i-- + dAtA[i] = 0x8 + } + return len(dAtA) - i, nil +} + +func (m *Queue) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *Queue) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *Queue) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if len(m.JobTemplates) > 0 { + for iNdEx := len(m.JobTemplates) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.JobTemplates[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintSimulator(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x1a + } + } + if m.Weight != 0 { + i -= 8 + encoding_binary.LittleEndian.PutUint64(dAtA[i:], uint64(math.Float64bits(float64(m.Weight)))) + i-- + dAtA[i] = 0x11 + } + if len(m.Name) > 0 { + i -= len(m.Name) + copy(dAtA[i:], m.Name) + i = encodeVarintSimulator(dAtA, i, uint64(len(m.Name))) + i-- + dAtA[i] = 0xa + } + return len(dAtA) - i, nil +} + +func (m *JobTemplate) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *JobTemplate) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *JobTemplate) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.RuntimeVariance != 0 { + i = encodeVarintSimulator(dAtA, i, uint64(m.RuntimeVariance)) + i-- + dAtA[i] = 0x60 + } + if m.RuntimeMean != 0 { + i = encodeVarintSimulator(dAtA, i, uint64(m.RuntimeMean)) + i-- + dAtA[i] = 0x58 + } + n2, err2 := github_com_gogo_protobuf_types.StdTimeMarshalTo(m.MinSubmitTime, dAtA[i-github_com_gogo_protobuf_types.SizeOfStdTime(m.MinSubmitTime):]) + if err2 != nil { + return 0, err2 + } + i -= n2 + i = encodeVarintSimulator(dAtA, i, uint64(n2)) + i-- + dAtA[i] = 0x52 + if len(m.Dependencies) > 0 { + for iNdEx := len(m.Dependencies) - 1; iNdEx >= 0; iNdEx-- { + i -= len(m.Dependencies[iNdEx]) + copy(dAtA[i:], m.Dependencies[iNdEx]) + i = encodeVarintSimulator(dAtA, i, uint64(len(m.Dependencies[iNdEx]))) + i-- + dAtA[i] = 0x4a + } + } + { + size, err := 
m.Requirements.MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintSimulator(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x42 + if len(m.PriorityClassName) > 0 { + i -= len(m.PriorityClassName) + copy(dAtA[i:], m.PriorityClassName) + i = encodeVarintSimulator(dAtA, i, uint64(len(m.PriorityClassName))) + i-- + dAtA[i] = 0x3a + } + if m.QueuePriority != 0 { + i = encodeVarintSimulator(dAtA, i, uint64(m.QueuePriority)) + i-- + dAtA[i] = 0x30 + } + if len(m.JobSet) > 0 { + i -= len(m.JobSet) + copy(dAtA[i:], m.JobSet) + i = encodeVarintSimulator(dAtA, i, uint64(len(m.JobSet))) + i-- + dAtA[i] = 0x2a + } + if len(m.Id) > 0 { + i -= len(m.Id) + copy(dAtA[i:], m.Id) + i = encodeVarintSimulator(dAtA, i, uint64(len(m.Id))) + i-- + dAtA[i] = 0x22 + } + if len(m.Queue) > 0 { + i -= len(m.Queue) + copy(dAtA[i:], m.Queue) + i = encodeVarintSimulator(dAtA, i, uint64(len(m.Queue))) + i-- + dAtA[i] = 0x1a + } + if m.NumberSuccessful != 0 { + i = encodeVarintSimulator(dAtA, i, uint64(m.NumberSuccessful)) + i-- + dAtA[i] = 0x10 + } + if m.Number != 0 { + i = encodeVarintSimulator(dAtA, i, uint64(m.Number)) + i-- + dAtA[i] = 0x8 + } + return len(dAtA) - i, nil +} + +func encodeVarintSimulator(dAtA []byte, offset int, v uint64) int { + offset -= sovSimulator(v) + base := offset + for v >= 1<<7 { + dAtA[offset] = uint8(v&0x7f | 0x80) + v >>= 7 + offset++ + } + dAtA[offset] = uint8(v) + return base +} +func (m *TestCase) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + l = len(m.Name) + if l > 0 { + n += 1 + l + sovSimulator(uint64(l)) + } + if m.RandomSeed != 0 { + n += 1 + sovSimulator(uint64(m.RandomSeed)) + } + if len(m.Pools) > 0 { + for _, e := range m.Pools { + l = e.Size() + n += 1 + l + sovSimulator(uint64(l)) + } + } + if len(m.Queues) > 0 { + for _, e := range m.Queues { + l = e.Size() + n += 1 + l + sovSimulator(uint64(l)) + } + } + return n +} + +func (m *Pool) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + l = len(m.Name) + if l > 0 { + n += 1 + l + sovSimulator(uint64(l)) + } + if len(m.ExecutorGroups) > 0 { + for _, e := range m.ExecutorGroups { + l = e.Size() + n += 1 + l + sovSimulator(uint64(l)) + } + } + return n +} + +func (m *ExecutorGroup) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if len(m.Executors) > 0 { + for _, e := range m.Executors { + l = e.Size() + n += 1 + l + sovSimulator(uint64(l)) + } + } + return n +} + +func (m *Executor) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + l = len(m.Name) + if l > 0 { + n += 1 + l + sovSimulator(uint64(l)) + } + if len(m.NodeTemplates) > 0 { + for _, e := range m.NodeTemplates { + l = e.Size() + n += 1 + l + sovSimulator(uint64(l)) + } + } + return n +} + +func (m *NodeTemplate) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if m.Number != 0 { + n += 1 + sovSimulator(uint64(m.Number)) + } + if len(m.Taints) > 0 { + for _, e := range m.Taints { + l = e.Size() + n += 1 + l + sovSimulator(uint64(l)) + } + } + if len(m.Labels) > 0 { + for k, v := range m.Labels { + _ = k + _ = v + mapEntrySize := 1 + len(k) + sovSimulator(uint64(len(k))) + 1 + len(v) + sovSimulator(uint64(len(v))) + n += mapEntrySize + 1 + sovSimulator(uint64(mapEntrySize)) + } + } + l = m.TotalResources.Size() + n += 1 + l + sovSimulator(uint64(l)) + return n +} + +func (m *Queue) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + l = len(m.Name) + if l > 0 { + n += 1 + l + sovSimulator(uint64(l)) + } 
+ if m.Weight != 0 { + n += 9 + } + if len(m.JobTemplates) > 0 { + for _, e := range m.JobTemplates { + l = e.Size() + n += 1 + l + sovSimulator(uint64(l)) + } + } + return n +} + +func (m *JobTemplate) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if m.Number != 0 { + n += 1 + sovSimulator(uint64(m.Number)) + } + if m.NumberSuccessful != 0 { + n += 1 + sovSimulator(uint64(m.NumberSuccessful)) + } + l = len(m.Queue) + if l > 0 { + n += 1 + l + sovSimulator(uint64(l)) + } + l = len(m.Id) + if l > 0 { + n += 1 + l + sovSimulator(uint64(l)) + } + l = len(m.JobSet) + if l > 0 { + n += 1 + l + sovSimulator(uint64(l)) + } + if m.QueuePriority != 0 { + n += 1 + sovSimulator(uint64(m.QueuePriority)) + } + l = len(m.PriorityClassName) + if l > 0 { + n += 1 + l + sovSimulator(uint64(l)) + } + l = m.Requirements.Size() + n += 1 + l + sovSimulator(uint64(l)) + if len(m.Dependencies) > 0 { + for _, s := range m.Dependencies { + l = len(s) + n += 1 + l + sovSimulator(uint64(l)) + } + } + l = github_com_gogo_protobuf_types.SizeOfStdTime(m.MinSubmitTime) + n += 1 + l + sovSimulator(uint64(l)) + if m.RuntimeMean != 0 { + n += 1 + sovSimulator(uint64(m.RuntimeMean)) + } + if m.RuntimeVariance != 0 { + n += 1 + sovSimulator(uint64(m.RuntimeVariance)) + } + return n +} + +func sovSimulator(x uint64) (n int) { + return (math_bits.Len64(x|1) + 6) / 7 +} +func sozSimulator(x uint64) (n int) { + return sovSimulator(uint64((x << 1) ^ uint64((int64(x) >> 63)))) +} +func (m *TestCase) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: TestCase: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: TestCase: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Name", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Name = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 2: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field RandomSeed", wireType) + } + m.RandomSeed = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.RandomSeed |= int64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 3: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Pools", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << 
shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Pools = append(m.Pools, &Pool{}) + if err := m.Pools[len(m.Pools)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + case 4: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Queues", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Queues = append(m.Queues, Queue{}) + if err := m.Queues[len(m.Queues)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipSimulator(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthSimulator + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *Pool) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: Pool: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: Pool: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Name", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Name = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field ExecutorGroups", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.ExecutorGroups = append(m.ExecutorGroups, &ExecutorGroup{}) + if err := m.ExecutorGroups[len(m.ExecutorGroups)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != 
nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipSimulator(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthSimulator + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *ExecutorGroup) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: ExecutorGroup: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: ExecutorGroup: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Executors", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Executors = append(m.Executors, &Executor{}) + if err := m.Executors[len(m.Executors)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipSimulator(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthSimulator + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *Executor) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: Executor: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: Executor: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Name", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Name = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 2: + if wireType != 2 { + 
return fmt.Errorf("proto: wrong wireType = %d for field NodeTemplates", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.NodeTemplates = append(m.NodeTemplates, &NodeTemplate{}) + if err := m.NodeTemplates[len(m.NodeTemplates)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipSimulator(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthSimulator + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *NodeTemplate) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: NodeTemplate: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: NodeTemplate: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Number", wireType) + } + m.Number = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Number |= int64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Taints", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Taints = append(m.Taints, v1.Taint{}) + if err := m.Taints[len(m.Taints)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + case 3: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Labels", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + if m.Labels == nil { + m.Labels = make(map[string]string) + } + var mapkey string + var mapvalue string + for 
iNdEx < postIndex { + entryPreIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + if fieldNum == 1 { + var stringLenmapkey uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLenmapkey |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLenmapkey := int(stringLenmapkey) + if intStringLenmapkey < 0 { + return ErrInvalidLengthSimulator + } + postStringIndexmapkey := iNdEx + intStringLenmapkey + if postStringIndexmapkey < 0 { + return ErrInvalidLengthSimulator + } + if postStringIndexmapkey > l { + return io.ErrUnexpectedEOF + } + mapkey = string(dAtA[iNdEx:postStringIndexmapkey]) + iNdEx = postStringIndexmapkey + } else if fieldNum == 2 { + var stringLenmapvalue uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLenmapvalue |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLenmapvalue := int(stringLenmapvalue) + if intStringLenmapvalue < 0 { + return ErrInvalidLengthSimulator + } + postStringIndexmapvalue := iNdEx + intStringLenmapvalue + if postStringIndexmapvalue < 0 { + return ErrInvalidLengthSimulator + } + if postStringIndexmapvalue > l { + return io.ErrUnexpectedEOF + } + mapvalue = string(dAtA[iNdEx:postStringIndexmapvalue]) + iNdEx = postStringIndexmapvalue + } else { + iNdEx = entryPreIndex + skippy, err := skipSimulator(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthSimulator + } + if (iNdEx + skippy) > postIndex { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + m.Labels[mapkey] = mapvalue + iNdEx = postIndex + case 4: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field TotalResources", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + if err := m.TotalResources.Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipSimulator(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthSimulator + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *Queue) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := 
int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: Queue: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: Queue: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Name", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Name = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 2: + if wireType != 1 { + return fmt.Errorf("proto: wrong wireType = %d for field Weight", wireType) + } + var v uint64 + if (iNdEx + 8) > l { + return io.ErrUnexpectedEOF + } + v = uint64(encoding_binary.LittleEndian.Uint64(dAtA[iNdEx:])) + iNdEx += 8 + m.Weight = float64(math.Float64frombits(v)) + case 3: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field JobTemplates", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.JobTemplates = append(m.JobTemplates, &JobTemplate{}) + if err := m.JobTemplates[len(m.JobTemplates)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipSimulator(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthSimulator + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *JobTemplate) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: JobTemplate: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: JobTemplate: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Number", wireType) + } + m.Number = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Number |= int64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 2: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field NumberSuccessful", wireType) + } + m.NumberSuccessful = 0 + for shift := 
uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.NumberSuccessful |= int64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 3: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Queue", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Queue = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 4: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Id", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Id = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 5: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field JobSet", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.JobSet = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 6: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field QueuePriority", wireType) + } + m.QueuePriority = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.QueuePriority |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 7: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field PriorityClassName", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.PriorityClassName = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 8: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Requirements", wireType) + } + var msglen int + for shift := 
uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + if err := m.Requirements.Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + case 9: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Dependencies", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Dependencies = append(m.Dependencies, string(dAtA[iNdEx:postIndex])) + iNdEx = postIndex + case 10: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field MinSubmitTime", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSimulator + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthSimulator + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + if err := github_com_gogo_protobuf_types.StdTimeUnmarshal(&m.MinSubmitTime, dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + case 11: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field RuntimeMean", wireType) + } + m.RuntimeMean = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.RuntimeMean |= int64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 12: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field RuntimeVariance", wireType) + } + m.RuntimeVariance = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSimulator + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.RuntimeVariance |= int64(b&0x7F) << shift + if b < 0x80 { + break + } + } + default: + iNdEx = preIndex + skippy, err := skipSimulator(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthSimulator + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func skipSimulator(dAtA []byte) (n int, err error) { + l := len(dAtA) + iNdEx := 0 + depth := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowSimulator + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + wireType := int(wire & 0x7) + switch 
wireType { + case 0: + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowSimulator + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + iNdEx++ + if dAtA[iNdEx-1] < 0x80 { + break + } + } + case 1: + iNdEx += 8 + case 2: + var length int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowSimulator + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + length |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if length < 0 { + return 0, ErrInvalidLengthSimulator + } + iNdEx += length + case 3: + depth++ + case 4: + if depth == 0 { + return 0, ErrUnexpectedEndOfGroupSimulator + } + depth-- + case 5: + iNdEx += 4 + default: + return 0, fmt.Errorf("proto: illegal wireType %d", wireType) + } + if iNdEx < 0 { + return 0, ErrInvalidLengthSimulator + } + if depth == 0 { + return iNdEx, nil + } + } + return 0, io.ErrUnexpectedEOF +} + +var ( + ErrInvalidLengthSimulator = fmt.Errorf("proto: negative length found during unmarshaling") + ErrIntOverflowSimulator = fmt.Errorf("proto: integer overflow") + ErrUnexpectedEndOfGroupSimulator = fmt.Errorf("proto: unexpected end of group") +) diff --git a/internal/scheduler/simulator/simulator.proto b/internal/scheduler/simulator/simulator.proto new file mode 100644 index 00000000000..dbc02fb1b6b --- /dev/null +++ b/internal/scheduler/simulator/simulator.proto @@ -0,0 +1,71 @@ +syntax = 'proto3'; +package simulator; +option go_package = "github.com/armadaproject/armada/internal/scheduler/simulator"; + +import "google/protobuf/timestamp.proto"; +import "k8s.io/api/core/v1/generated.proto"; +import "github.com/gogo/protobuf/gogoproto/gogo.proto"; +import "internal/scheduler/schedulerobjects/schedulerobjects.proto"; + +// TODO: +// Runtime family. +// Workflow manager delay. +// Job pending delay. +message TestCase { + string name = 1; + int64 random_seed = 2; + repeated Pool pools = 3; + repeated Queue queues = 4 [(gogoproto.nullable) = false]; +} + +message Pool { + string name = 1; + repeated ExecutorGroup executor_groups = 2; +} + +message ExecutorGroup { + repeated Executor executors = 1; +} + +message Executor { + string name = 1; + repeated NodeTemplate node_templates = 2; +} + +message NodeTemplate { + int64 number = 1; + repeated k8s.io.api.core.v1.Taint taints = 2 [(gogoproto.nullable) = false]; + map<string, string> labels = 3; + schedulerobjects.ResourceList total_resources = 4 [(gogoproto.nullable) = false]; +} + +message Queue { + string name = 1; + double weight = 2; + repeated JobTemplate job_templates = 3; +} + +message JobTemplate { + // Number of jobs to create from this template. + int64 number = 1; + // Number of jobs created from this template that have succeeded. + // Maintained by the simulator. + int64 numberSuccessful = 2; + // Queue to which this template belongs. Populated automatically. + string queue = 3; + // Unique id for this template. An id is generated if empty. + string id = 4; + string job_set = 5; + uint32 queue_priority = 6; + string priority_class_name = 7; + schedulerobjects.PodRequirements requirements = 8 [(gogoproto.nullable) = false]; + // List of template ids that must be completed before this template is submitted. + repeated string dependencies = 9; + // Minimum time from which jobs are created from this template. + google.protobuf.Timestamp min_submit_time = 10 [(gogoproto.nullable) = false, (gogoproto.stdtime) = true]; + // Job runtime mean in seconds. 
+ int64 runtime_mean = 11; + // Job runtime variance in seconds squared. + // If zero, runtime is deterministic. + int64 runtime_variance = 12; +} \ No newline at end of file diff --git a/internal/scheduler/simulator/simulator_test.go b/internal/scheduler/simulator/simulator_test.go new file mode 100644 index 00000000000..0d891d3cfff --- /dev/null +++ b/internal/scheduler/simulator/simulator_test.go @@ -0,0 +1,443 @@ +package simulator + +import ( + fmt "fmt" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + + "github.com/armadaproject/armada/internal/armada/configuration" + armadaslices "github.com/armadaproject/armada/internal/common/slices" + "github.com/armadaproject/armada/internal/common/util" + "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" + "github.com/armadaproject/armada/internal/scheduler/testfixtures" + "github.com/armadaproject/armada/pkg/armadaevents" +) + +func TestSimulator(t *testing.T) { + tests := map[string]struct { + testCase *TestCase + schedulingConfig configuration.SchedulingConfig + expectedEventSequences []*armadaevents.EventSequence + }{ + "Two jobs in parallel": { + testCase: &TestCase{ + Name: "basic", + Pools: []*Pool{Pool32Cpu("Pool", 1, 1, 2)}, + Queues: []Queue{ + WithJobTemplatesQueue( + Queue{Name: "A", Weight: 1}, + JobTemplate32Cpu(2, "foo", testfixtures.TestDefaultPriorityClass), + ), + }, + }, + schedulingConfig: testfixtures.TestSchedulingConfig(), + expectedEventSequences: []*armadaevents.EventSequence{ + {Queue: "A", JobSetName: "foo", Events: testfixtures.Repeat(SubmitJob(), 2)}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + }, + }, + "Two jobs in sequence": { + testCase: &TestCase{ + Name: "basic", + Pools: []*Pool{Pool32Cpu("Pool", 1, 1, 1)}, + Queues: []Queue{ + WithJobTemplatesQueue( + Queue{Name: "A", Weight: 1}, + JobTemplate32Cpu(2, "foo", testfixtures.TestDefaultPriorityClass), + ), + }, + }, + schedulingConfig: testfixtures.TestSchedulingConfig(), + expectedEventSequences: []*armadaevents.EventSequence{ + {Queue: "A", JobSetName: "foo", Events: testfixtures.Repeat(SubmitJob(), 2)}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + }, + }, + "10 jobs in sequence": { + testCase: &TestCase{ + Name: "basic", + Pools: []*Pool{Pool32Cpu("Pool", 1, 1, 1)}, + Queues: []Queue{ + WithJobTemplatesQueue( + Queue{Name: "A", Weight: 1}, + JobTemplate32Cpu(10, "foo", testfixtures.TestDefaultPriorityClass), + ), + }, + }, + schedulingConfig: testfixtures.TestSchedulingConfig(), + expectedEventSequences: append( + []*armadaevents.EventSequence{ + {Queue: "A", JobSetName: "foo", Events: testfixtures.Repeat(SubmitJob(), 10)}, + }, + armadaslices.Repeat( + 10, + &armadaevents.EventSequence{Queue: "A", JobSetName: "foo", Events: 
[]*armadaevents.EventSequence_Event{JobRunLeased()}}, + &armadaevents.EventSequence{Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + )..., + ), + }, + "JobTemplate dependencies": { + testCase: &TestCase{ + Name: "basic", + Pools: []*Pool{Pool32Cpu("Pool", 1, 1, 3)}, + Queues: []Queue{ + WithJobTemplatesQueue( + Queue{Name: "A", Weight: 1}, + WithIdJobTemplate( + JobTemplate32Cpu(2, "foo", testfixtures.TestDefaultPriorityClass), + "jobTemplate", + ), + WithDependenciesJobTemplate( + JobTemplate32Cpu(1, "foo", testfixtures.TestDefaultPriorityClass), + "jobTemplate", + ), + ), + }, + }, + schedulingConfig: testfixtures.TestSchedulingConfig(), + expectedEventSequences: []*armadaevents.EventSequence{ + {Queue: "A", JobSetName: "foo", Events: testfixtures.Repeat(SubmitJob(), 2)}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + {Queue: "A", JobSetName: "foo", Events: testfixtures.Repeat(SubmitJob(), 1)}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + }, + }, + "Preemption": { + testCase: &TestCase{ + Name: "basic", + Pools: []*Pool{Pool32Cpu("Pool", 1, 1, 2)}, + Queues: []Queue{ + WithJobTemplatesQueue( + Queue{Name: "A", Weight: 1}, + JobTemplate32Cpu(2, "foo", testfixtures.PriorityClass0), + ), + WithJobTemplatesQueue( + Queue{Name: "B", Weight: 1}, + WithMinSubmitTimeJobTemplate( + JobTemplate32Cpu(1, "bar", testfixtures.PriorityClass0), + time.Time{}.Add(30*time.Second), + ), + ), + }, + }, + schedulingConfig: testfixtures.TestSchedulingConfig(), + expectedEventSequences: []*armadaevents.EventSequence{ + {Queue: "A", JobSetName: "foo", Events: armadaslices.Repeat(2, SubmitJob())}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "B", JobSetName: "bar", Events: []*armadaevents.EventSequence_Event{SubmitJob()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunPreempted()}}, + {Queue: "B", JobSetName: "bar", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{SubmitJob()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "B", JobSetName: "bar", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + }, + }, + "Preemption cascade": { + testCase: &TestCase{ + Name: "test", + Pools: []*Pool{ + WithExecutorGroupsPool( + &Pool{Name: "Pool"}, + ExecutorGroup32Cpu(1, 1), + ExecutorGroup32Cpu(1, 1), + ExecutorGroup32Cpu(1, 1), + ), + }, + Queues: []Queue{ + WithJobTemplatesQueue( + Queue{Name: "B", Weight: 1}, + JobTemplate32Cpu(1, "foo", testfixtures.PriorityClass0), + ), + WithJobTemplatesQueue( + Queue{Name: "C", Weight: 1}, + JobTemplate32Cpu(2, "foo", 
testfixtures.PriorityClass0), + ), + WithJobTemplatesQueue( + Queue{Name: "A", Weight: 1}, + WithMinSubmitTimeJobTemplate( + JobTemplate32Cpu(1, "foo", testfixtures.PriorityClass0), + time.Time{}.Add(30*time.Second), + ), + ), + }, + }, + schedulingConfig: testfixtures.TestSchedulingConfig(), + expectedEventSequences: []*armadaevents.EventSequence{ + {Queue: "B", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{SubmitJob()}}, + {Queue: "C", JobSetName: "foo", Events: armadaslices.Repeat(2, SubmitJob())}, + {Queue: "B", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{SubmitJob()}}, + {Queue: "B", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunPreempted()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "B", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{SubmitJob()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunPreempted()}}, + {Queue: "B", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{SubmitJob()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + {Queue: "B", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + }, + }, + "No preemption cascade with unified scheduling": { + testCase: &TestCase{ + Name: "test", + Pools: []*Pool{ + WithExecutorGroupsPool( + &Pool{Name: "Pool"}, + ExecutorGroup32Cpu(3, 1), + ), + }, + Queues: []Queue{ + WithJobTemplatesQueue( + Queue{Name: "B", Weight: 1}, + JobTemplate32Cpu(1, "foo", testfixtures.PriorityClass0), + ), + WithJobTemplatesQueue( + Queue{Name: "C", Weight: 1}, + JobTemplate32Cpu(2, "foo", testfixtures.PriorityClass0), + ), + WithJobTemplatesQueue( + Queue{Name: "A", Weight: 1}, + WithMinSubmitTimeJobTemplate( + JobTemplate32Cpu(1, "foo", testfixtures.PriorityClass0), + time.Time{}.Add(30*time.Second), + ), + ), + }, + }, + schedulingConfig: testfixtures.TestSchedulingConfig(), + expectedEventSequences: []*armadaevents.EventSequence{ + {Queue: "B", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{SubmitJob()}}, + {Queue: "C", JobSetName: "foo", Events: armadaslices.Repeat(2, SubmitJob())}, + {Queue: "B", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{SubmitJob()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunPreempted()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "C", JobSetName: "foo", Events: 
[]*armadaevents.EventSequence_Event{SubmitJob()}}, + {Queue: "B", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobRunLeased()}}, + {Queue: "A", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + {Queue: "C", JobSetName: "foo", Events: []*armadaevents.EventSequence_Event{JobSucceeded()}}, + }, + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + s, err := NewSimulator(tc.testCase, tc.schedulingConfig) + require.NoError(t, err) + go func() { err = s.Run() }() + actualEventSequences := make([]*armadaevents.EventSequence, 0, len(tc.expectedEventSequences)) + for eventSequence := range s.C() { + t.Log(*eventSequence.Events[0].Created, eventSequenceSummary(eventSequence)) + actualEventSequences = append(actualEventSequences, eventSequence) + } + require.NoError(t, err) + t.Logf("Simulated time: %s", s.time.Sub(time.Time{})) + if tc.expectedEventSequences != nil { + require.Equal( + t, + util.Map(tc.expectedEventSequences, func(eventSequence *armadaevents.EventSequence) string { return eventSequenceSummary(eventSequence) }), + util.Map(actualEventSequences, func(eventSequence *armadaevents.EventSequence) string { return eventSequenceSummary(eventSequence) }), + "Expected:\n%s\nReceived:\n%s", + eventSequencesSummary(tc.expectedEventSequences), + eventSequencesSummary(actualEventSequences), + ) + } + }) + } +} + +func WithExecutorGroupsPool(pool *Pool, executorGroups ...*ExecutorGroup) *Pool { + pool.ExecutorGroups = append(pool.ExecutorGroups, executorGroups...) + return pool +} + +func WithExecutorsExecutorGroup(executorGroup *ExecutorGroup, executors ...*Executor) *ExecutorGroup { + executorGroup.Executors = append(executorGroup.Executors, executors...) + return executorGroup +} + +func WithNodeTemplatesExecutor(executor *Executor, nodeTemplates ...*NodeTemplate) *Executor { + executor.NodeTemplates = append(executor.NodeTemplates, nodeTemplates...) + return executor +} + +func Pool32Cpu(name string, numExecutorGroups, numExecutorsPerGroup, numNodesPerExecutor int64) *Pool { + executorGroups := make([]*ExecutorGroup, numExecutorGroups) + for i := 0; i < int(numExecutorGroups); i++ { + executorGroups[i] = ExecutorGroup32Cpu(numExecutorsPerGroup, numNodesPerExecutor) + } + return &Pool{ + Name: name, + ExecutorGroups: executorGroups, + } +} + +func ExecutorGroup32Cpu(numExecutors, numNodesPerExecutor int64) *ExecutorGroup { + executors := make([]*Executor, numExecutors) + for i := 0; i < int(numExecutors); i++ { + executors[i] = Executor32Cpu(numNodesPerExecutor) + } + return &ExecutorGroup{ + Executors: executors, + } +} + +func Executor32Cpu(numNodes int64) *Executor { + return &Executor{ + NodeTemplates: []*NodeTemplate{ + NodeTemplate32Cpu(numNodes), + }, + } +} + +func NodeTemplate32Cpu(n int64) *NodeTemplate { + return &NodeTemplate{ + Number: n, + TotalResources: schedulerobjects.ResourceList{ + Resources: map[string]resource.Quantity{ + "cpu": resource.MustParse("32"), + "memory": resource.MustParse("256Gi"), + }, + }, + } +} + +func WithJobTemplatesQueue(queue Queue, jobTemplate ...*JobTemplate) Queue { + queue.JobTemplates = append(queue.JobTemplates, jobTemplate...) 
+ return queue +} + +func WithIdJobTemplate(jobTemplate *JobTemplate, id string) *JobTemplate { + jobTemplate.Id = id + return jobTemplate +} + +func WithDependenciesJobTemplate(jobTemplate *JobTemplate, dependencyIds ...string) *JobTemplate { + jobTemplate.Dependencies = append(jobTemplate.Dependencies, dependencyIds...) + return jobTemplate +} + +func WithMinSubmitTimeJobTemplate(jobTemplate *JobTemplate, minSubmitTime time.Time) *JobTemplate { + jobTemplate.MinSubmitTime = minSubmitTime + return jobTemplate +} + +func JobTemplate32Cpu(n int64, jobSet, priorityClassName string) *JobTemplate { + return &JobTemplate{ + Number: n, + JobSet: jobSet, + PriorityClassName: priorityClassName, + Requirements: schedulerobjects.PodRequirements{ + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse("32"), + "memory": resource.MustParse("256Gi"), + }, + }, + }, + RuntimeMean: 60, + } +} + +func JobTemplate1Cpu(n int64, jobSet, priorityClassName string) *JobTemplate { + return &JobTemplate{ + Number: n, + JobSet: jobSet, + PriorityClassName: priorityClassName, + Requirements: schedulerobjects.PodRequirements{ + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse("1"), + "memory": resource.MustParse("8Gi"), + }, + }, + }, + RuntimeMean: 60, + } +} + +func SubmitJob() *armadaevents.EventSequence_Event { + return &armadaevents.EventSequence_Event{ + Event: &armadaevents.EventSequence_Event_SubmitJob{ + SubmitJob: &armadaevents.SubmitJob{}, + }, + } +} + +func JobRunLeased() *armadaevents.EventSequence_Event { + return &armadaevents.EventSequence_Event{ + Event: &armadaevents.EventSequence_Event_JobRunLeased{ + JobRunLeased: &armadaevents.JobRunLeased{}, + }, + } +} + +func JobRunPreempted() *armadaevents.EventSequence_Event { + return &armadaevents.EventSequence_Event{ + Event: &armadaevents.EventSequence_Event_JobRunPreempted{ + JobRunPreempted: &armadaevents.JobRunPreempted{}, + }, + } +} + +func JobSucceeded() *armadaevents.EventSequence_Event { + return &armadaevents.EventSequence_Event{ + Event: &armadaevents.EventSequence_Event_JobSucceeded{ + JobSucceeded: &armadaevents.JobSucceeded{}, + }, + } +} + +func eventSequencesSummary(eventSequences []*armadaevents.EventSequence) string { + var sb strings.Builder + for i, eventSequence := range eventSequences { + sb.WriteString(eventSequenceSummary(eventSequence)) + if i != len(eventSequences)-1 { + sb.WriteString("\n") + } + } + return sb.String() +} + +func eventSequenceSummary(eventSequence *armadaevents.EventSequence) string { + var sb strings.Builder + sb.WriteString(fmt.Sprintf("EventSequence{Queue: %s, JobSetName: %s, Events: [", eventSequence.Queue, eventSequence.JobSetName)) + for i, event := range eventSequence.Events { + sb.WriteString(eventSummary(event)) + if i != len(eventSequence.Events)-1 { + sb.WriteString(", ") + } + } + sb.WriteString("]}") + return sb.String() +} + +func eventSummary(event *armadaevents.EventSequence_Event) string { + return strings.ReplaceAll(fmt.Sprintf("%T", event.Event), "*armadaevents.EventSequence_Event_", "") +} diff --git a/internal/scheduler/simulator/testdata/diva-plat.yaml b/internal/scheduler/simulator/testdata/diva-plat.yaml new file mode 100644 index 00000000000..a4106287879 --- /dev/null +++ b/internal/scheduler/simulator/testdata/diva-plat.yaml @@ -0,0 +1,30 @@ +name: "DIVA-plat" +pools: + - name: "CPU" + executorGroups: + - executors: + - name: "Executor-CPU-1" + nodeTemplates: + - number: 1 + 
totalResources: + resources: + cpu: "1" + memory: "1Gi" + - name: "Executor-CPU-2" + nodeTemplates: + - number: 2 + totalResources: + resources: + cpu: "1" + memory: "1Gi" + - name: "GPU" + executorGroups: + - executors: + - name: "Executor-GPU" + nodeTemplates: + - number: 2 + totalResources: + resources: + cpu: "1" + memory: "1Gi" + ndivia.com/gpu: "1" \ No newline at end of file diff --git a/internal/scheduler/submitcheck.go b/internal/scheduler/submitcheck.go index 220a7103daa..6221e2611e9 100644 --- a/internal/scheduler/submitcheck.go +++ b/internal/scheduler/submitcheck.go @@ -3,7 +3,6 @@ package scheduler import ( "context" "fmt" - "os" "strings" "sync" "time" @@ -56,6 +55,7 @@ type SubmitChecker struct { mu sync.Mutex schedulingKeyGenerator *schedulerobjects.SchedulingKeyGenerator jobSchedulingResultsCache *lru.Cache + ExecutorUpdateFrequency time.Duration } func NewSubmitChecker( @@ -80,23 +80,14 @@ func NewSubmitChecker( clock: clock.RealClock{}, schedulingKeyGenerator: schedulerobjects.NewSchedulingKeyGenerator(), jobSchedulingResultsCache: jobSchedulingResultsCache, + ExecutorUpdateFrequency: schedulingConfig.ExecutorUpdateFrequency, } } func (srv *SubmitChecker) Run(ctx context.Context) error { srv.updateExecutors(ctx) - var ticker *time.Ticker - intervalStr, set := os.LookupEnv("EXECUTOR_UPDATE_INTERVAL") - if !set { - intervalStr = "1m" - } - - interval, err := time.ParseDuration(strings.TrimSpace(intervalStr)) - if err != nil { - return err - } - ticker = time.NewTicker(interval) + ticker := time.NewTicker(srv.ExecutorUpdateFrequency) for { select { case <-ctx.Done(): diff --git a/internal/scheduler/testfixtures/testfixtures.go b/internal/scheduler/testfixtures/testfixtures.go index bbd91d2e002..7c6e01f39c4 100644 --- a/internal/scheduler/testfixtures/testfixtures.go +++ b/internal/scheduler/testfixtures/testfixtures.go @@ -100,6 +100,7 @@ func TestSchedulingConfig() configuration.SchedulingConfig { DominantResourceFairnessResourcesToConsider: TestResourceNames, ExecutorTimeout: 15 * time.Minute, MaxUnacknowledgedJobsPerExecutor: math.MaxInt, + EnableNewPreemptionStrategy: true, } } @@ -233,6 +234,13 @@ func WithNodeSelectorPodReq(selector map[string]string, req *schedulerobjects.Po return req } +func WithPriorityJobs(priority uint32, jobs []*jobdb.Job) []*jobdb.Job { + for i, job := range jobs { + jobs[i] = job.WithPriority(priority) + } + return jobs +} + func WithNodeUniformityLabelAnnotationJobs(label string, jobs []*jobdb.Job) []*jobdb.Job { for _, job := range jobs { req := job.PodRequirements() diff --git a/internal/scheduleringester/instructions.go b/internal/scheduleringester/instructions.go index e44d580c7fb..429ab2d9112 100644 --- a/internal/scheduleringester/instructions.go +++ b/internal/scheduleringester/instructions.go @@ -7,6 +7,7 @@ import ( "github.com/gogo/protobuf/proto" "github.com/pkg/errors" log "github.com/sirupsen/logrus" + "golang.org/x/exp/maps" "golang.org/x/exp/slices" "github.com/armadaproject/armada/internal/common/compress" @@ -354,9 +355,13 @@ func (c *InstructionConverter) handlePartitionMarker(pm *armadaevents.PartitionM }}, nil } -// schedulingInfoFromSubmitJob returns a minimal representation of a job -// containing only the info needed by the scheduler. +// schedulingInfoFromSubmitJob returns a minimal representation of a job containing only the info needed by the scheduler. 
func (c *InstructionConverter) schedulingInfoFromSubmitJob(submitJob *armadaevents.SubmitJob, submitTime time.Time) (*schedulerobjects.JobSchedulingInfo, error) { + return SchedulingInfoFromSubmitJob(submitJob, submitTime, c.priorityClasses) +} + +// SchedulingInfoFromSubmitJob returns a minimal representation of a job containing only the info needed by the scheduler. +func SchedulingInfoFromSubmitJob(submitJob *armadaevents.SubmitJob, submitTime time.Time, priorityClasses map[string]types.PriorityClass) (*schedulerobjects.JobSchedulingInfo, error) { // Component common to all jobs. schedulingInfo := &schedulerobjects.JobSchedulingInfo{ Lifetime: submitJob.Lifetime, @@ -373,8 +378,16 @@ func (c *InstructionConverter) schedulingInfoFromSubmitJob(submitJob *armadaeven case *armadaevents.KubernetesMainObject_PodSpec: podSpec := object.PodSpec.PodSpec schedulingInfo.PriorityClassName = podSpec.PriorityClassName - podRequirements := adapters.PodRequirementsFromPodSpec(podSpec, c.priorityClasses) - podRequirements.Annotations = submitJob.ObjectMeta.Annotations + podRequirements := adapters.PodRequirementsFromPodSpec(podSpec, priorityClasses) + if submitJob.ObjectMeta != nil { + podRequirements.Annotations = maps.Clone(submitJob.ObjectMeta.Annotations) + } + if submitJob.MainObject.ObjectMeta != nil { + if podRequirements.Annotations == nil { + podRequirements.Annotations = make(map[string]string, len(submitJob.MainObject.ObjectMeta.Annotations)) + } + maps.Copy(podRequirements.Annotations, submitJob.MainObject.ObjectMeta.Annotations) + } schedulingInfo.ObjectRequirements = append( schedulingInfo.ObjectRequirements, &schedulerobjects.ObjectRequirements{ diff --git a/internal/scheduleringester/schedulerdb.go b/internal/scheduleringester/schedulerdb.go index 9b944836c2e..e1ce855504b 100644 --- a/internal/scheduleringester/schedulerdb.go +++ b/internal/scheduleringester/schedulerdb.go @@ -5,8 +5,8 @@ import ( "time" "github.com/google/uuid" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" "golang.org/x/exp/maps" @@ -47,7 +47,7 @@ func NewSchedulerDb( // This function locks the postgres table to avoid write conflicts; see acquireLock() for details. func (s *SchedulerDb) Store(ctx context.Context, instructions *DbOperationsWithMessageIds) error { return ingest.WithRetry(func() (bool, error) { - err := s.db.BeginTxFunc(ctx, pgx.TxOptions{ + err := pgx.BeginTxFunc(ctx, s.db, pgx.TxOptions{ IsoLevel: pgx.ReadCommitted, AccessMode: pgx.ReadWrite, DeferrableMode: pgx.Deferrable, diff --git a/internal/scheduleringester/schedulerdb_test.go b/internal/scheduleringester/schedulerdb_test.go index e390950aa28..8317e421aff 100644 --- a/internal/scheduleringester/schedulerdb_test.go +++ b/internal/scheduleringester/schedulerdb_test.go @@ -6,8 +6,8 @@ import ( "time" "github.com/google/uuid" - "github.com/jackc/pgx/v4" - "github.com/jackc/pgx/v4/pgxpool" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" "github.com/pkg/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -316,7 +316,7 @@ func assertOpSuccess(t *testing.T, schedulerDb *SchedulerDb, serials map[string] defer cancel() // Apply the op to the database. 
- err := schedulerDb.db.BeginTxFunc(ctx, pgx.TxOptions{ + err := pgx.BeginTxFunc(ctx, schedulerDb.db, pgx.TxOptions{ IsoLevel: pgx.ReadCommitted, AccessMode: pgx.ReadWrite, DeferrableMode: pgx.Deferrable, diff --git a/internal/testsuite/app.go b/internal/testsuite/app.go index 65629e07150..218c7269288 100644 --- a/internal/testsuite/app.go +++ b/internal/testsuite/app.go @@ -6,7 +6,6 @@ import ( "crypto/rand" "fmt" "io" - "io/ioutil" "os" "path/filepath" "strings" @@ -100,7 +99,7 @@ func TestSpecsFromFilePaths(filePaths []string) ([]*api.TestSpec, error) { func TestSpecFromFilePath(filePath string) (*api.TestSpec, error) { testSpec := &api.TestSpec{} - yamlBytes, err := ioutil.ReadFile(filePath) + yamlBytes, err := os.ReadFile(filePath) if err != nil { return nil, errors.WithStack(err) } diff --git a/magefiles/airflow.go b/magefiles/airflow.go index 7028b99a062..c7b51503572 100644 --- a/magefiles/airflow.go +++ b/magefiles/airflow.go @@ -2,6 +2,7 @@ package main import ( "fmt" + "os" "github.com/magefile/mage/sh" ) @@ -38,34 +39,61 @@ func Airflow(arg string) error { func startAirflow() error { fmt.Println("Starting airflow...") - err := sh.Run("mkdir", "-p", "localdev/airflow/opt") + err := sh.Run("mkdir", "-p", "developer/airflow/opt") if err != nil { return err } // Copy aramda python packages to be used within airflow docker image. - err = sh.Run("cp", "-r", "client/python", "localdev/airflow/opt/client") + err = sh.Run("cp", "-r", "client/python", "developer/airflow/opt/client") if err != nil { return err } - err = sh.Run("cp", "-r", "third_party/airflow", "localdev/airflow/opt/airflow") + err = sh.Run("cp", "-r", "third_party/airflow", "developer/airflow/opt/airflow") if err != nil { return err } // Arise - return dockerRun("compose", "--project-directory", "./localdev/airflow/", + return dockerRun("compose", "--project-directory", "./developer/airflow/", "up", "--build", "--always-recreate-deps", "-d") } func stopAirflow() error { fmt.Println("Stopping airflow...") - err := dockerRun("compose", "-f", "localdev/airflow/docker-compose.yaml", "down") + err := dockerRun("compose", "-f", "developer/airflow/docker-compose.yaml", "down") if err != nil { return err } - return sh.Run("rm", "-rf", "localdev/airflow/opt/") + return sh.Run("rm", "-rf", "developer/airflow/opt/") +} + +// AirflowOperator builds the Airflow Operator +func AirflowOperator() error { + fmt.Println("Building Airflow Operator...") + + err := os.RemoveAll("proto-airflow") + if err != nil { + return fmt.Errorf("failed to remove proto-airflow directory: %w", err) + } + + err = os.MkdirAll("proto-airflow", os.ModePerm) + if err != nil { + return fmt.Errorf("failed to create proto-airflow directory: %w", err) + } + + err = dockerRun("buildx", "build", "-o", "type=docker", "-t", "armada-airflow-operator-builder", "-f", "./build/airflow-operator/Dockerfile", ".") + if err != nil { + return fmt.Errorf("failed to build Airflow Operator: %w", err) + } + + err = dockerRun("run", "--rm", "-v", "${PWD}/proto-airflow:/proto-airflow", "-v", "${PWD}:/go/src/armada", "-w", "/go/src/armada", "armada-airflow-operator-builder", "./scripts/build-airflow-operator.sh") + if err != nil { + return fmt.Errorf("failed to run build-airflow-operator.sh script: %w", err) + } + + return nil } diff --git a/magefiles/cmd.go b/magefiles/cmd.go new file mode 100644 index 00000000000..968f1e65451 --- /dev/null +++ b/magefiles/cmd.go @@ -0,0 +1,153 @@ +package main + +import ( + "fmt" + "os" + "strings" + + "github.com/magefile/mage/sh" +) + +var ( + 
GOPATH string + DockerGopath string + Platform string + Host_arch string +) + +func dockerGoPath() { + var err error + GOPATH, err = sh.Output("go", "env", "GOPATH") + if err != nil || GOPATH == "" { + GOPATH = ".go" + } + DockerGopath = GOPATH +} + +func platformGet() error { + var err error + Platform, err = sh.Output("uname", "-s") + if err != nil { + return err + } + Host_arch, err = sh.Output("uname", "-m") + if err != nil { + return err + } + return nil +} + +func dockerRunAsUser() (string, error) { + if err := platformGet(); err != nil { + return "", fmt.Errorf("unable to get platform information: %v", err) + } + + DockerRunAsUser := os.Getenv("DOCKER_RUN_AS_USER") + if DockerRunAsUser == "" && Platform != "windows32" { + userId, err := sh.Output("id", "-u") + if err != nil { + return "", fmt.Errorf("unable to get user id: %v", err) + } + groupId, err := sh.Output("id", "-g") + if err != nil { + return "", fmt.Errorf("unable to get group id: %v", err) + } + DockerRunAsUser = userId + ":" + groupId + } + return DockerRunAsUser, nil +} + +func dockerNet() (string, error) { + platform, err := sh.Output("uname", "-s") + if err != nil { + return "", fmt.Errorf("unable to get platform information: %v", err) + } else if platform == "Darwin" { + return "", nil + } + return "--network=host", nil +} + +func dockerGopathDir() (string, error) { + DockerGopath, err := sh.Output("go", "env", "GOPATH") + if err != nil || DockerGopath == "" { + DockerGopath = ".go" + } + + DockerGopathToks := strings.Split(DockerGopath, ":") + if len(DockerGopathToks) == 0 { + return "", fmt.Errorf("unable to parse DockerGopath: %s", DockerGopath) + } + + return DockerGopathToks[0], nil +} + +func go_CMD() ([]string, error) { + dockerGoPath() + DOCKER_RUN_AS_USER, err := dockerRunAsUser() + if err != nil { + return nil, err + } + DOCKER_NET, err := dockerNet() + if err != nil { + return nil, err + } + DOCKER_GOPATH_DIR, err := dockerGopathDir() + if err != nil { + return nil, err + } + + return []string{ + "run", + "--rm", + "-u", + DOCKER_RUN_AS_USER, + "-v", + "${PWD}:/go/src/armada", + "-w", + "/go/src/armada", + DOCKER_NET, + "-e", + "GOPROXY", + "-e", + "GOPRIVATE", + "-e", + "GOCACHE=/go/cache", + "-e", + "INTEGRATION_ENABLED=true", + "-e", + "CGO_ENABLED=0", + "-e", + "GOOS=linux", + "-e", + "GARCH=amd64", + "-v", + fmt.Sprintf("%s:/go", DOCKER_GOPATH_DIR), + "golang:1.20.2-buster", + }, nil +} + +func go_TEST_CMD() ([]string, error) { + TESTS_IN_DOCKER := os.Getenv("TESTS_IN_DOCKER") + if TESTS_IN_DOCKER == "true" { + return go_CMD() + } else { + return []string{}, nil + } +} + +func dotnetCmd() []string { + dotnetcmd := []string{ + "run", + "-v", + "${PWD}:/go/src/armada", + "-w", + "/go/src/armada", + } + + if useSystemCerts { + dotnetcmd = append(dotnetcmd, "-v", "${PWD}/build/ssl/certs/:/etc/ssl/certs", "-e", "SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt") + } + + dotnetcmd = append(dotnetcmd, defaultDotnetDockerImg) + return dotnetcmd +} diff --git a/magefiles/debug.go b/magefiles/debug.go index e23cb89a276..109d195a513 100644 --- a/magefiles/debug.go +++ b/magefiles/debug.go @@ -2,7 +2,6 @@ package main import ( "fmt" - "io/ioutil" "os" "strings" @@ -30,7 +29,7 @@ func createDelveImage() error { func CreateDelveCompose() error { mg.Deps(createDelveImage) - data, err := ioutil.ReadFile("docker-compose.yaml") + data, err := os.ReadFile("docker-compose.yaml") if err != nil { return err } @@ -83,7 +82,7 @@ func CreateDelveCompose() error { } output := strings.Join(lines, "\n") - err = 
ioutil.WriteFile("docker-compose.dev.yaml", []byte(output), os.ModePerm) + err = os.WriteFile("docker-compose.dev.yaml", []byte(output), os.ModePerm) if err != nil { return err } diff --git a/magefiles/developer.go b/magefiles/developer.go index 9b0a26cb571..d2f9941171c 100644 --- a/magefiles/developer.go +++ b/magefiles/developer.go @@ -48,6 +48,11 @@ func StopDependencies() error { return err } + servicesArg = append([]string{"compose", "rm", "-f"}, services...) + if err := dockerRun(servicesArg...); err != nil { + return err + } + return nil } @@ -69,11 +74,21 @@ func StopComponents() error { composeFile := getComposeFile() components := getComponentsList() + // Adding the pulsar components here temporarily so that they can be stopped without + // adding them to the full run (which is still on legacy scheduler) + // TODO: remove this when pulsar backed scheduler is the default + components = append(components, "server-pulsar", "executor-pulsar", "scheduler", "scheduleringester") + componentsArg := append([]string{"compose", "-f", composeFile, "stop"}, components...) if err := dockerRun(componentsArg...); err != nil { return err } + componentsArg = append([]string{"compose", "-f", composeFile, "rm", "-f"}, components...) + if err := dockerRun(componentsArg...); err != nil { + return err + } + return nil } @@ -92,8 +107,12 @@ func CheckForPulsarRunning() error { return err } if strings.Contains(out, "alive") { - // Sleep for 1 second to allow Pulsar to fully start - time.Sleep(1 * time.Second) + // if seconds is less than 1, it means that pulsar had already started + if seconds < 1 { + fmt.Printf("\nPulsar had already started!\n\n") + return nil + } + fmt.Printf("\nPulsar took %d seconds to start!\n\n", seconds) return nil } diff --git a/magefiles/docker.go b/magefiles/docker.go index 93b1f2c46b5..47b99246aa5 100644 --- a/magefiles/docker.go +++ b/magefiles/docker.go @@ -74,13 +74,13 @@ func dockerVersion() (*semver.Version, error) { return version, nil } -func constraintCheck(version *semver.Version, versionRequirement string) error { +func constraintCheck(version *semver.Version, versionRequirement string, dependencyName string) error { constraint, err := semver.NewConstraint(versionRequirement) if err != nil { return errors.Errorf("error parsing constraint: %v", err) } if !constraint.Check(version) { - return errors.Errorf("found version %v but it failed constaint %v", version, constraint) + return errors.Errorf("found %s version %v but it failed constraint %v", dependencyName, version, constraint) } return nil } @@ -90,7 +90,7 @@ func dockerComposeCheck() error { if err != nil { return errors.Errorf("error getting version: %v", err) } - return constraintCheck(version, DOCKER_COMPOSE_VERSION_CONSTRAINT) + return constraintCheck(version, DOCKER_COMPOSE_VERSION_CONSTRAINT, "docker-compose") } func dockerBuildxCheck() error { @@ -98,7 +98,7 @@ func dockerBuildxCheck() error { if err != nil { return errors.Errorf("error getting version: %v", err) } - return constraintCheck(version, DOCKER_BUILDX_VERSION_CONSTRAINT) + return constraintCheck(version, DOCKER_BUILDX_VERSION_CONSTRAINT, "docker-buildx") } func dockerCheck() error { @@ -111,7 +111,7 @@ func dockerCheck() error { return errors.Errorf("error parsing constraint: %v", err) } if !constraint.Check(version) { - return errors.Errorf("found version %v but it failed constaint %v", version, constraint) + return errors.Errorf("found docker version %v but it failed constraint %v", version, constraint) } return nil } diff --git 
a/magefiles/dotnet.go b/magefiles/dotnet.go new file mode 100644 index 00000000000..f7d80b4b655 --- /dev/null +++ b/magefiles/dotnet.go @@ -0,0 +1,133 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + + "github.com/magefile/mage/mg" + "github.com/magefile/mage/sh" +) + +var ( + defaultDotnetDockerImg = "mcr.microsoft.com/dotnet/sdk:3.1.417-buster" + releaseTag string + nugetApiKey string + useSystemCerts bool +) + +func initializeDotnetRequirements() { + releaseTag = getEnvWithDefault("RELEASE_TAG", "UNKNOWN_TAG") + nugetApiKey = getEnvWithDefault("NUGET_API_KEY", "UNKNOWN_NUGET_API_KEY") +} + +func sslCerts() error { + fmt.Println("Setting up SSL certificates...") + sslCertsDir := filepath.Join(".", "build", "ssl", "certs") + err := os.MkdirAll(sslCertsDir, os.ModePerm) + if err != nil { + return err + } + + if _, err := os.Stat("/etc/ssl/certs/ca-certificates.crt"); err == nil { + err = sh.Run("cp", "/etc/ssl/certs/ca-certificates.crt", filepath.Join(sslCertsDir, "ca-certificates.crt")) + if err != nil { + return err + } + } else if _, err := os.Stat("/etc/ssl/certs/ca-bundle.crt"); err == nil { + err = sh.Run("cp", "/etc/ssl/certs/ca-bundle.crt", filepath.Join(sslCertsDir, "ca-certificates.crt")) + if err != nil { + return err + } + } else if runtime.GOOS == "darwin" { + err = sh.Run("security", "find-certificate", "-a", "-p", "/System/Library/Keychains/SystemRootCertificates.keychain", ">>", filepath.Join(sslCertsDir, "ca-certificates.crt")) + if err != nil { + return err + } + + err = sh.Run("security", "find-certificate", "-a", "-p", "/Library/Keychains/System.keychain", ">>", filepath.Join(sslCertsDir, "ca-certificates.crt")) + if err != nil { + return err + } + + err = sh.Run("security", "find-certificate", "-a", "-p", "~/Library/Keychains/login.keychain-db", ">>", filepath.Join(sslCertsDir, "ca-certificates.crt")) + if err != nil { + return err + } + } else { + return fmt.Errorf("don't know where to find root CA certs") + } + + return nil +} + +func dotnetSetup() error { + fmt.Println("Setting up Dotnet...") + if useSystemCerts { + err := sslCerts() + if err != nil { + return err + } + } + return nil +} + +// Target for compiling the dotnet Armada REST client +func Dotnet() error { + mg.Deps(initializeDotnetRequirements, dotnetSetup, BootstrapProto) + fmt.Println("Building Dotnet...") + + dotnetCmd := dotnetCmd() + + client := append(dotnetCmd, "dotnet", "build", "./client/DotNet/Armada.Client", "/t:NSwag") + output, err := dockerOutput(client...) + fmt.Println(output) + if err != nil { + return err + } + + client = append(dotnetCmd, "dotnet", "build", "./client/DotNet/ArmadaProject.Io.Client") + output, err = dockerOutput(client...) + fmt.Println(output) + if err != nil { + return err + } + return nil +} + +// Pack and push dotnet clients to nuget. Requires RELEASE_TAG and NUGET_API_KEY env vars to be set +func PushNuget() error { + mg.Deps(initializeDotnetRequirements, dotnetSetup, Proto) + fmt.Println("Pushing to Nuget...") + + dotnetCmd := dotnetCmd() + push := append(dotnetCmd, "dotnet", "pack", "client/DotNet/Armada.Client/Armada.Client.csproj", "-c", "Release", "-p:PackageVersion="+releaseTag, "-o", "./bin/client/DotNet") + output, err := dockerOutput(push...) + fmt.Println(output) + if err != nil { + return err + } + + push = append(dotnetCmd, "dotnet", "nuget", "push", "./bin/client/DotNet/G-Research.Armada.Client."+releaseTag+".nupkg", "-k", nugetApiKey, "-s", "https://api.nuget.org/v3/index.json") + output, err = dockerOutput(push...) 
+ fmt.Println(output) + if err != nil { + return err + } + + push = append(dotnetCmd, "dotnet", "pack", "client/DotNet/ArmadaProject.Io.Client/ArmadaProject.Io.Client.csproj", "-c", "Release", "-p:PackageVersion="+releaseTag, "-o", "./bin/client/DotNet") + output, err = dockerOutput(push...) + fmt.Println(output) + if err != nil { + return err + } + + push = append(dotnetCmd, "dotnet", "nuget", "push", "./bin/client/DotNet/ArmadaProject.Io.Client."+releaseTag+".nupkg", "-k", nugetApiKey, "-s", "https://api.nuget.org/v3/index.json") + output, err = dockerOutput(push...) + fmt.Println(output) + if err != nil { + return err + } + return nil +} diff --git a/magefiles/go.go b/magefiles/go.go index 5c4d75d6d08..1cff669fe3c 100644 --- a/magefiles/go.go +++ b/magefiles/go.go @@ -43,14 +43,7 @@ func goCheck() error { if err != nil { return errors.Errorf("error getting version: %v", err) } - constraint, err := semver.NewConstraint(GO_VERSION_CONSTRAINT) - if err != nil { - return errors.Errorf("error parsing constraint: %v", err) - } - if !constraint.Check(version) { - return errors.Errorf("found version %v but it failed constaint %v", version, constraint) - } - return nil + return constraintCheck(version, GO_VERSION_CONSTRAINT, "Go") } func goEnv(name string) (string, error) { diff --git a/magefiles/kind.go b/magefiles/kind.go index d27372aa544..f513d5fc761 100644 --- a/magefiles/kind.go +++ b/magefiles/kind.go @@ -21,7 +21,7 @@ const ( func getImages() []string { images := []string{ - "alpine:3.10", + "alpine:3.18.3", "nginx:1.21.6", "registry.k8s.io/ingress-nginx/controller:v1.4.0", "registry.k8s.io/ingress-nginx/kube-webhook-certgen:v20220916-gd32f8c343", @@ -70,22 +70,14 @@ func kindCheck() error { if err != nil { return errors.Errorf("error getting version: %v", err) } - constraint, err := semver.NewConstraint(KIND_VERSION_CONSTRAINT) - if err != nil { - return errors.Errorf("error parsing constraint: %v", err) - } - if !constraint.Check(version) { - return errors.Errorf("found version %v but it failed constaint %v", version, constraint) - } - return nil + return constraintCheck(version, KIND_VERSION_CONSTRAINT, "kind") } // Images that need to be available in the Kind cluster, // e.g., images required for e2e tests. 
func kindGetImages() error { for _, image := range getImages() { - err := dockerRun("pull", image) - if err != nil { + if err := dockerRun("pull", image); err != nil { return err } } diff --git a/magefiles/kubectl.go b/magefiles/kubectl.go index 7e43460f8eb..a2c1e426e1f 100644 --- a/magefiles/kubectl.go +++ b/magefiles/kubectl.go @@ -48,12 +48,5 @@ func kubectlCheck() error { if err != nil { return errors.Errorf("error getting version: %v", err) } - constraint, err := semver.NewConstraint(KUBECTL_VERSION_CONSTRAINT) - if err != nil { - return errors.Errorf("error parsing constraint: %v", err) - } - if !constraint.Check(version) { - return errors.Errorf("found version %v but it failed constaint %v", version, constraint) - } - return nil + return constraintCheck(version, KUBECTL_VERSION_CONSTRAINT, "kubectl") } diff --git a/magefiles/linting.go b/magefiles/linting.go new file mode 100644 index 00000000000..bc7094cebff --- /dev/null +++ b/magefiles/linting.go @@ -0,0 +1,99 @@ +package main + +import ( + "fmt" + "strings" + + semver "github.com/Masterminds/semver/v3" + "github.com/magefile/mage/mg" + "github.com/magefile/mage/sh" + "github.com/pkg/errors" +) + +const GOLANGCI_LINT_VERSION_CONSTRAINT = ">= 1.52.0" + +// Extract the version of golangci-lint +func golangciLintVersion() (*semver.Version, error) { + output, err := golangcilintOutput("--version") + if err != nil { + return nil, errors.Errorf("error running version cmd: %v", err) + } + fields := strings.Fields(string(output)) + if len(fields) < 4 { + return nil, errors.Errorf("unexpected version cmd output: %s", output) + } + version, err := semver.NewVersion(strings.TrimPrefix(fields[3], "v")) // adjusted index and removed prefix 'v' + if err != nil { + return nil, errors.Errorf("error parsing version: %v", err) + } + return version, nil +} + +// Check if the version of golangci-lint meets the predefined constraints +func golangciLintCheck() error { + version, err := golangciLintVersion() + if err != nil { + return errors.Errorf("error getting version: %v", err) + } + return constraintCheck(version, GOLANGCI_LINT_VERSION_CONSTRAINT, "golangci-lint") +} + +// Fixing Linting +func LintFix() error { + mg.Deps(golangciLintCheck) + cmd, err := go_TEST_CMD() + if err != nil { + return err + } + if len(cmd) == 0 { + output, err := golangcilintOutput("run", "--fix", "--timeout", "10m") + if err != nil { + fmt.Printf("error fixing linting cmd: %v", err) + fmt.Printf("\nOutput: %s\n", output) + } + } else { + cmd = append(cmd, "golangci-lint", "run", "--fix", "--timeout", "10m") + output, err := dockerOutput(cmd...) + fmt.Println(output) + if err != nil { + return err + } + } + return nil +} + +// Linting Check +func CheckLint() error { + mg.Deps(golangciLintCheck) + cmd, err := go_TEST_CMD() + if err != nil { + return err + } + if len(cmd) == 0 { + output, err := golangcilintOutput("run", "--timeout", "10m") + if err != nil { + fmt.Printf("error fixing linting cmd: %v", err) + fmt.Printf("\nOutput: %s\n", output) + } + } else { + cmd = append(cmd, "golangci-lint", "run", "--timeout", "10m") + output, err := dockerOutput(cmd...) + fmt.Println(output) + if err != nil { + return err + } + } + return nil +} + +func golangcilintBinary() string { + return binaryWithExt("golangci-lint") +} + +func golangcilintOutput(args ...string) (string, error) { + return sh.Output(golangcilintBinary(), args...) +} + +// func golangcilintRun(args ...string) error { +// return sh.Run(golangcilintBinary(), args...) 
+// } diff --git a/magefiles/main.go b/magefiles/main.go index 38d755c0612..de1d804f72c 100644 --- a/magefiles/main.go +++ b/magefiles/main.go @@ -7,6 +7,7 @@ import ( "time" "github.com/magefile/mage/mg" + "github.com/magefile/mage/sh" "github.com/pkg/errors" "sigs.k8s.io/yaml" ) @@ -34,6 +35,33 @@ func BootstrapTools() error { return nil } +// Download install the bootstap tools and download mod and make it tidy +func Download() error { + mg.Deps(BootstrapTools) + go_test_cmd, err := go_TEST_CMD() + if err != nil { + return err + } + if len(go_test_cmd) == 0 { + if err = sh.Run("go", "mod", "download"); err != nil { + return err + } + if err = sh.Run("go", "mod", "tidy"); err != nil { + return err + } + } else { + cmd := append(go_test_cmd, "go", "mod", "download") + if err := dockerRun(cmd...); err != nil { + return err + } + cmd = append(go_test_cmd, "go", "mod", "tidy") + if err := dockerRun(cmd...); err != nil { + return err + } + } + return nil +} + // Check dependent tools are present and the correct version. func CheckDeps() error { checks := []struct { @@ -94,6 +122,19 @@ func Sql() error { return sqlcRun("generate", "-f", "internal/scheduler/database/sql.yaml") } +// Generate Helm documentation. +func HelmDocs() error { + fmt.Println("Generating Helm documentation...") + output, err := sh.Output("./scripts/helm-docs.sh") + if err != nil { + fmt.Println(output) + return fmt.Errorf("failed to generate Helm documentation: %w", err) + } else { + fmt.Println(output) + } + return nil +} + // Generate Protos. func Proto() { mg.Deps(BootstrapProto) @@ -126,17 +167,23 @@ func LocalDev(arg string) error { mg.Deps(BootstrapTools) fmt.Println("Time to bootstrap tools:", time.Since(timeTaken)) + // Set the Executor Update Frequency to 1 second for local development + os.Setenv("ARMADA_SCHEDULING_EXECUTORUPDATEFREQUENCY", "1s") + switch arg { case "minimal": timeTaken := time.Now() + os.Setenv("PULSAR_BACKED", "") mg.Deps(mg.F(goreleaserMinimalRelease, "bundle"), Kind, downloadDependencyImages) fmt.Printf("Time to build, setup kind and download images: %s\n", time.Since(timeTaken)) + case "minimal-pulsar": + mg.Deps(mg.F(goreleaserMinimalRelease, "bundle"), Kind, downloadDependencyImages) case "full": mg.Deps(BuildPython, mg.F(BuildDockers, "bundle, lookout-bundle, jobservice"), Kind, downloadDependencyImages) case "no-build", "debug": mg.Deps(Kind, downloadDependencyImages) default: - return errors.Errorf("invalid argument: %s", arg) + return fmt.Errorf("invalid argument: %s Please enter one the following argument: minimal, minimal-pulsar, full, no-build, debug ", arg) } mg.Deps(StartDependencies) @@ -147,7 +194,12 @@ func LocalDev(arg string) error { case "minimal": os.Setenv("ARMADA_COMPONENTS", "executor,server") mg.Deps(StartComponents) - case "debug": + case "minimal-pulsar": + // This 20s sleep is to remedy an issue caused by pods coming up too fast after pulsar + // TODO: Deal with this internally somehow? + os.Setenv("ARMADA_COMPONENTS", "executor-pulsar,server-pulsar,scheduler,scheduleringester") + mg.Deps(StartComponents) + case "debug", "no-build": fmt.Println("Dependencies started, ending localdev...") return nil default: @@ -187,3 +239,95 @@ func readYaml(filename string, out interface{}) error { err = yaml.Unmarshal(bytes, out) return err } + +// junitReport Output test results in Junit format, e.g., to display in Jenkins. 
+func JunitReport() error { + if err := os.MkdirAll("test_reports", os.ModePerm); err != nil { + return fmt.Errorf("failed to create directory: %v", err) + } + + // Make sure everything has been synced to disk + if err := sh.RunV("sync"); err != nil { + return fmt.Errorf("failed to sync: %w", err) + } + + // Remove junit.xml file if it exists + if err := os.Remove("test_reports/junit.xml"); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to remove file: %v", err) + } + + // Get the command for the go test + goTestCmd, err := go_TEST_CMD() + if err != nil { + return err + } + + if len(goTestCmd) == 0 { + if err := sh.RunV("bash", "-c", "cat test_reports/*.txt | go-junit-report > test_reports/junit.xml"); err != nil { + return err + } + } else { + goTestCmd = append(goTestCmd, "bash", "-c", "cat test_reports/*.txt | go-junit-report > test_reports/junit.xml") + if err = dockerRun(goTestCmd...); err != nil { + return err + } + } + return nil +} + +// Code generation tasks: statik, goimports, go generate. +func Generate() error { + go_cmd, err := go_CMD() + if err != nil { + return err + } + + // Commands to be run + cmd1 := []string{ + "go", "run", "github.com/rakyll/statik", + "-dest=internal/lookout/repository/schema/", + "-src=internal/lookout/repository/schema/", + "-include=\\*.sql", + "-ns=lookout/sql", + "-Z", + "-f", + "-m", + } + cmd2 := []string{ + "go", "run", "golang.org/x/tools/cmd/goimports", + "-w", + "-local", "github.com/armadaproject/armada", + "internal/lookout/repository/schema/statik", + } + + if len(go_cmd) == 0 { + if err = goRun(cmd1[1:]...); err != nil { + return err + } + if err = goRun(cmd2[2:]...); err != nil { + return err + } + } else { + dockercmd := append(go_cmd, cmd1...) + dockercmd = append(dockercmd, "&&") + dockercmd = append(dockercmd, cmd2...) + fmt.Println(dockercmd) + if err := dockerRun(go_cmd...); err != nil { + return err + } + } + if err = goRun("generate", "./..."); err != nil { + return err + } + return nil +} + +// CI Image to build +func BuildCI() error { + ciImage := []string{"bundle", "lookout-bundle", "server", "executor", "armadactl", "testsuite", "lookout", "lookoutingester", "lookoutv2", "lookoutingesterv2", "eventingester", "scheduler", "scheduleringester", "binoculars", "jobservice"} + err := goreleaserMinimalRelease(ciImage...) 
+ if err != nil { + return err + } + return nil +} diff --git a/magefiles/proto.go b/magefiles/proto.go index 7ae1451e725..da07513d0df 100644 --- a/magefiles/proto.go +++ b/magefiles/proto.go @@ -3,7 +3,6 @@ package main import ( "fmt" "io/fs" - "io/ioutil" "os" "path/filepath" "strings" @@ -133,21 +132,21 @@ func protoGenerate() error { if s, err := goOutput("run", "./scripts/merge_swagger/merge_swagger.go", "api.swagger.json"); err != nil { return err } else { - if err := ioutil.WriteFile("pkg/api/api.swagger.json", []byte(s), 0o755); err != nil { + if err := os.WriteFile("pkg/api/api.swagger.json", []byte(s), 0o755); err != nil { return err } } if s, err := goOutput("run", "./scripts/merge_swagger/merge_swagger.go", "lookout/api.swagger.json"); err != nil { return err } else { - if err := ioutil.WriteFile("pkg/api/lookout/api.swagger.json", []byte(s), 0o755); err != nil { + if err := os.WriteFile("pkg/api/lookout/api.swagger.json", []byte(s), 0o755); err != nil { return err } } if s, err := goOutput("run", "./scripts/merge_swagger/merge_swagger.go", "binoculars/api.swagger.json"); err != nil { return err } else { - if err := ioutil.WriteFile("pkg/api/binoculars/api.swagger.json", []byte(s), 0o755); err != nil { + if err := os.WriteFile("pkg/api/binoculars/api.swagger.json", []byte(s), 0o755); err != nil { return err } } diff --git a/magefiles/protoc.go b/magefiles/protoc.go index a15c5ea1ea8..c46481a1107 100644 --- a/magefiles/protoc.go +++ b/magefiles/protoc.go @@ -43,12 +43,5 @@ func protocCheck() error { if err != nil { return errors.Errorf("error getting version: %v", err) } - constraint, err := semver.NewConstraint(PROTOC_VERSION_CONSTRAINT) - if err != nil { - return errors.Errorf("error parsing constraint: %v", err) - } - if !constraint.Check(version) { - return errors.Errorf("found version %v but it failed constaint %v", version, constraint) - } - return nil + return constraintCheck(version, PROTOC_VERSION_CONSTRAINT, "protoc") } diff --git a/magefiles/sqlc.go b/magefiles/sqlc.go index 1a111f216c7..6dffb85252b 100644 --- a/magefiles/sqlc.go +++ b/magefiles/sqlc.go @@ -37,12 +37,5 @@ func sqlcCheck() error { if err != nil { return errors.Errorf("error getting version: %v", err) } - constraint, err := semver.NewConstraint(SQLC_VERSION_CONSTRAINT) - if err != nil { - return errors.Errorf("error parsing constraint: %v", err) - } - if !constraint.Check(version) { - return errors.Errorf("found version %v but it failed constaint %v", version, constraint) - } - return nil + return constraintCheck(version, SQLC_VERSION_CONSTRAINT, "sqlc") } diff --git a/magefiles/tests.go b/magefiles/tests.go new file mode 100644 index 00000000000..85cba155da0 --- /dev/null +++ b/magefiles/tests.go @@ -0,0 +1,229 @@ +package main + +import ( + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/magefile/mage/mg" + "github.com/magefile/mage/sh" +) + +var Gotestsum string + +var LocalBin = filepath.Join(os.Getenv("PWD"), "/bin") + +func makeLocalBin() error { + if _, err := os.Stat(LocalBin); os.IsNotExist(err) { + err = os.MkdirAll(LocalBin, os.ModePerm) + if err != nil { + return err + } + } + return nil +} + +// Gotestsum downloads gotestsum locally if necessary +func gotestsum() error { + mg.Deps(makeLocalBin) + Gotestsum = filepath.Join(LocalBin, "/gotestsum") + + if _, err := os.Stat(Gotestsum); os.IsNotExist(err) { + fmt.Println(Gotestsum) + cmd := exec.Command("go", "install", "gotest.tools/gotestsum@v1.8.2") + cmd.Env = append(os.Environ(), "GOBIN="+LocalBin) + 
return cmd.Run() + + } + return nil +} + +// Tests is a mage target that runs the tests and generates coverage reports. +func Tests() error { + mg.Deps(gotestsum) + var err error + + docker_Net, err := dockerNet() + if err != nil { + return err + } + + err = dockerRun("run", "-d", "--name=redis", docker_Net, "-p=6379:6379", "redis:6.2.6") + if err != nil { + return err + } + + err = dockerRun("run", "-d", "--name=postgres", docker_Net, "-p", "5432:5432", "-e", "POSTGRES_PASSWORD=psw", "postgres:14.2") + if err != nil { + return err + } + + defer func() { + if err := dockerRun("rm", "-f", "redis", "postgres"); err != nil { + fmt.Println(err) + } + }() + + err = sh.Run("sleep", "3") + if err != nil { + return err + } + packages, err := sh.Output("go", "list", "./internal/...") + if err != nil { + return err + } + + internalPackages := filterPackages(strings.Fields(packages), "jobservice/repository") + + cmd := []string{ + "--format", "short-verbose", + "--junitfile", "test-reports/unit-tests.xml", + "--jsonfile", "test-reports/unit-tests.json", + "--", "-coverprofile=test-reports/coverage.out", + "-covermode=atomic", "./cmd/...", + "./pkg/...", + } + cmd = append(cmd, internalPackages...) + + if err = sh.Run(Gotestsum, cmd...); err != nil { + return err + } + + return err +} + +func filterPackages(packages []string, filter string) []string { + var filtered []string + for _, pkg := range packages { + if !strings.Contains(pkg, filter) { + filtered = append(filtered, pkg) + } + } + return filtered +} + +func runTest(name, outputFileName string) error { + cmd := exec.Command(Gotestsum, "--", "-v", name, "-count=1") + file, err := os.Create(filepath.Join("test_reports", outputFileName)) + if err != nil { + return err + } + defer file.Close() + cmd.Stdout = io.MultiWriter(os.Stdout, file) + cmd.Stderr = os.Stderr + return cmd.Run() +} + +// Teste2eAirflow runs e2e tests for airflow +func Teste2eAirflow() error { + mg.Deps(AirflowOperator) + if err := BuildDockers("jobservice"); err != nil { + return err + } + + cmd, err := go_CMD() + if err != nil { + return err + } + cmd = append(cmd, "go", "run", "cmd/armadactl/main.go", "create", "queue", "queue-a") + if err := dockerRun(cmd...); err != nil { + fmt.Println(err) + } + + if err := dockerRun("rm", "-f", "jobservice"); err != nil { + fmt.Println(err) + } + + err = dockerRun("run", "-d", "--name", "jobservice", "--network=kind", + "--mount", "type=bind,src=${PWD}/e2e,dst=/e2e", "gresearch/armada-jobservice", "run", "--config", + "/e2e/setup/jobservice.yaml") + if err != nil { + return err + } + + err = dockerRun("run", "-v", "${PWD}/e2e:/e2e", "-v", "${PWD}/third_party/airflow:/code", + "--workdir", "/code", "-e", "ARMADA_SERVER=server", "-e", "ARMADA_PORT=50051", "-e", "JOB_SERVICE_HOST=jobservice", + "-e", "JOB_SERVICE_PORT=60003", "--entrypoint", "python3", "--network=kind", "armada-airflow-operator-builder:latest", + "-m", "pytest", "-v", "-s", "/code/tests/integration/test_airflow_operator_logic.py") + if err != nil { + return err + } + + err = dockerRun("rm", "-f", "jobservice") + if err != nil { + return err + } + return nil +} + +// Teste2epython runs e2e tests for python client +func Teste2epython() error { + mg.Deps(BuildPython) + args := []string{ + "run", + "-v", "${PWD}/client/python:/code", + "--workdir", "/code", + "-e", "ARMADA_SERVER=server", + "-e", "ARMADA_PORT=50051", + "--entrypoint", "python3", + "--network", "kind", + "armada-python-client-builder:latest", + "-m", "pytest", + "-v", "-s", + 
"/code/tests/integration/test_no_auth.py", + } + + return dockerRun(args...) +} + +// TestsNoSetup runs the tests without setup +func TestsNoSetup() error { + mg.Deps(gotestsum) + + if err := runTest("./internal...", "internal.txt"); err != nil { + return err + } + if err := runTest("./pkg...", "pkg.txt"); err != nil { + return err + } + if err := runTest("./cmd...", "cmd.txt"); err != nil { + return err + } + + return nil +} + +// PopulateLookoutTest populates the lookout test +func PopulateLookoutTest() error { + dockerNet, err := dockerNet() + if err != nil { + return err + } + if err = dockerRun("ps", "-q", "-f", "name=postgres"); err == nil { + + if err := dockerRun("stop", "postgres"); err != nil { + return err + } + if err := dockerRun("rm", "postgres"); err != nil { + return err + } + } + + err = dockerRun("run", "-d", "--name=postgres", dockerNet, "-p", "5432:5432", "-e", "POSTGRES_PASSWORD=psw", "postgres:14.2") + if err != nil { + return err + } + + time.Sleep(5 * time.Second) + + err = goRun("test", "-v", "${PWD}/internal/lookout/db-gen/") + if err != nil { + return err + } + + return nil +} diff --git a/magefiles/utils.go b/magefiles/utils.go index 0c6ca710460..a4985a42b8b 100644 --- a/magefiles/utils.go +++ b/magefiles/utils.go @@ -48,6 +48,11 @@ func onArm() bool { return runtime.GOARCH == "arm64" } +// Check if the user is on a windows system +func onWindows() bool { + return runtime.GOOS == "windows" +} + // Validates that arg is one of validArgs. // Returns nil if arg is valid, error otherwise. func validateArg(arg string, validArgs []string) error { @@ -64,3 +69,11 @@ func validateArg(arg string, validArgs []string) error { } return nil } + +func getEnvWithDefault(key string, defValue string) string { + value, exists := os.LookupEnv(key) + if !exists { + value = defValue + } + return value +} diff --git a/magefiles/yarn.go b/magefiles/yarn.go index 242fdee68ed..a6769b87435 100644 --- a/magefiles/yarn.go +++ b/magefiles/yarn.go @@ -8,7 +8,7 @@ import ( // Create golang code to build the UI func yarnBinary() string { - return binaryWithExt("yarn") + return "yarn" } func yarnRun(args ...string) error { @@ -32,6 +32,9 @@ func yarnInstall() error { } func yarnOpenAPI() error { + if onWindows() { + return yarnRun("run", "openapi:win") + } return yarnRun("run", "openapi") } diff --git a/makefile b/makefile index 81c86bbe1ef..f66c2653baf 100644 --- a/makefile +++ b/makefile @@ -446,11 +446,11 @@ setup-cluster: kind create cluster --config e2e/setup/kind.yaml # Load images necessary for tests. 
- docker pull "alpine:3.10" # used for e2e tests + docker pull "alpine:3.18.3" # used for e2e tests docker pull "nginx:1.21.6" # used for e2e tests (ingress) docker pull "registry.k8s.io/ingress-nginx/controller:v1.4.0" docker pull "registry.k8s.io/ingress-nginx/kube-webhook-certgen:v20220916-gd32f8c343" - kind load docker-image "alpine:3.10" --name armada-test + kind load docker-image "alpine:3.18.3" --name armada-test kind load docker-image "nginx:1.21.6" --name armada-test kind load docker-image "registry.k8s.io/ingress-nginx/controller:v1.4.0" --name armada-test kind load docker-image "registry.k8s.io/ingress-nginx/kube-webhook-certgen:v20220916-gd32f8c343" --name armada-test diff --git a/pkg/client/auth/oidc/device.go b/pkg/client/auth/oidc/device.go index 17f66141041..e6f9ae63474 100644 --- a/pkg/client/auth/oidc/device.go +++ b/pkg/client/auth/oidc/device.go @@ -5,7 +5,7 @@ import ( "encoding/json" "errors" "fmt" - "io/ioutil" + "io" "net/http" "net/url" "strings" @@ -148,7 +148,7 @@ func requestToken(c *http.Client, config DeviceDetails, deviceCode string) (*oau } func makeErrorForHTTPResponse(resp *http.Response) error { - bodyBytes, err := ioutil.ReadAll(resp.Body) + bodyBytes, err := io.ReadAll(resp.Body) if err != nil { return err } diff --git a/plugins/README.md b/plugins/README.md new file mode 100644 index 00000000000..df3f69b4323 --- /dev/null +++ b/plugins/README.md @@ -0,0 +1,62 @@ +This documentation will walk you through how you can self-host `armadactl` on your local machine but first let's take a look at what Krew is. + +### Krew + +Krew is the plugin manager for [kubectl](https://kubernetes.io/docs/tasks/tools/) command-line tool. Krew works across all major platforms, like macOS, Linux and Windows. + +Krew also helps kubectl plugin developers: You can package and distribute your plugins on multiple platforms easily and makes them discoverable through a centralized plugin repository with Krew. + +## Self-hosting armadactl + +- Make sure you have [kubectl](https://kubernetes.io/docs/tasks/tools/) installed on your machine. + +- Head on over to [Krew](https://krew.sigs.k8s.io/docs/user-guide/setup/install/) and install it based on your OS. If you're on MacOS/Linux you can follow the steps below: + +1. Make sure that [git](https://git-scm.com/downloads) is installed. +2. Run this command to download and install krew: +``` +( + set -x; cd "$(mktemp -d)" && + OS="$(uname | tr '[:upper:]' '[:lower:]')" && + ARCH="$(uname -m | sed -e 's/x86_64/amd64/' -e 's/\(arm\)\(64\)\?.*/\1\2/' -e 's/aarch64$/arm64/')" && + KREW="krew-${OS}_${ARCH}" && + curl -fsSLO "https://github.com/kubernetes-sigs/krew/releases/latest/download/${KREW}.tar.gz" && + tar zxvf "${KREW}.tar.gz" && + ./"${KREW}" install krew +) +``` + 3. Add the `$HOME/.krew/bin` directory to your PATH environment variable. To do this, update your `.bashrc` or `.zshrc` file and append the following line: +``` +export PATH="${KREW_ROOT:-$HOME/.krew}/bin:$PATH" +``` + and restart your shell. + + 4. Run kubectl krew to check if the installation is a success or not. + +- It should look something like this: +![Krew Install](https://github.com/ShivangShandilya/armada/assets/101946115/a4640b5c-656f-466b-bf87-11b402d9e838) + +- Change the directory to [plugins](https://github.com/armadaproject/armada/tree/master/plugins). + +- Run this command in order to install `armadactl` as a Krew plugin and to use it alongside `kubectl`. 
+``` +kubectl krew install --manifest=armadactl.yaml +``` +- It should show something like this: +![Manifest](https://github.com/ShivangShandilya/armada/assets/101946115/2324787b-978f-4da3-b8b4-e1ee41d8aec0) + +- Now try and run this command to check if you can run `armadactl` alongside `kubectl` +``` +kubectl armadactl +``` +- It should show something like this which will ensure installing of the plugin was a success. +![armadactl plugin](https://github.com/ShivangShandilya/armada/assets/101946115/c73e49f3-1b60-4baa-b0b3-67ddeacf9387) + +⚠️ **Before using the Armada CLI, make sure you have working armada enviornment or a armadactl.yaml file that points to a valid armada cluster.** + +### Uninstalling the plugin + +In order to uninstall the `armadactl` plugin, just run this command: +``` +kubectl krew uninstall armadactl +``` diff --git a/plugins/armadactl.yml b/plugins/armadactl.yml new file mode 100644 index 00000000000..61dc90492a3 --- /dev/null +++ b/plugins/armadactl.yml @@ -0,0 +1,36 @@ +apiVersion: krew.googlecontainertools.github.com/v1alpha2 +kind: Plugin +metadata: + name: armadactl +spec: + version: v0.3.88 + homepage: https://github.com/armadaproject/armada + shortDescription: Command line utility to submit many jobs to armada + description: | + armadactl is a command-line tool used for managing jobs in the Armada workload orchestration system. + It provides functionality for creating, updating, and deleting jobs, as well as monitoring job status and resource usage. + caveats: | + Before using the Armada CLI, make sure you have working armada enviornment + or a armadactl.yaml file that points to a valid armada cluster. + platforms: + - selector: + matchLabels: + os: linux + arch: amd64 + uri: https://github.com/armadaproject/armada/releases/download/v0.3.8655/armadactl_0.3.8655_linux_amd64.tar.gz + sha256: 0078f43119cd992b5af0c5c6bab5a0780c7449d38a35ea572d959fe500aa766c + bin: armadactl + - selector: + matchLabels: + os: darwin + arch: amd64 + uri: https://github.com/armadaproject/armada/releases/download/v0.3.8655/armadactl_0.3.8655_darwin_all.tar.gz + sha256: 7f49ea0851dd83303e3e3553834571313b66415f3d4edd99e10f56532849300f + bin: armadactl + - selector: + matchLabels: + os: windows + arch: amd64 + uri: https://github.com/armadaproject/armada/releases/download/v0.3.8655/armadactl_0.3.8655_windows_amd64.zip + sha256: 27774e39b8a29603671c21ed9487fbd073eb408535afe5de5f336e84dc13998b + bin: armadactl.exe \ No newline at end of file diff --git a/testsuite/performance/jobservice/fakearmada/armada.go b/testsuite/performance/jobservice/fakearmada/armada.go new file mode 100644 index 00000000000..73cae1382e6 --- /dev/null +++ b/testsuite/performance/jobservice/fakearmada/armada.go @@ -0,0 +1,47 @@ +package main + +import ( + "fmt" + "net" + "sync" + + log "github.com/sirupsen/logrus" + "google.golang.org/grpc" + "google.golang.org/grpc/encoding" + "google.golang.org/grpc/encoding/gzip" + + "github.com/armadaproject/armada/pkg/api" +) + +func ServePerformanceTestArmadaServer(port int) error { + comp := encoding.GetCompressor(gzip.Name) + encoding.RegisterCompressor(comp) + + server := grpc.NewServer([]grpc.ServerOption{}...) 
+ + performanceTestEventServer := NewPerformanceTestEventServer() + + api.RegisterEventServer(server, performanceTestEventServer) + + log.Infof("Armada performanceTestEventServer gRPC server listening on %d", port) + lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) + if err != nil { + return err + } + return server.Serve(lis) +} + +func main() { + var wg sync.WaitGroup + + wg.Add(1) + go func() { + err := ServePerformanceTestArmadaServer(1337) + if err != nil { + fmt.Println(err.Error()) + } + wg.Done() + }() + + wg.Wait() +} diff --git a/testsuite/performance/jobservice/fakearmada/event_server.go b/testsuite/performance/jobservice/fakearmada/event_server.go new file mode 100644 index 00000000000..21df17959d2 --- /dev/null +++ b/testsuite/performance/jobservice/fakearmada/event_server.go @@ -0,0 +1,157 @@ +package main + +import ( + "context" + "fmt" + "time" + + "github.com/gogo/protobuf/types" + + "github.com/armadaproject/armada/pkg/api" +) + +type PerformanceTestEventServer struct{} + +func NewPerformanceTestEventServer() *PerformanceTestEventServer { + return &PerformanceTestEventServer{} +} + +func (s *PerformanceTestEventServer) Report(ctx context.Context, message *api.EventMessage) (*types.Empty, error) { + return &types.Empty{}, nil +} + +func (s *PerformanceTestEventServer) ReportMultiple(ctx context.Context, message *api.EventList) (*types.Empty, error) { + return &types.Empty{}, nil +} + +// GetJobSetEvents streams back all events associated with a particular job set. +func (s *PerformanceTestEventServer) GetJobSetEvents(request *api.JobSetRequest, stream api.Event_GetJobSetEventsServer) error { + // FIXME: Handle case where watch is not True. + return s.serveSimulatedEvents(request, stream) +} + +func (s *PerformanceTestEventServer) Health(ctx context.Context, cont_ *types.Empty) (*api.HealthCheckResponse, error) { + return &api.HealthCheckResponse{Status: api.HealthCheckResponse_SERVING}, nil +} + +func (s *PerformanceTestEventServer) Watch(req *api.WatchRequest, stream api.Event_WatchServer) error { + request := &api.JobSetRequest{ + Id: req.JobSetId, + Watch: true, + FromMessageId: req.FromId, + Queue: req.Queue, + ErrorIfMissing: true, + ForceLegacy: req.ForceLegacy, + ForceNew: req.ForceNew, + } + return s.GetJobSetEvents(request, stream) +} + +type scriptedMessage struct { + Delay time.Duration + MessageFunc func(*api.JobSetRequest) *api.EventMessage +} + +var messageScript = []*scriptedMessage{ + { // Submitted + Delay: time.Duration(1), + MessageFunc: func(request *api.JobSetRequest) *api.EventMessage { + return &api.EventMessage{ + Events: &api.EventMessage_Submitted{ + Submitted: &api.JobSubmittedEvent{ + JobId: "fake_job_id", + JobSetId: request.Id, + Queue: request.Queue, + Created: time.Now(), + Job: api.Job{ + Id: "fake_job_id", + ClientId: "", + Queue: request.Queue, + JobSetId: request.Id, + Namespace: "fakeNamespace", + Created: time.Now(), + }, + }, + }, + } + }, + }, + { // Queued + Delay: time.Duration(time.Second * 1), + MessageFunc: func(request *api.JobSetRequest) *api.EventMessage { + return &api.EventMessage{ + Events: &api.EventMessage_Queued{ + Queued: &api.JobQueuedEvent{ + JobId: "fake_job_id", + JobSetId: request.Id, + Queue: request.Queue, + Created: time.Now(), + }, + }, + } + }, + }, + { // Running + Delay: time.Duration(time.Second * 1), + MessageFunc: func(request *api.JobSetRequest) *api.EventMessage { + return &api.EventMessage{ + Events: &api.EventMessage_Running{ + Running: &api.JobRunningEvent{ + JobId: "fake_job_id", + JobSetId: 
request.Id, + Queue: request.Queue, + Created: time.Now(), + ClusterId: "fakeCluster", + KubernetesId: "fakeK8s", + NodeName: "fakeNode", + PodNumber: 1, + PodName: "fakePod", + PodNamespace: "fakeNamespace", + }, + }, + } + }, + }, + { // Success + Delay: time.Duration(time.Second * 10), + MessageFunc: func(request *api.JobSetRequest) *api.EventMessage { + return &api.EventMessage{ + Events: &api.EventMessage_Succeeded{ + Succeeded: &api.JobSucceededEvent{ + JobId: "fake_job_id", + JobSetId: request.Id, + Queue: request.Queue, + Created: time.Now(), + ClusterId: "fakeCluster", + KubernetesId: "fakeK8s", + NodeName: "fakeNode", + PodNumber: 1, + PodName: "fakePod", + PodNamespace: "fakeNamespace", + }, + }, + } + }, + }, +} + +func (s *PerformanceTestEventServer) serveSimulatedEvents(request *api.JobSetRequest, stream api.Event_GetJobSetEventsServer) error { + nextId := 1 + + for _, message := range messageScript { + time.Sleep(message.Delay) + err := stream.Send(&api.EventStreamMessage{ + Id: fmt.Sprintf("%d", nextId), + Message: message.MessageFunc(request), + }) + if err != nil { + return err + } + nextId += 1 + } + + // Keep the stream active but don't send anything + time.Sleep(time.Minute * 10) + + return nil +} diff --git a/testsuite/performance/jobservice/jobservice/jobservice.go b/testsuite/performance/jobservice/jobservice/jobservice.go new file mode 100644 index 00000000000..608f779a1c6 --- /dev/null +++ b/testsuite/performance/jobservice/jobservice/jobservice.go @@ -0,0 +1,98 @@ +package main + +import ( + "context" + "fmt" + "net/http" + "os" + "os/signal" + "sync" + "time" + + "google.golang.org/grpc/encoding" + "google.golang.org/grpc/encoding/gzip" + + "github.com/armadaproject/armada/internal/jobservice" + "github.com/armadaproject/armada/internal/jobservice/configuration" + "github.com/armadaproject/armada/pkg/client" + + _ "net/http/pprof" +) + +func main() { + var wg sync.WaitGroup + + ctx, cancel := context.WithCancel(context.Background()) + + signalChan := make(chan os.Signal, 1) + signal.Notify(signalChan, os.Interrupt) + + go func() { + select { + case <-ctx.Done(): + return + case <-signalChan: + fmt.Println("Got interrupt, stopping...") + cancel() + return + } + }() + + go func() { + _ = http.ListenAndServe("localhost:6060", nil) + }() + + comp := encoding.GetCompressor(gzip.Name) + encoding.RegisterCompressor(comp) + + outfile, err := os.Create("jobservice.profile") + if err != nil { + fmt.Println(err.Error()) + return + } + defer outfile.Close() + + /* + err = pprof.StartCPUProfile(outfile) + if err != nil { + fmt.Println(err.Error()) + return + }*/ + + js := jobservice.New() + wg.Add(1) + go func() { + os.Setenv("JOBSERVICE_DEBUG", "TRUE") + err := js.StartUp(ctx, &configuration.JobServiceConfiguration{ + GrpcPort: 2000, + MetricsPort: 2001, + HttpPort: 2002, + DatabaseType: "postgres", + ApiConnection: client.ApiConnectionDetails{ + ArmadaUrl: "localhost:1337", + ForceNoTls: true, + }, + PostgresConfig: configuration.PostgresConfig{ + PoolMaxOpenConns: 50, + PoolMaxIdleConns: 10, + PoolMaxConnLifetime: time.Duration(time.Minute * 30), + Connection: map[string]string{ + "host": "localhost", + "port": "5432", + "user": "postgres", + "password": "psw", + "dbname": "postgres", + "sslmode": "disable", + }, + }, + }) + if err != nil { + fmt.Printf("Error starting Job Service: %v\n", err) + } + wg.Done() + }() + + wg.Wait() + + // pprof.StopCPUProfile() +} diff --git a/testsuite/performance/jobservice/jsloadtest.go b/testsuite/performance/jobservice/jsloadtest.go new 
file mode 100644 index 00000000000..7eac9d40558 --- /dev/null +++ b/testsuite/performance/jobservice/jsloadtest.go @@ -0,0 +1,73 @@ +package main + +import ( + "context" + "fmt" + "math/rand" + "sync" + "time" + + "github.com/gogo/protobuf/types" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + + jsgrpc "github.com/armadaproject/armada/pkg/api/jobservice" +) + +func init() { + rand.Seed(time.Now().UnixNano()) +} + +// TODO: Add arguments to control how the load is applied. +func main() { + ctx := context.Background() + wg := sync.WaitGroup{} + + // Launch a jobservice client to query jobservice about a jobset + conn, err := grpc.Dial("localhost:2000", grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + fmt.Println(err.Error()) + return + } + client := jsgrpc.NewJobServiceClient(conn) + healthResp, err := client.Health(ctx, &types.Empty{}) + if err != nil { + fmt.Println(err.Error()) + return + } + fmt.Println(healthResp.Status.String()) + + prefix := rand.Intn(10000) + + maxJob := 1000 + wg.Add(maxJob) + + for i := 0; i < maxJob; i++ { + go func(n int) { + err := queryJobStatus(ctx, conn, n, prefix) + if err != nil { + fmt.Printf("Error querying job status: %v\n", err) + } + wg.Done() + }(i) + } + + wg.Wait() +} + +func queryJobStatus(ctx context.Context, conn *grpc.ClientConn, n int, prefix int) error { + client := jsgrpc.NewJobServiceClient(conn) + + resp, err := client.GetJobStatus(ctx, &jsgrpc.JobServiceRequest{ + JobId: "fake_job_id", + JobSetId: fmt.Sprintf("%d_new_fake_job_set_id_%d", prefix, n), + Queue: "fake_queue", + }) + if err != nil { + fmt.Println(err.Error()) + return err + } + + fmt.Printf("%s - %d\n", resp.State.String(), n) + return nil +} diff --git a/third_party/airflow/README.md b/third_party/airflow/README.md index 6ed0740fdea..73a9b36c525 100644 --- a/third_party/airflow/README.md +++ b/third_party/airflow/README.md @@ -36,12 +36,10 @@ python3.8 -m pip install armada-airflow From the top level of the repo, you should run `make airflow-operator`. This will generate proto/grpc files in the jobservice folder. -Airflow with the Armada operator can be run alongside the other Armada services via the localdev docker-compose -environment. It is manually started in this way: +Airflow with the Armada operator can be run alongside the other Armada services via the docker-compose environment. It is manually started in this way: ``` -cd localdev -docker-compose up -d airflow +mage airflow start ``` Airflow's web UI will then be accessible at http://localhost:8081 (login with admin/admin). 
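Editorial sketch, not part of this change: a minimal DAG showing how the operator touched in the following hunks is typically wired against the environment the README above describes. The `task_id`/`name` values, the `armada_channel_args` parameter, the channel targets (Armada on 50051, the job service on 60003, mirroring the e2e test setup elsewhere in this patch) and the import path for `JobSubmitRequestItem` are assumptions; the pod spec is omitted for brevity.

```python
# Hypothetical example DAG -- illustrative only, not part of this change.
from datetime import datetime

from airflow import DAG
from armada.operators.armada import ArmadaOperator
from armada_client.armada.submit_pb2 import JobSubmitRequestItem  # assumed import path

with DAG(dag_id="armada_example", start_date=datetime(2023, 1, 1), schedule_interval=None) as dag:
    run_job = ArmadaOperator(
        task_id="run_armada_job",
        name="run_armada_job",
        armada_queue="queue-a",  # queue created by the e2e setup in this patch
        job_request_items=[JobSubmitRequestItem()],  # pod spec omitted for brevity
        armada_channel_args={"target": "localhost:50051"},       # assumed local Armada server
        job_service_channel_args={"target": "localhost:60003"},  # assumed local job service
    )
```

The deferrable variant changed below takes the same arguments plus the new `poll_interval`, which controls how often the trigger polls the job service for status.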
diff --git a/third_party/airflow/armada/operators/armada.py b/third_party/airflow/armada/operators/armada.py index bb8b1d70fb0..33475651275 100644 --- a/third_party/airflow/armada/operators/armada.py +++ b/third_party/airflow/armada/operators/armada.py @@ -88,6 +88,7 @@ def __init__( if "options" not in job_service_channel_args: job_service_channel_args["options"] = default_jobservice_channel_options + self.job_service_channel_args = GrpcChannelArguments(**job_service_channel_args) self.armada_queue = armada_queue self.job_request_items = job_request_items self.lookout_url_template = lookout_url_template diff --git a/third_party/airflow/armada/operators/armada_deferrable.py b/third_party/airflow/armada/operators/armada_deferrable.py index 577b6b3b081..2f53a702228 100644 --- a/third_party/airflow/armada/operators/armada_deferrable.py +++ b/third_party/airflow/armada/operators/armada_deferrable.py @@ -73,6 +73,7 @@ class ArmadaDeferrableOperator(BaseOperator): The format should be: "https://lookout.armada.domain/jobs?job_id=" where will be replaced with the actual job ID. + :param poll_interval: How often to poll jobservice to get status. :return: A deferrable armada operator instance. """ @@ -86,6 +87,7 @@ def __init__( armada_queue: str, job_request_items: List[JobSubmitRequestItem], lookout_url_template: Optional[str] = None, + poll_interval: int = 30, **kwargs, ) -> None: super().__init__(**kwargs) @@ -99,6 +101,7 @@ def __init__( self.armada_queue = armada_queue self.job_request_items = job_request_items self.lookout_url_template = lookout_url_template + self.poll_interval = poll_interval def execute(self, context) -> None: """ @@ -257,6 +260,7 @@ async def run(self): job_set_id=self.job_set_id, airflow_task_name=self.airflow_task_name, job_id=self.job_id, + poll_interval=self.poll_interval, log=self.log, ) yield TriggerEvent({"job_state": job_state, "job_message": job_message}) diff --git a/third_party/airflow/armada/operators/grpc.py b/third_party/airflow/armada/operators/grpc.py index 31970595f93..bebb0f98835 100644 --- a/third_party/airflow/armada/operators/grpc.py +++ b/third_party/airflow/armada/operators/grpc.py @@ -1,17 +1,55 @@ +import importlib from typing import Optional, Sequence, Tuple, Any, TypedDict import grpc +class CredentialsCallbackDict(TypedDict): + """ + Helper class to provide stronger type checking on Credential callback args. + """ + + module_name: str + function_name: str + function_kwargs: dict + + class GrpcChannelArgsDict(TypedDict): """ Helper class to provide stronger type checking on Grpc channel arugments. """ target: str - credentials: Optional[grpc.ChannelCredentials] options: Optional[Sequence[Tuple[str, Any]]] compression: Optional[grpc.Compression] + credentials_callback_args: Optional[CredentialsCallbackDict] + + +class CredentialsCallback(object): + """ + Allows the use of an arbitrary callback function to get grpc credentials. + + :param module_name: The fully qualified python module name where the + function is located. + :param function_name: The name of the function to be called. + :param function_kwargs: Keyword arguments to function_name in a dictionary. 
+ """ + + def __init__( + self, + module_name: str, + function_name: str, + function_kwargs: dict, + ) -> None: + self.module_name = module_name + self.function_name = function_name + self.function_kwargs = function_kwargs + + def call(self): + """Do the callback to get grpc credentials.""" + module = importlib.import_module(self.module_name) + func = getattr(module, self.function_name) + return func(**self.function_kwargs) class GrpcChannelArguments(object): @@ -20,8 +58,8 @@ class GrpcChannelArguments(object): :param target: Target keyword argument used when instantiating a grpc channel. - :param credentials: credentials keyword argument used - when instantiating a grpc channel. + :param credentials_callback_args: Arguments to CredentialsCallback to use + when instantiating a grpc channel that takes credentials. :param options: options keyword argument used when instantiating a grpc channel. :param compression: compression keyword argument used @@ -32,14 +70,17 @@ class GrpcChannelArguments(object): def __init__( self, target: str, - credentials: Optional[grpc.ChannelCredentials] = None, options: Optional[Sequence[Tuple[str, Any]]] = None, compression: Optional[grpc.Compression] = None, + credentials_callback_args: CredentialsCallbackDict = None, ) -> None: self.target = target - self.credentials = credentials self.options = options self.compression = compression + self.credentials_callback = None + + if credentials_callback_args is not None: + self.credentials_callback = CredentialsCallback(**credentials_callback_args) def channel(self) -> grpc.Channel: """ @@ -49,7 +90,7 @@ def channel(self) -> grpc.Channel: returns grpc.secure_channel. """ - if self.credentials is None: + if self.credentials_callback is None: return grpc.insecure_channel( target=self.target, options=self.options, @@ -57,7 +98,7 @@ def channel(self) -> grpc.Channel: ) return grpc.secure_channel( target=self.target, - credentials=self.credentials, + credentials=self.credentials_callback.call(), options=self.options, compression=self.compression, ) @@ -70,7 +111,7 @@ def aio_channel(self) -> grpc.aio.Channel: returns grpc.aio.secure_channel. """ - if self.credentials is None: + if self.credentials_callback is None: return grpc.aio.insecure_channel( target=self.target, options=self.options, @@ -78,7 +119,7 @@ def aio_channel(self) -> grpc.aio.Channel: ) return grpc.aio.secure_channel( target=self.target, - credentials=self.credentials, + credentials=self.credentials_callback.call(), options=self.options, compression=self.compression, ) @@ -93,7 +134,7 @@ def serialize(self) -> dict: return { "target": self.target, - "credentials": self.credentials, + "credentials_callback_args": self.credentials_callback_args, "options": self.options, "compression": self.compression, } diff --git a/third_party/airflow/armada/operators/utils.py b/third_party/airflow/armada/operators/utils.py index c9eb9ff1f32..e3c68beb321 100644 --- a/third_party/airflow/armada/operators/utils.py +++ b/third_party/airflow/armada/operators/utils.py @@ -3,7 +3,7 @@ import os import time -from airflow.exceptions import AirflowFailException +from airflow.exceptions import AirflowException from typing import List, Optional, Tuple from enum import Enum @@ -61,7 +61,9 @@ def airflow_error(job_state: JobState, name: str, job_id: str): or job_state == JobState.JOB_ID_NOT_FOUND ): job_message = job_state.name - raise AirflowFailException(f"The Armada job {name}:{job_id} {job_message}") + # AirflowException allows operator-level retries. 
AirflowFailException + # does *not*. + raise AirflowException(f"The Armada job {name}:{job_id} {job_message}") def default_job_status_callable( diff --git a/third_party/airflow/pyproject.toml b/third_party/airflow/pyproject.toml index d63349c89f1..bd9814cc10c 100644 --- a/third_party/airflow/pyproject.toml +++ b/third_party/airflow/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "armada_airflow" -version = "0.5.0" +version = "0.5.3" description = "Armada Airflow Operator" requires-python = ">=3.7" # Note(JayF): This dependency value is not suitable for release. Whatever @@ -19,10 +19,10 @@ license = { text = "Apache Software License" } readme = "README.md" [project.optional-dependencies] -format = ["black==23.3.0", "flake8==6.0.0", "pylint==2.17.4"] -test = ["pytest==7.3.1", "coverage>=6.5.0", "pytest-asyncio==0.21.0"] +format = ["black==23.7.0", "flake8==6.1.0", "pylint==2.17.5"] +test = ["pytest==7.3.1", "coverage>=6.5.0", "pytest-asyncio==0.21.1"] # note(JayF): sphinx-jekyll-builder was broken by sphinx-markdown-builder 0.6 -- so pin to 0.5.5 -docs = ["sphinx==7.0.1", "sphinx-jekyll-builder==0.3.0", "sphinx-toolbox==3.2.0b1", "sphinx-markdown-builder==0.5.5"] +docs = ["sphinx==7.1.2", "sphinx-jekyll-builder==0.3.0", "sphinx-toolbox==3.2.0b1", "sphinx-markdown-builder==0.5.5"] [build-system] requires = ["setuptools"] diff --git a/third_party/airflow/tests/unit/test_airflow_error.py b/third_party/airflow/tests/unit/test_airflow_error.py index 24705e2532e..1e51c08e5ff 100644 --- a/third_party/airflow/tests/unit/test_airflow_error.py +++ b/third_party/airflow/tests/unit/test_airflow_error.py @@ -1,5 +1,5 @@ from armada.operators.utils import JobState, airflow_error -from airflow.exceptions import AirflowFailException +from airflow.exceptions import AirflowException import pytest testdata_success = [JobState.SUCCEEDED] @@ -19,6 +19,6 @@ def test_airflow_error_successful(state): @pytest.mark.parametrize("state, expected_exception_message", testdata_error) def test_airflow_error_states(state, expected_exception_message): - with pytest.raises(AirflowFailException) as airflow: + with pytest.raises(AirflowException) as airflow: airflow_error(state, "hello", "id") assert str(airflow.value) == expected_exception_message diff --git a/third_party/airflow/tests/unit/test_armada_operator.py b/third_party/airflow/tests/unit/test_armada_operator.py index 9970f321f92..571d634dc70 100644 --- a/third_party/airflow/tests/unit/test_armada_operator.py +++ b/third_party/airflow/tests/unit/test_armada_operator.py @@ -1,6 +1,14 @@ -from armada.operators.armada import ArmadaOperator +import copy +from unittest.mock import patch, Mock + +import grpc import pytest +from armada.jobservice import jobservice_pb2 +from armada.operators.armada import ArmadaOperator +from armada.operators.grpc import CredentialsCallback +from armada.operators.utils import JobState + get_lookout_url_test_cases = [ ( "http://localhost:8089/jobs?job_id=", @@ -35,3 +43,155 @@ def test_get_lookout_url(lookout_url_template, job_id, expected_url): ) assert operator._get_lookout_url(job_id) == expected_url + + +def test_deepcopy_operator(): + armada_channel_args = {"target": "127.0.0.1:50051"} + job_service_channel_args = {"target": "127.0.0.1:60003"} + + operator = ArmadaOperator( + task_id="test_task_id", + name="test_task", + armada_channel_args=armada_channel_args, + job_service_channel_args=job_service_channel_args, + armada_queue="test_queue", + job_request_items=[], + lookout_url_template="http://localhost:8089/jobs?job_id=", + ) + + try: 
+ copy.deepcopy(operator) + except Exception as e: + assert False, f"{e}" + + +@pytest.mark.skip("demonstrates how the old way of passing in credentials fails") +def test_deepcopy_operator_with_grpc_credentials(): + armada_channel_args = { + "target": "127.0.0.1:50051", + "credentials": grpc.composite_channel_credentials( + grpc.ssl_channel_credentials(), + grpc.metadata_call_credentials(("authorization", "fake_jwt")), + ), + } + job_service_channel_args = {"target": "127.0.0.1:60003"} + + operator = ArmadaOperator( + task_id="test_task_id", + name="test_task", + armada_channel_args=armada_channel_args, + job_service_channel_args=job_service_channel_args, + armada_queue="test_queue", + job_request_items=[], + lookout_url_template="http://localhost:8089/jobs?job_id=", + ) + + try: + copy.deepcopy(operator) + except Exception as e: + assert False, f"{e}" + + +def test_deepcopy_operator_with_grpc_credentials_callback(): + armada_channel_args = { + "target": "127.0.0.1:50051", + "credentials_callback_args": { + "module_name": "tests.unit.test_armada_operator", + "function_name": "__example_test_callback", + "function_kwargs": { + "test_arg": "fake_arg", + }, + }, + } + job_service_channel_args = {"target": "127.0.0.1:60003"} + + operator = ArmadaOperator( + task_id="test_task_id", + name="test_task", + armada_channel_args=armada_channel_args, + job_service_channel_args=job_service_channel_args, + armada_queue="test_queue", + job_request_items=[], + lookout_url_template="http://localhost:8089/jobs?job_id=", + ) + + try: + copy.deepcopy(operator) + except Exception as e: + assert False, f"{e}" + + +def __example_test_callback(foo=None): + return f"fake_cred {foo}" + + +def test_credentials_callback(): + callback = CredentialsCallback( + module_name="test_armada_operator", + function_name="__example_test_callback", + function_kwargs={"foo": "bar"}, + ) + + result = callback.call() + assert result == "fake_cred bar" + + +@patch("armada.operators.armada.search_for_job_complete") +@patch("armada.operators.armada.ArmadaClient", autospec=True) +@patch("armada.operators.armada.JobServiceClient", autospec=True) +def test_armada_operator_execute( + JobServiceClientMock, ArmadaClientMock, search_for_job_complete_mock +): + jsclient_mock = Mock() + jsclient_mock.health.return_value = jobservice_pb2.HealthCheckResponse( + status=jobservice_pb2.HealthCheckResponse.SERVING + ) + + JobServiceClientMock.return_value = jsclient_mock + + item = Mock() + item.job_id = "fake_id" + + job = Mock() + job.job_response_items = [ + item, + ] + + aclient_mock = Mock() + aclient_mock.submit_jobs.return_value = job + ArmadaClientMock.return_value = aclient_mock + + search_for_job_complete_mock.return_value = (JobState.SUCCEEDED, "No error") + + armada_channel_args = {"target": "127.0.0.1:50051"} + job_service_channel_args = {"target": "127.0.0.1:60003"} + + operator = ArmadaOperator( + task_id="test_task_id", + name="test_task", + armada_channel_args=armada_channel_args, + job_service_channel_args=job_service_channel_args, + armada_queue="test_queue", + job_request_items=[], + lookout_url_template="https://lookout.armada.domain/jobs?job_id=", + ) + + task_instance = Mock() + task_instance.task_id = "mock_task_id" + + dag = Mock() + dag.dag_id = "mock_dag_id" + + context = { + "run_id": "mock_run_id", + "ti": task_instance, + "dag": dag, + } + + try: + operator.execute(context) + except Exception as e: + assert False, f"{e}" + + jsclient_mock.health.assert_called() + aclient_mock.submit_jobs.assert_called() diff --git 
a/tools.yaml b/tools.yaml index 6f98c7d807d..e50e8859b2b 100644 --- a/tools.yaml +++ b/tools.yaml @@ -3,7 +3,7 @@ tools: - github.com/go-swagger/go-swagger/cmd/swagger@v0.29.0 - github.com/gordonklaus/ineffassign@v0.0.0-20210914165742-4cc7213b9bc8 -- github.com/goreleaser/goreleaser@v1.18.2 +- github.com/goreleaser/goreleaser@v1.20.0 - github.com/grpc-ecosystem/grpc-gateway/protoc-gen-grpc-gateway@v1.16.0 - github.com/grpc-ecosystem/grpc-gateway/protoc-gen-swagger@v1.16.0 - github.com/jstemmer/go-junit-report@v1.0.0