From b12bfc72978e9e67b8eb536fc28a6530a376cb98 Mon Sep 17 00:00:00 2001 From: Dejan Pejchev Date: Tue, 20 Jun 2023 02:51:15 +0200 Subject: [PATCH 01/14] feat: fix release ci --- .github/workflows/release-validate.yml | 56 ++++++++++++++++++++++++++ .github/workflows/release.yml | 11 +++-- 2 files changed, 63 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/release-validate.yml diff --git a/.github/workflows/release-validate.yml b/.github/workflows/release-validate.yml new file mode 100644 index 00000000000..be90f63d1ee --- /dev/null +++ b/.github/workflows/release-validate.yml @@ -0,0 +1,56 @@ +name: Validate Release + +on: + push: + tags: + - v* + +permissions: + contents: write + +jobs: + compare_tags: + if: github.repository_owner == 'armadaproject' + name: "Compare tags" + runs-on: ubuntu-22.04 + + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + with: + fetch-depth: 0 + + - name: Compare tags + env: + ALLOWED_BRANCH: "master" + run: | + ref=${{ github.ref }} + tag=${ref#refs/tags/} + echo "Current tag: $tag" + sha=${{ github.sha }} + echo "Current sha: $sha" + result=0 + case $tag in + v?*) + latest_tag_commit=$(git rev-parse refs/tags/$tag^{}) + git branch --contains=$sha $ALLOWED_BRANCH >> /dev/null + branch_contains_commit=$? + + if [[ $branch_contains_commit -eq 0 && "$latest_tag_commit" == "$sha" ]]; then + result=0 + else + result=1 + fi + ;; + *) + echo "Invalid tag $tag" + result=1 + ;; + esac + if [ $result -ne 0 ]; then + echo "Latest tag ($tag) does not match the current commit ($sha)." + echo "::error ::Invalid ref $ref $sha" + exit 1 + else + echo "Latest tag ($tag) matches the current commit ($sha)." + fi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 188bd802687..3a996ab9ac5 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,16 +1,19 @@ name: Release Armada components on: - push: - tags: - - 'v*' + workflow_run: + types: [completed] + workflows: [Validate Release] + branches: + - master + - v* permissions: contents: write jobs: release: - if: github.repository_owner == 'armadaproject' + if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' name: "Release" runs-on: ubuntu-22.04 environment: armada-dockerhub From 45e1c9e777d8d975507935d41890f4a25b5b8482 Mon Sep 17 00:00:00 2001 From: Dejan Pejchev Date: Wed, 28 Jun 2023 16:36:11 +0200 Subject: [PATCH 02/14] change release workflows to depend on ci workflow --- .github/workflows/release-rc.yml | 36 +++++++++++++++++++++++++++++--- .github/workflows/release.yml | 33 +++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release-rc.yml b/.github/workflows/release-rc.yml index e479211ff3c..85e41026c86 100644 --- a/.github/workflows/release-rc.yml +++ b/.github/workflows/release-rc.yml @@ -1,18 +1,48 @@ name: Release Armada components - RC on: - push: + workflow_run: + types: [completed] + workflows: [CI] branches: - - main - master permissions: contents: write jobs: + validate: + if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' + name: "Validate revision" + runs-on: ubuntu-22.04 + + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + with: + fetch-depth: 0 + + - name: Validate ref + run: | + ref=${{ github.event.workflow_run.head_branch }} + sha=${{ github.event.workflow_run.head_sha 
}} + case $ref in + v?*) + [ $(git rev-parse refs/tags/$ref) == $sha ] && + [ $(git branch --contains=$sha main | wc -l) -eq 1 ] + ;; + *) + false + ;; + esac + if [ $? -ne 0 ]; then + echo "::error ::Invalid ref $ref $sha" + exit 1 + fi release: - if: github.repository_owner == 'armadaproject' + if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' name: Release + needs: validate runs-on: "ubuntu-22.04" environment: armada-dockerhub diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3a996ab9ac5..39677d35809 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -3,18 +3,47 @@ name: Release Armada components on: workflow_run: types: [completed] - workflows: [Validate Release] + workflows: [CI] branches: - - master - v* permissions: contents: write jobs: + validate: + if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' + name: "Validate revision" + runs-on: ubuntu-22.04 + + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + with: + fetch-depth: 0 + + - name: Validate ref + run: | + ref=${{ github.event.workflow_run.head_branch }} + sha=${{ github.event.workflow_run.head_sha }} + case $ref in + v?*) + [ $(git rev-parse refs/tags/$ref) == $sha ] && + [ $(git branch --contains=$sha main | wc -l) -eq 1 ] + ;; + *) + false + ;; + esac + if [ $? -ne 0 ]; then + echo "::error ::Invalid ref $ref $sha" + exit 1 + fi + release: if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' name: "Release" + needs: validate runs-on: ubuntu-22.04 environment: armada-dockerhub From 46699a99f68c80485f7a86b31b5872efef5d4fe4 Mon Sep 17 00:00:00 2001 From: Dejan Pejchev Date: Wed, 5 Jul 2023 16:01:00 +0200 Subject: [PATCH 03/14] update rc job --- .github/workflows/release-rc.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release-rc.yml b/.github/workflows/release-rc.yml index 85e41026c86..0dc9bad183d 100644 --- a/.github/workflows/release-rc.yml +++ b/.github/workflows/release-rc.yml @@ -27,9 +27,9 @@ jobs: ref=${{ github.event.workflow_run.head_branch }} sha=${{ github.event.workflow_run.head_sha }} case $ref in - v?*) - [ $(git rev-parse refs/tags/$ref) == $sha ] && - [ $(git branch --contains=$sha main | wc -l) -eq 1 ] + main) + [ $(git branch --contains=$sha main | wc -l) -eq 1 ] && + [ $(git rev-list --count $sha..main) -le 2 ] ;; *) false From 3f699bfc6c17dd10ddb5222390f0d4aea66d0bfb Mon Sep 17 00:00:00 2001 From: Dejan Pejchev Date: Wed, 5 Jul 2023 16:02:40 +0200 Subject: [PATCH 04/14] test release job --- .github/workflows/release-rc.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release-rc.yml b/.github/workflows/release-rc.yml index 0dc9bad183d..bc288869dd6 100644 --- a/.github/workflows/release-rc.yml +++ b/.github/workflows/release-rc.yml @@ -12,7 +12,7 @@ permissions: jobs: validate: - if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' + if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' name: "Validate revision" runs-on: ubuntu-22.04 @@ -40,7 +40,7 @@ jobs: exit 1 fi release: - if: github.event.workflow_run.event == 'push' && 
github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' + if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' name: Release needs: validate runs-on: "ubuntu-22.04" From 543bf6377fb881729510f44e51e09793d4c0815d Mon Sep 17 00:00:00 2001 From: Dejan Pejchev Date: Wed, 5 Jul 2023 16:18:55 +0200 Subject: [PATCH 05/14] add protection for branch/tag name injection --- .github/workflows/release-rc.yml | 4 ++-- .github/workflows/release.yml | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release-rc.yml b/.github/workflows/release-rc.yml index bc288869dd6..8e57272fb69 100644 --- a/.github/workflows/release-rc.yml +++ b/.github/workflows/release-rc.yml @@ -24,8 +24,8 @@ jobs: - name: Validate ref run: | - ref=${{ github.event.workflow_run.head_branch }} - sha=${{ github.event.workflow_run.head_sha }} + ref='${{ github.event.workflow_run.head_branch }}' + sha='${{ github.event.workflow_run.head_sha }}' case $ref in main) [ $(git branch --contains=$sha main | wc -l) -eq 1 ] && diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 39677d35809..a00fe10def2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -24,10 +24,16 @@ jobs: - name: Validate ref run: | - ref=${{ github.event.workflow_run.head_branch }} - sha=${{ github.event.workflow_run.head_sha }} + ref='${{ github.event.workflow_run.head_branch }}' + sha='${{ github.event.workflow_run.head_sha }}' case $ref in v?*) + semver_pattern="^v[0-9]+\.[0-9]+\.[0-9]+$" + # Check if the tag/branch name matches the semver pattern + if [[ ! $ref =~ $semver_pattern ]]; then + echo "::error ::Invalid ref $ref. It must be in semver format vX.Y.Z!" 
+ exit 1 + fi [ $(git rev-parse refs/tags/$ref) == $sha ] && [ $(git branch --contains=$sha main | wc -l) -eq 1 ] ;; From df7537235b3b2a47badc16a67dd7d08e82bcd644 Mon Sep 17 00:00:00 2001 From: Dejan Zele Pejchev Date: Wed, 5 Jul 2023 16:33:56 +0200 Subject: [PATCH 06/14] Feat/release ci fix (#44) Co-authored-by: Albin Severinson Co-authored-by: JamesMurkin Co-authored-by: Carlo Camurri Co-authored-by: Noah Held <41909795+zuqq@users.noreply.github.com> Co-authored-by: Rich Scott Co-authored-by: Jay Faulkner Co-authored-by: Adam McArthur <46480158+Sharpz7@users.noreply.github.com> Co-authored-by: Kevin Hannon --- .github/workflows/ci.yml | 44 ++ .github/workflows/codeql-analysis.yml | 12 +- .github/workflows/coverage.yml | 26 - .github/workflows/go-integration.yml | 73 -- .github/workflows/go.yml | 66 -- .github/workflows/lint.yml | 53 ++ .github/workflows/not-ts.yml | 27 - .github/workflows/release-rc.yml | 36 +- .github/workflows/release-validate.yml | 56 ++ .github/workflows/release.yml | 46 +- .github/workflows/test.yml | 248 +++++++ .github/workflows/ts.yml | 47 -- .gitignore | 1 + client/python/pyproject.toml | 3 +- config/armada/config.yaml | 21 +- config/executor/config.yaml | 3 + config/scheduler/config.yaml | 20 +- .../templates/scheduler-statefulset.yaml | 10 + deployment/scheduler/values.yaml | 2 + docs/developer.md | 10 +- docs/developer/ubuntu-setup.md | 164 +++++ docs/developer/ui.md | 4 +- internal/armada/configuration/constants.go | 10 +- internal/armada/configuration/types.go | 30 +- internal/armada/server/applydefaults.go | 15 + internal/armada/server/applydefaults_test.go | 47 ++ internal/armada/server/lease.go | 293 ++++---- internal/armada/server/submit.go | 1 + internal/common/database/lookout/jobstates.go | 12 + internal/common/validation/job.go | 18 +- internal/common/validation/job_test.go | 24 +- internal/executor/application.go | 4 +- internal/executor/configuration/types.go | 12 + internal/executor/job/job_run_state_store.go | 12 +- .../executor/job/job_run_state_store_test.go | 15 +- .../executor/reporter/job_event_reporter.go | 1 + .../executor/service/pod_issue_handler.go | 330 ++++++--- .../service/pod_issue_handler_test.go | 145 +++- .../utilisation/cluster_utilisation.go | 4 +- internal/lookout/ui/package.json | 1 + internal/lookout/ui/yarn.lock | 20 + .../lookoutv2/conversions/convert_test.go | 10 +- internal/lookoutv2/gen/models/group.go | 10 +- .../lookoutv2/gen/restapi/embedded_spec.go | 4 +- internal/lookoutv2/model/model.go | 2 +- internal/lookoutv2/repository/aggregates.go | 133 ++++ internal/lookoutv2/repository/fieldparser.go | 122 ++++ internal/lookoutv2/repository/groupjobs.go | 178 ++--- .../lookoutv2/repository/groupjobs_test.go | 374 +++++++++- .../repository/{common.go => querybuilder.go} | 54 +- .../{common_test.go => querybuilder_test.go} | 59 ++ internal/lookoutv2/repository/tables.go | 8 +- internal/lookoutv2/repository/util.go | 49 ++ internal/lookoutv2/swagger.yaml | 2 +- internal/scheduler/api.go | 133 ++-- internal/scheduler/api_test.go | 4 +- internal/scheduler/common.go | 149 +--- internal/scheduler/common_test.go | 12 +- .../scheduler/configuration/configuration.go | 22 +- internal/scheduler/constraints/constraints.go | 30 +- internal/scheduler/context/context.go | 255 ++++--- internal/scheduler/context/context_test.go | 16 +- internal/scheduler/gang_scheduler.go | 131 +++- internal/scheduler/gang_scheduler_test.go | 203 ++++-- internal/scheduler/interfaces/interfaces.go | 19 +- internal/scheduler/jobdb/job.go | 71 +- 
internal/scheduler/jobdb/job_test.go | 1 - internal/scheduler/jobiteration.go | 14 +- internal/scheduler/leader.go | 4 + internal/scheduler/leader_metrics.go | 63 ++ internal/scheduler/leader_metrics_test.go | 56 ++ internal/scheduler/metrics.go | 2 +- internal/scheduler/nodedb/nodedb.go | 686 ++++++++++-------- internal/scheduler/nodedb/nodedb_test.go | 499 +++++++------ internal/scheduler/nodedb/nodeiteration.go | 44 +- .../scheduler/nodedb/nodeiteration_test.go | 140 ++-- internal/scheduler/pool_assigner.go | 48 +- .../scheduler/preempting_queue_scheduler.go | 207 +++--- .../preempting_queue_scheduler_test.go | 600 ++++++++++----- internal/scheduler/queue_scheduler.go | 65 +- internal/scheduler/queue_scheduler_test.go | 216 +++--- internal/scheduler/reports.go | 8 +- internal/scheduler/reports_test.go | 25 +- internal/scheduler/scheduler.go | 33 +- internal/scheduler/scheduler_test.go | 18 +- internal/scheduler/schedulerapp.go | 25 +- .../schedulerobjects/nodematching.go | 24 +- .../schedulerobjects/nodematching_test.go | 2 +- .../scheduler/schedulerobjects/nodetype.go | 4 - .../scheduler/schedulerobjects/podutils.go | 19 - .../schedulerobjects/requirements.go | 11 - .../schedulerobjects/resourcelist.go | 106 +-- .../schedulerobjects/resourcelist_test.go | 187 +++-- .../schedulerobjects/schedulerobjects.pb.go | 483 ++++++++---- .../schedulerobjects/schedulerobjects.proto | 6 +- .../schedulerobjects/schedulinginfo.go | 10 + internal/scheduler/scheduling_algo.go | 132 ++-- internal/scheduler/scheduling_algo_test.go | 131 ++-- internal/scheduler/submitcheck.go | 100 +-- internal/scheduler/submitcheck_test.go | 23 +- .../scheduler/testfixtures/testfixtures.go | 157 ++-- internal/scheduleringester/dbops.go | 2 +- internal/scheduleringester/instructions.go | 25 +- .../scheduleringester/instructions_test.go | 15 +- internal/scheduleringester/schedulerdb.go | 39 +- magefiles/main.go | 3 + makefile | 9 +- pkg/api/util.go | 45 +- pkg/api/util_test.go | 186 ++--- third_party/airflow/pyproject.toml | 3 +- 110 files changed, 5329 insertions(+), 3163 deletions(-) create mode 100644 .github/workflows/ci.yml delete mode 100644 .github/workflows/coverage.yml delete mode 100644 .github/workflows/go-integration.yml delete mode 100644 .github/workflows/go.yml create mode 100644 .github/workflows/lint.yml delete mode 100644 .github/workflows/not-ts.yml create mode 100644 .github/workflows/release-validate.yml create mode 100644 .github/workflows/test.yml delete mode 100644 .github/workflows/ts.yml create mode 100644 docs/developer/ubuntu-setup.md create mode 100644 internal/lookoutv2/repository/aggregates.go create mode 100644 internal/lookoutv2/repository/fieldparser.go rename internal/lookoutv2/repository/{common.go => querybuilder.go} (95%) rename internal/lookoutv2/repository/{common_test.go => querybuilder_test.go} (88%) create mode 100644 internal/scheduler/leader_metrics.go create mode 100644 internal/scheduler/leader_metrics_test.go delete mode 100644 internal/scheduler/schedulerobjects/requirements.go create mode 100644 internal/scheduler/schedulerobjects/schedulinginfo.go diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000000..cb3a5723961 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,44 @@ +name: CI + +on: + push: + branches-ignore: + - gh-pages + pull_request: + branches-ignore: + - gh-pages + schedule: + # Run daily at 01:34, so we get notified if CI is broken before a pull request + # is submitted. 
+ - cron: "34 1 * * *" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + pull-requests: read + checks: write + actions: read + security-events: write + +jobs: + lint: + uses: ./.github/workflows/lint.yml + codeql: + uses: ./.github/workflows/codeql-analysis.yml + test: + uses: ./.github/workflows/test.yml + # Virtual job that can be configured as a required check before a PR can be merged. + all-required-checks-done: + name: All required checks done + needs: + - lint + - codeql + - test + runs-on: ubuntu-22.04 + steps: + - run: | + echo "All required checks done" + diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 1a6c5d01ed4..cda45fd1ef3 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -12,13 +12,11 @@ name: "CodeQL" on: - push: - branches: [ master ] - pull_request: - # The branches below must be a subset of the branches above - branches: [ master ] - schedule: - - cron: '15 2 * * 4' + workflow_call: + +permissions: + actions: read + security-events: write jobs: analyze: diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml deleted file mode 100644 index 5e7d5d86aec..00000000000 --- a/.github/workflows/coverage.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: Code Coverage - -on: - push: - pull_request: - -jobs: - go-unit-coverage: - runs-on: ubuntu-22.04 - strategy: - fail-fast: false - matrix: - go: [ '1.20' ] - - steps: - - uses: actions/checkout@v3 - - uses: ./.github/workflows/go-setup - - - name: make tests - run: make tests - - - name: Go Test Coverage - uses: codecov/codecov-action@v3 - with: - files: ./internal_coverage.xml,./cmd_coverage.xml # optional - flags: armada-server diff --git a/.github/workflows/go-integration.yml b/.github/workflows/go-integration.yml deleted file mode 100644 index 9ddc2c254c3..00000000000 --- a/.github/workflows/go-integration.yml +++ /dev/null @@ -1,73 +0,0 @@ -name: Go End to End - -on: - push: - branches-ignore: - - master - pull_request: - branches-ignore: - - gh-pages - -jobs: - go-integration-tests: - if: github.repository_owner == 'armadaproject' - strategy: - fail-fast: false - matrix: - # WARN(JayF): If we begin attempting to support >1 golang version, - # we'll have to ensure that we don't use the same artifact name for - # both versions in the matrix -- this will lead to them overwriting - # each other. - go: [ '1.20' ] - # As of December 2022, using 8vcpu runners is slower overall, - # due to longer queue times. - runs-on: ubuntu-22.04 - env: - # Cache Docker layers in the Github actions cache. - # These variables are picked up by the goreleaser config. - DOCKER_BUILDX_CACHE_FROM: "type=gha" - DOCKER_BUILDX_CACHE_TO: "type=gha,mode=max" - DOCKER_BUILDX_BUILDER: "builder" - # Seems that for optimal performance, we need to set these - # explicitly - including GOBIN. 
- GOCACHE: "/home/runner/.cache/go-build" - GOPATH: "/home/runner/go" - GOBIN: "/home/runner/go/bin" - steps: - - uses: actions/checkout@v3 - - run: docker buildx create --name ${DOCKER_BUILDX_BUILDER} --driver docker-container --use - - run: docker buildx install - - uses: actions/setup-go@v4 - with: - go-version: ${{ matrix.go }} - cache: false - - name: Setup Golang caches - uses: actions/cache@v3 - with: - path: | - /home/runner/.cache/go-build - /home/runner/go - /home/runner/go/bin - key: ${{ runner.os }}-golang-${{ hashFiles('**/go.sum') }} - restore-keys: | - ${{ runner.os }}-golang- - - name: Setup and integration tests - run: | - # Manually create folders to ensure perms are correct. - mkdir -p .kube/internal - mkdir -p .kube/external - go run github.com/magefile/mage@v1.14.0 -v localdev minimal testsuite - - name: Upload JUnit report artifact - uses: actions/upload-artifact@v3.1.1 - with: - name: junit.xml - path: junit.xml - if-no-files-found: error - - name: Publish JUnit report - uses: mikepenz/action-junit-report@v3.6.1 - if: always() - with: - report_paths: junit.xml - fail_on_failure: true - require_tests: true - detailed_summary: true diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml deleted file mode 100644 index 84c6b403229..00000000000 --- a/.github/workflows/go.yml +++ /dev/null @@ -1,66 +0,0 @@ -name: Go Tests - -on: - push: - branches-ignore: - - master - pull_request: - branches-ignore: - - gh-pages - -jobs: - go-lint: - # TODO(JayF): Configure this to only run when golang code has changed - runs-on: ubuntu-22.04 - strategy: - fail-fast: false - matrix: - go: [ '1.20' ] - steps: - - uses: actions/checkout@v3 - - name: Install Protoc - uses: arduino/setup-protoc@v1 - with: - version: '3.17.3' - repo-token: ${{ secrets.GITHUB_TOKEN }} - - uses: ./.github/workflows/go-setup - - - name: golangci-lint - uses: golangci/golangci-lint-action@v3 - with: - version: "v1.53.1" - skip-pkg-cache: true - skip-build-cache: true - args: "-c ./.golangci.yml --timeout=10m --issues-exit-code=1 --max-issues-per-linter=0 --sort-results ./..." - - # TODO(JayF): Consider moving this into its own job, that runs under a larger set of circumstances - # since it's possible for this to fail without any go changes being made. 
- - name: Validate no changes in generated proto files - run: | - make proto - make dotnet - git status -s -uno - git --no-pager diff - exit $(git status -s -uno | wc -l) - - go-unit-tests: - runs-on: ubuntu-22.04 - strategy: - fail-fast: false - matrix: - go: [ '1.20' ] - steps: - - uses: actions/checkout@v3 - - uses: ./.github/workflows/go-setup - - - name: make tests - run: make tests - - - name: make junit-report - run: make junit-report - - - name: Upload junit report - uses: actions/upload-artifact@v3.1.1 - with: - name: junit.xml - path: test_reports/junit.xml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000000..5572e4b9d99 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,53 @@ +name: "Lint" + +on: + workflow_call: + +permissions: + contents: read + pull-requests: read + +jobs: + ts-lint: + # TODO(JayF): Determine what nodejs versions we target, and setup matrix-based testing similar to what we do for go + name: Lint TypeScript + runs-on: ubuntu-22.04 + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup Node + uses: actions/setup-node@v3 + with: + node-version: 16.14.2 + cache: yarn + cache-dependency-path: ./internal/lookout/ui/yarn.lock + + - name: Check TypeScript formatting + run: | + yarn install --frozen-lockfile && yarn run fmt || exit 1 + exit $(git status -s -uno | wc -l) + working-directory: ./internal/lookout/ui + + go-lint: + name: Lint Go + runs-on: ubuntu-22.04 + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup Golang with Cache + uses: magnetikonline/action-golang-cache@v4 + with: + go-version: "1.20" + + - name: Lint using golangci-lint + uses: golangci/golangci-lint-action@v3 + with: + skip-pkg-cache: true + skip-build-cache: true + version: v1.52.2 + only-new-issues: true + args: --timeout=10m --issues-exit-code=1 --sort-results ./... 
diff --git a/.github/workflows/not-ts.yml b/.github/workflows/not-ts.yml deleted file mode 100644 index 19fe723f2c3..00000000000 --- a/.github/workflows/not-ts.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: Lookout - -on: - push: - branches-ignore: - - master - paths-ignore: - - 'internal/lookout/ui/**' - - 'build/lookout/**' - - '.github/workflows/ts.yml' - pull_request: - branches-ignore: - - gh-pages - paths-ignore: - - 'internal/lookout/ui/**' - - 'build/lookout/**' - - '.github/workflows/ts.yml' - -jobs: - ts-lint: - runs-on: ubuntu-latest - steps: - - run: 'echo "No lookout code modified, not running lookout jobs"' - ts-unit-tests: - runs-on: ubuntu-latest - steps: - - run: 'echo "No lookout code modified, not running lookout jobs"' diff --git a/.github/workflows/release-rc.yml b/.github/workflows/release-rc.yml index e479211ff3c..8e57272fb69 100644 --- a/.github/workflows/release-rc.yml +++ b/.github/workflows/release-rc.yml @@ -1,18 +1,48 @@ name: Release Armada components - RC on: - push: + workflow_run: + types: [completed] + workflows: [CI] branches: - - main - master permissions: contents: write jobs: + validate: + if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' + name: "Validate revision" + runs-on: ubuntu-22.04 + + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + with: + fetch-depth: 0 + + - name: Validate ref + run: | + ref='${{ github.event.workflow_run.head_branch }}' + sha='${{ github.event.workflow_run.head_sha }}' + case $ref in + main) + [ $(git branch --contains=$sha main | wc -l) -eq 1 ] && + [ $(git rev-list --count $sha..main) -le 2 ] + ;; + *) + false + ;; + esac + if [ $? -ne 0 ]; then + echo "::error ::Invalid ref $ref $sha" + exit 1 + fi release: - if: github.repository_owner == 'armadaproject' + if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' name: Release + needs: validate runs-on: "ubuntu-22.04" environment: armada-dockerhub diff --git a/.github/workflows/release-validate.yml b/.github/workflows/release-validate.yml new file mode 100644 index 00000000000..be90f63d1ee --- /dev/null +++ b/.github/workflows/release-validate.yml @@ -0,0 +1,56 @@ +name: Validate Release + +on: + push: + tags: + - v* + +permissions: + contents: write + +jobs: + compare_tags: + if: github.repository_owner == 'armadaproject' + name: "Compare tags" + runs-on: ubuntu-22.04 + + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + with: + fetch-depth: 0 + + - name: Compare tags + env: + ALLOWED_BRANCH: "master" + run: | + ref=${{ github.ref }} + tag=${ref#refs/tags/} + echo "Current tag: $tag" + sha=${{ github.sha }} + echo "Current sha: $sha" + result=0 + case $tag in + v?*) + latest_tag_commit=$(git rev-parse refs/tags/$tag^{}) + git branch --contains=$sha $ALLOWED_BRANCH >> /dev/null + branch_contains_commit=$? + + if [[ $branch_contains_commit -eq 0 && "$latest_tag_commit" == "$sha" ]]; then + result=0 + else + result=1 + fi + ;; + *) + echo "Invalid tag $tag" + result=1 + ;; + esac + if [ $result -ne 0 ]; then + echo "Latest tag ($tag) does not match the current commit ($sha)." + echo "::error ::Invalid ref $ref $sha" + exit 1 + else + echo "Latest tag ($tag) matches the current commit ($sha)." 
+ fi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 188bd802687..a00fe10def2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,17 +1,55 @@ name: Release Armada components on: - push: - tags: - - 'v*' + workflow_run: + types: [completed] + workflows: [CI] + branches: + - v* permissions: contents: write jobs: + validate: + if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' + name: "Validate revision" + runs-on: ubuntu-22.04 + + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + with: + fetch-depth: 0 + + - name: Validate ref + run: | + ref='${{ github.event.workflow_run.head_branch }}' + sha='${{ github.event.workflow_run.head_sha }}' + case $ref in + v?*) + semver_pattern="^v[0-9]+\.[0-9]+\.[0-9]+$" + # Check if the tag/branch name matches the semver pattern + if [[ ! $ref =~ $semver_pattern ]]; then + echo "::error ::Invalid ref $ref. It must be in semver format vX.Y.Z!" + exit 1 + fi + [ $(git rev-parse refs/tags/$ref) == $sha ] && + [ $(git branch --contains=$sha main | wc -l) -eq 1 ] + ;; + *) + false + ;; + esac + if [ $? -ne 0 ]; then + echo "::error ::Invalid ref $ref $sha" + exit 1 + fi + release: - if: github.repository_owner == 'armadaproject' + if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' name: "Release" + needs: validate runs-on: ubuntu-22.04 environment: armada-dockerhub diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000000..ce0e9cf58c9 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,248 @@ +name: Code Build and Tests + +on: + workflow_call: + +permissions: + contents: read + checks: write + +jobs: + ts-unit-tests: + name: TypeScript Unit Tests + runs-on: ubuntu-22.04 + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup Node + uses: actions/setup-node@v3 + with: + node-version: 16.14.2 + cache: yarn + cache-dependency-path: ./internal/lookout/ui/yarn.lock + + - name: Run Unit Tests + run: | + yarn install --frozen-lockfile && yarn run openapi && CI=true yarn run test --reporters=jest-junit + working-directory: ./internal/lookout/ui + + - name: Publish JUnit Report + uses: mikepenz/action-junit-report@v3 + if: always() + with: + report_paths: ./internal/lookout/ui/junit.xml + fail_on_failure: true + require_tests: true + detailed_summary: true + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload Test Reports Artifacts + if: always() + uses: actions/upload-artifact@v3 + with: + name: ts-unit-test-reports + path: ./internal/lookout/ui/junit.xml + if-no-files-found: error + + - name: Send Coverage Report to Codecov + if: always() + uses: codecov/codecov-action@v3 + with: + file: ./internal/lookout/ui/coverage/cobertura-coverage.xml + flags: unittests + name: codecov-armada-ts-unit-tests + verbose: true + + go-unit-tests: + name: Golang Unit Tests + runs-on: ubuntu-22.04 + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup Golang with Cache + uses: magnetikonline/action-golang-cache@v4 + with: + go-version: "1.20" + + - name: Setup dependencies + shell: bash + run: make download + + - name: Unit Tests + id: unit_test + run: make tests + + - name: Publish JUnit Report + uses: mikepenz/action-junit-report@v3 + if: always() + with: + report_paths: test-reports/unit-tests.xml + fail_on_failure: true + 
require_tests: true + detailed_summary: true + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload Test Reports Artifacts + if: always() + uses: actions/upload-artifact@v3 + with: + name: go-unit-test-reports + path: test-reports/ + if-no-files-found: error + + - name: Send Coverage Report to Codecov + if: always() + uses: codecov/codecov-action@v3 + with: + file: ./test-reports/coverage.out + flags: unittests + name: codecov-armada-go-unit-tests + verbose: true + + go-integration-tests: + name: Golang Integration Tests + runs-on: ubuntu-22.04 + + env: + ARMADA_EXECUTOR_INGRESS_URL: "http://localhost" + ARMADA_EXECUTOR_INGRESS_PORT: 5001 + # Cache Docker layers in the GitHub actions cache. + # These variables are picked up by the goreleaser config. + DOCKER_BUILDX_CACHE_FROM: "type=gha" + DOCKER_BUILDX_CACHE_TO: "type=gha,mode=max" + DOCKER_BUILDX_BUILDER: "builder" + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Create Docker Buildx Builder + run: docker buildx create --name ${DOCKER_BUILDX_BUILDER} --driver docker-container --use + + - name: Install Docker Buildx + run: docker buildx install + + - name: Setup Golang with Cache + uses: magnetikonline/action-golang-cache@v4 + with: + go-version: "1.20" + + - name: Setup dependencies + shell: bash + run: make download + + - name: Setup and Run Integration Tests + run: | + # Manually create folders to ensure perms are correct. + mkdir -p .kube/internal + mkdir -p .kube/external + go run github.com/magefile/mage@v1.14.0 -v localdev minimal testsuite + + - name: Upload JUnit Report Artifact + uses: actions/upload-artifact@v3 + with: + name: go-integration-test-reports + path: junit.xml + if-no-files-found: error + + - name: Publish JUnit Report + uses: mikepenz/action-junit-report@v3 + if: always() + with: + report_paths: junit.xml + fail_on_failure: true + require_tests: true + detailed_summary: true + token: ${{ secrets.GITHUB_TOKEN }} + + go-mod-up-to-date: + name: Golang Mod Up To Date + runs-on: ubuntu-22.04 + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Setup Golang with Cache + uses: magnetikonline/action-golang-cache@v4 + with: + go-version: "1.20" + + - name: Download all Go modules + run: go mod download + + - name: Check for tidyness of go.mod and go.sum + run: | + go mod tidy + + changed=$(git status -s -uno | wc -l) + + echo -e "### Git status" >> $GITHUB_STEP_SUMMARY + if [[ "$changed" -gt 0 ]]; then + echo -e "Go modules are not synchronized. Please run 'go mod tidy' and commit the changes." >> $GITHUB_STEP_SUMMARY + + git status -s -uno >> $GITHUB_STEP_SUMMARY + + echo -e >> $GITHUB_STEP_SUMMARY + echo -e "### Git diff" >> $GITHUB_STEP_SUMMARY + + git --no-pager diff >> $GITHUB_STEP_SUMMARY + else + echo -e "Go modules are synchronized." >> $GITHUB_STEP_SUMMARY + echo -e >> $GITHUB_STEP_SUMMARY + fi + + exit $changed + + proto-up-to-date: + name: Proto Up To Date + runs-on: ubuntu-22.04 + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Install Protoc + uses: arduino/setup-protoc@v1 + with: + version: '3.17.3' + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Go + uses: actions/setup-go@v4 + with: + go-version: ${{ matrix.go }} + + - name: Setup dependencies + shell: bash + run: make download + + # TODO(JayF): Consider moving this into its own job, that runs under a larger set of circumstances + # since it's possible for this to fail without any go changes being made. 
+ - name: Validate no changes in generated proto files + run: | + make proto + make dotnet + + changed=$(git status -s -uno | wc -l) + + echo -e "### Git status" >> $GITHUB_STEP_SUMMARY + if [[ "$changed" -gt 0 ]]; then + echo -e "Generated proto files are out of date. Please run 'make proto' and commit the changes." >> $GITHUB_STEP_SUMMARY + + git status -s -uno >> $GITHUB_STEP_SUMMARY + + echo -e >> $GITHUB_STEP_SUMMARY + echo -e "### Git diff" >> $GITHUB_STEP_SUMMARY + + git --no-pager diff >> $GITHUB_STEP_SUMMARY + else + echo -e "Generated proto files are up to date." >> $GITHUB_STEP_SUMMARY + echo -e >> $GITHUB_STEP_SUMMARY + fi + + exit $changed diff --git a/.github/workflows/ts.yml b/.github/workflows/ts.yml deleted file mode 100644 index 52887248a84..00000000000 --- a/.github/workflows/ts.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: Lookout - -on: - push: - branches-ignore: - - master - paths: - - 'internal/lookout/ui/**' - - 'build/lookout/**' - - '.github/workflows/ts.yml' - pull_request: - branches-ignore: - - gh-pages - paths: - - 'internal/lookout/ui/**' - - 'build/lookout/**' - - '.github/workflows/ts.yml' - -jobs: - ts-lint: - # TODO(JayF): Determine what nodejs versions we target, and setup matrix-based testing similar to what we do for go - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-node@v3 - with: - node-version: 16.14.2 - - - name: Check TypeScript formatting - run: | - yarn install --frozen-lockfile && yarn run fmt || exit 1 - exit $(git status -s -uno | wc -l) - working-directory: ./internal/lookout/ui - ts-unit-tests: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-node@v3 - with: - node-version: 16.14.2 - - - name: Run Unit Tests - run: | - yarn install --frozen-lockfile && yarn run openapi && CI=true yarn run test - working-directory: ./internal/lookout/ui - - diff --git a/.gitignore b/.gitignore index 9394f082479..60f429425d7 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ bin/ *.test # Output of the go coverage tool, specifically when used with LiteIDE +test-reports *.out # Dependency directories (remove the comment below to include it) diff --git a/client/python/pyproject.toml b/client/python/pyproject.toml index d4e8c08a996..ce74f88a5ef 100644 --- a/client/python/pyproject.toml +++ b/client/python/pyproject.toml @@ -10,7 +10,8 @@ authors = [{ name = "G-Research Open Source Software", email = "armada@armadapro [project.optional-dependencies] format = ["black==23.3.0", "flake8==6.0.0", "pylint==2.17.4"] -docs = ["sphinx", "sphinx-jekyll-builder", "sphinx-toolbox==3.2.0b1"] +# note(JayF): sphinx-jekyll-builder was broken by sphinx-markdown-builder 0.6 -- so pin to 0.5.5 +docs = ["sphinx==7.0.1", "sphinx-jekyll-builder==0.3.0", "sphinx-toolbox==3.2.0b1", "sphinx-markdown-builder==0.5.5"] test = ["pytest==7.3.1", "coverage>=6.5.0", "pytest-asyncio==0.21.0"] [build-system] diff --git a/config/armada/config.yaml b/config/armada/config.yaml index 265c2d80822..fbd15ba5476 100644 --- a/config/armada/config.yaml +++ b/config/armada/config.yaml @@ -31,9 +31,17 @@ eventsApiRedis: poolSize: 1000 scheduling: enableAssertions: true + fairnessModel: "AssetFairness" + dominantResourceFairnessResourcesToConsider: + - "cpu" + - "memory" + - "nvidia.com/gpu" + resourceScarcity: + cpu: 1.0 preemption: nodeEvictionProbability: 1.0 nodeOversubscriptionEvictionProbability: 1.0 + protectedFractionOfFairShare: 1.0 setNodeIdSelector: true nodeIdLabel: kubernetes.io/hostname setNodeName: false @@ -42,8 
+50,8 @@ scheduling: priority: 1000 preemptible: false maximumResourceFractionPerQueue: - memory: 0.99 - cpu: 0.99 + memory: 1.0 + cpu: 1.0 armada-preemptible: priority: 1000 preemptible: true @@ -53,7 +61,7 @@ scheduling: maxExtraNodesToConsider: 1 maximumResourceFractionToSchedule: memory: 1.0 - cpu: 1.0 + cpu: 1.0 maxJobSchedulingContextsPerExecutor: 10000 lease: expireAfter: 15m @@ -68,11 +76,6 @@ scheduling: value: "true" effect: "NoSchedule" defaultJobTolerationsByPriorityClass: - "": - - key: "armadaproject.io/pc-armada-default" - operator: "Equal" - value: "true" - effect: "NoSchedule" armada-default: - key: "armadaproject.io/pc-armada-default" operator: "Equal" @@ -84,8 +87,6 @@ scheduling: value: "true" effect: "NoSchedule" maxRetries: 5 - resourceScarcity: - cpu: 1.0 maxPodSpecSizeBytes: 65535 minJobResources: memory: 1Mi diff --git a/config/executor/config.yaml b/config/executor/config.yaml index c81beb2773d..44cb869cc91 100644 --- a/config/executor/config.yaml +++ b/config/executor/config.yaml @@ -59,6 +59,9 @@ kubernetes: fatalPodSubmissionErrors: - "admission webhook" - "namespaces \".*\" not found" + stateChecks: + deadlineForSubmittedPodConsideredMissing: 15m + deadlineForActivePodConsideredMissing: 5m pendingPodChecks: deadlineForUpdates: 10m deadlineForNodeAssignment: 5m diff --git a/config/scheduler/config.yaml b/config/scheduler/config.yaml index c05b6e1ebf4..92b21b8059b 100644 --- a/config/scheduler/config.yaml +++ b/config/scheduler/config.yaml @@ -37,6 +37,8 @@ leader: renewDeadline: 10s retryPeriod: 2s podName: "" # This must be set so viper allows env vars to overwrite it +http: + port: 8080 grpc: port: 50052 keepaliveParams: @@ -49,6 +51,13 @@ grpc: scheduling: executorTimeout: 10m enableAssertions: true + fairnessModel: "AssetFairness" + dominantResourceFairnessResourcesToConsider: + - "cpu" + - "memory" + - "nvidia.com/gpu" + resourceScarcity: + cpu: 1.0 preemption: alwaysAttemptScheduling: false enabled: true @@ -60,8 +69,8 @@ scheduling: priority: 1000 preemptible: false maximumResourceFractionPerQueue: - memory: 0.99 - cpu: 0.99 + memory: 1.0 + cpu: 1.0 armada-preemptible: priority: 1000 preemptible: true @@ -85,11 +94,6 @@ scheduling: value: "true" effect: "NoSchedule" defaultJobTolerationsByPriorityClass: - "": - - key: "armadaproject.io/pc-armada-default" - operator: "Equal" - value: "true" - effect: "NoSchedule" armada-default: - key: "armadaproject.io/pc-armada-default" operator: "Equal" @@ -101,8 +105,6 @@ scheduling: value: "true" effect: "NoSchedule" maxRetries: 5 - resourceScarcity: - cpu: 1.0 indexedResources: - name: "cpu" resolution: "100m" diff --git a/deployment/scheduler/templates/scheduler-statefulset.yaml b/deployment/scheduler/templates/scheduler-statefulset.yaml index 01c0220180b..a2d5a6f8a6f 100644 --- a/deployment/scheduler/templates/scheduler-statefulset.yaml +++ b/deployment/scheduler/templates/scheduler-statefulset.yaml @@ -55,6 +55,9 @@ spec: resources: {{- toYaml .Values.scheduler.resources | nindent 12 }} ports: + - containerPort: { { .Values.scheduler.applicationConfig.http.port } } + protocol: TCP + name: http - containerPort: {{ .Values.scheduler.applicationConfig.grpc.port }} protocol: TCP name: grpc @@ -79,6 +82,13 @@ spec: {{- if .Values.scheduler.additionalVolumeMounts }} {{- toYaml .Values.scheduler.additionalVolumeMounts | nindent 12 -}} {{- end }} + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + timeoutSeconds: 5 + failureThreshold: 2 securityContext: allowPrivilegeEscalation: false 
affinity: diff --git a/deployment/scheduler/values.yaml b/deployment/scheduler/values.yaml index e09942ce2a6..d6ead90ea66 100644 --- a/deployment/scheduler/values.yaml +++ b/deployment/scheduler/values.yaml @@ -19,6 +19,8 @@ scheduler: port: 50051 metrics: port: 9001 + http: + port: 8080 pulsar: {} updateStrategy: rollingUpdate: diff --git a/docs/developer.md b/docs/developer.md index 5a073f7405d..28c0b7f4ea3 100644 --- a/docs/developer.md +++ b/docs/developer.md @@ -12,6 +12,12 @@ Want to quickly get Armada running and test it? Install the [Pre-requisites](#pr mage localdev minimal testsuite ``` +To get the UI running, run: + +```bash +mage ui +``` + ## A note for Devs on Arm / Windows There is limited information on issues that appear on Arm / Windows Machines when running this setup. @@ -92,9 +98,9 @@ go run cmd/testsuite/main.go test --tests "testsuite/testcases/basic/*" --junit ### Running the UI -In LocalDev, the UI is pre-built and served from the lookout component. To access it, open http://localhost:8089 in your browser. +In LocalDev, the UI is built seperately with `mage ui`. To access it, open http://localhost:8089 in your browser. -If you wish to run the UI locally, see the [UI Developer Guide](./developer/ui.md). +For more information see the [UI Developer Guide](./developer/ui.md). ### Choosing components to run diff --git a/docs/developer/ubuntu-setup.md b/docs/developer/ubuntu-setup.md new file mode 100644 index 00000000000..6aa02e39a7c --- /dev/null +++ b/docs/developer/ubuntu-setup.md @@ -0,0 +1,164 @@ +# Setting up an Ubuntu Linux instance for Armada development + +## Introduction + +This document is a list of the steps, packages, and tweaks that need to be done to get an Ubuntu Linux +instance running, with all the tools needed for Armada development and testing. + +The packages and steps were verified on an AWS EC2 instance (type t3.xlarge, 4 vcpu, 16GB RAM, +150GB EBS disk), but should be essentially the same on any comparable hardware system. + +### Install Ubuntu Linux + +Install Ubuntu Linux 22.04 (later versions may work as well). The default package set should +work. If you are setting up a new AWS EC2 instance, the default Ubuntu 22.04 image works well. + +When installing, ensure that the network configuration allows: +- SSH traffic from your client IP(s) +- HTTP traffic +- HTTPS traffic + +Apply all recent updates: +``` +$ sudo apt update +$ sudo apt upgrade +``` +You will likely need to reboot after applying the updates: +``` +$ sudo shutdown -r now +``` +After logging in, clean up any old, unused packages: +``` +$ sudo apt autoremove +``` + +AWS usually creates new EC2 instances with a very small root partion (8GB), which will quickly +fill up when using containers, or doing any serious development. Creating a new, large EBS volume, and +attaching it to the instance, will give a system usable for container work. + +First, provision an EBS volume in the AWS Console - of at least 150GB, or more - and attach it to +the instance. You will need to create the EBS volume in the same availability zone as the EC2 +instance - you can find the latter's AZ by clicking on the 'Networking' tab in the details page +for the instance, and you should see the Availabilty Zone listed in that panel. Once you've created +the volume, attach it to the instance. + +Then, format a filesystem on the volume and mount it. First, determine what block device the +parition is on, by running the `lsblk` comand. 
There should be a line where the TYPE is 'disk' +and the size matches the size you specified when creating the volume - e.g. +``` +nvme1n1 259:4 0 150G 0 disk +``` +Create a filesystem on that device by running `mkfs`: +``` +$ sudo mkfs -t ext4 /dev/nvme1n1 +``` +Then set a label on the partition - here, we will give it a label of 'VOL1': +``` +$ sudo e2label /dev/nvme1n1 VOL1 +``` +Create the mount-point directory: +``` +$ sudo mkdir /vol1 +``` +Add the following line to the end of `/etc/fstab`, so it will be mounted upon reboot: +``` +LABEL=VOL1 /vol1 ext4 defaults 0 2 +``` +Then mount it by doing `sudo mount -a`, and confirm the available space by running `df -h` - the `/vol1` +filesystem should be listed. + +### Install Language/Tool Packages + +Install several development packages that aren't installed by default in the base system: +``` +$ sudo apt install gcc make unzip +``` + +### Install Go, Protobuffers, and kubectl tools +Install the Go compiler and associated tools. Currently, the latest version is 1.20.5, but there may +be newer versions: + +``` +$ curl --location -O https://go.dev/dl/go1.20.5.linux-amd64.tar.gz +$ sudo tar -C /usr/local -xzvf go1.20.5.linux-amd64.tar.gl +$ echo 'export PATH=$PATH:/usr/local/go/bin' > go.sh +$ sudo cp go.sh /etc/profile.d/ +``` +Then, log out and back in again, then run `go version` to verify your path is now correct. + +Install protoc: +``` +$ curl -O --location https://github.com/protocolbuffers/protobuf/releases/download/v23.3/protoc-23.3-linux-x86_64.zip +$ cd /usr/local && sudo unzip ~/protoc-23.3-linux-x86_64.zip +$ cd ~ +$ type protoc +``` + +Install kubectl: +``` +$ curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" +$ sudo cp kubectl /usr/local/bin +$ sudo chmod 755 /usr/local/bin/kubectl +$ kubectl version +``` + +### Install Docker + +Warning: do not install Docker as provided by the `docker.io` and other packages in the Ubuntu base +packages repository - the version of Docker they provide is out-of-date. + +Instead, follow the instructions for installing Docker on Ubuntu at https://docs.docker.com/engine/install/ubuntu/ . +Specifically, follow the listed steps for installing using an apt repository, and install the latest Docker version. + +### Relocate Docker storage directory to secondary volume + +Since Docker can use a lot of filesystem space, the directory where it stores container images, logs, +and other datafiles should be relocated to the separate, larger non-root volume on the system, so that +the root filesystem does not fill up. + +Stop the Docker daemon(s) and copy the existing data directory to the new location: +``` +$ sudo systemctl stop docker +$ ps ax | grep -i docker # no Docker processes should be shown + +$ sudo rsync -av /var/lib/docker /vol1/ +$ sudo rm -rf /var/lib/docker +$ sudo ln -s /vol1/docker /var/lib/docker +``` +Then restart Docker and verify that it's working again: +``` +$ sudo systemctl start docker +$ sudo docker ps +$ sudo docker run hello-world +``` + +### Create user accounts, verify docker access + +First, make a home directory parent in the new larger filesystem: +``` +$ sudo mkdir /vol1/home +``` +Then, for each user to be added, run the following steps - we will be using the account named 'testuser' here. +First, create the account and their home directory. 
+``` +$ sudo adduser --shell /bin/bash --gecos 'Test User' --home /vol1/home/testuser testuser +``` +Set up their $HOME/.ssh directory and add their SSH public-key: +``` +$ sudo mkdir /vol1/home/testuser/.ssh +$ sudo vim /vol1/home/testuser/.ssh/authorized_keys +# In the editor, add the SSH public key string that the user has given you, save the file and exit +$ sudo chmod 600 /vol1/home/testuser/.ssh/authorized_keys +$ sudo chmod 700 /vol1/home/testuser/.ssh +$ sudo chown -R testuser:testuser /vol1/home/testuser/.ssh +``` +Finally, add them to the `docker` group so they can run Docker commands without `sudo` access: +``` +$ sudo gpasswd -a testuser docker +``` +**sudo Access (OPTIONAL)** + +If you want to give the new user `sudo` privileges, run the following command: +``` +$ sudo gpasswd -a testuser sudo +``` diff --git a/docs/developer/ui.md b/docs/developer/ui.md index 26e0849976a..bfcb219ed10 100644 --- a/docs/developer/ui.md +++ b/docs/developer/ui.md @@ -7,7 +7,7 @@ In short, Lookout is made of two components: * Lookout API: a Go service that provides an API to the Armada backend * Lookout UI: a React application that provides a web interface to the Lookout API -After running `mage localdev full`, the Lookout UI should be accessible through your browser at `http://localhost:8089` +After running `mage localdev full` and `mage ui`, the Lookout UI should be accessible through your browser at `http://localhost:8089` For UI development, you can also use the React development server and skip the build step. Note that the Lookout API service will still have to be running for this to work. Browse to `http://localhost:3000` with this. @@ -16,4 +16,4 @@ cd ./internal/lookout/ui yarn run start ``` -You can also build a production build of the UI by running `mage ui` in the root of the repo. \ No newline at end of file +You can also re-build a production build of the UI by running `mage ui` in the root of the repo. \ No newline at end of file diff --git a/internal/armada/configuration/constants.go b/internal/armada/configuration/constants.go index 992ab6c90c9..51669f2aec6 100644 --- a/internal/armada/configuration/constants.go +++ b/internal/armada/configuration/constants.go @@ -7,18 +7,16 @@ const ( // GangCardinalityAnnotation All jobs in a gang must specify the total number of jobs in the gang via this annotation. // The cardinality should be expressed as an integer, e.g., "3". GangCardinalityAnnotation = "armadaproject.io/gangCardinality" + // The jobs that make up a gang may be constrained to be scheduled across a set of uniform nodes. + // Specifically, if provided, all gang jobs are scheduled onto nodes for which the value of the provided label is equal. + // Used to ensure, e.g., that all gang jobs are scheduled onto the same cluster or rack. + GangNodeUniformityLabelAnnotation = "armadaproject.io/gangNodeUniformityLabel" // Armada normally tries to re-schedule jobs for which a pod fails to start. // Pods for which this annotation has value "true" are not retried. // Instead, the job the pod is part of fails immediately. 
FailFastAnnotation = "armadaproject.io/failFast" ) -var ArmadaManagedAnnotations = []string{ - GangIdAnnotation, - GangCardinalityAnnotation, - FailFastAnnotation, -} - var ReturnLeaseRequestTrackedAnnotations = map[string]struct{}{ FailFastAnnotation: {}, } diff --git a/internal/armada/configuration/types.go b/internal/armada/configuration/types.go index 7c8da6afefc..23fea24be64 100644 --- a/internal/armada/configuration/types.go +++ b/internal/armada/configuration/types.go @@ -113,7 +113,11 @@ type SchedulingConfig struct { DefaultJobTolerationsByResourceRequest map[string][]v1.Toleration // Maximum number of times a job is retried before considered failed. MaxRetries uint - // Weights used when computing fair share. + // Controls how fairness is calculated. Can be either AssetFairness or DominantResourceFairness. + FairnessModel FairnessModel + // List of resource names, e.g., []string{"cpu", "memory"}, to consider when computing DominantResourceFairness. + DominantResourceFairnessResourcesToConsider []string + // Weights used to compute fair share when using AssetFairness. // Overrides dynamic scarcity calculation if provided. // Applies to both the new and old scheduler. ResourceScarcity map[string]float64 @@ -149,6 +153,8 @@ type SchedulingConfig struct { // // Applies only to the new scheduler. IndexedTaints []string + // Default value of GangNodeUniformityLabelAnnotation if none is provided. + DefaultGangNodeUniformityLabel string // Kubernetes pods may specify a termination grace period. // When Pods are cancelled/preempted etc., they are first sent a SIGTERM. // If a pod has not exited within its termination grace period, @@ -187,6 +193,20 @@ type SchedulingConfig struct { AlwaysAttemptScheduling bool } +// FairnessModel controls how fairness is computed. +// More specifically, each queue has a cost associated with it and the next job to schedule +// is taken from the queue with smallest cost. FairnessModel determines how that cost is computed. +type FairnessModel string + +const ( + // AssetFairness sets the cost associated with a queue to a linear combination of its total allocation. + // E.g., w_CPU * "CPU allocation" + w_memory * "memory allocation". + AssetFairness FairnessModel = "AssetFairness" + // DominantResourceFairness set the cost associated with a queue to + // max("CPU allocation" / "CPU capacity", "memory allocation" / "mamory capacity", ...). + DominantResourceFairness FairnessModel = "DominantResourceFairness" +) + type IndexedResource struct { // Resource name. E.g., "cpu", "memory", or "nvidia.com/gpu". Name string @@ -209,6 +229,8 @@ type PreemptionConfig struct { // the probability of evicting jobs on oversubscribed nodes, i.e., // nodes on which the total resource requests are greater than the available resources. NodeOversubscriptionEvictionProbability float64 + // Only queues allocated more than this fraction of their fair share are considered for preemption. + ProtectedFractionOfFairShare float64 // If true, the Armada scheduler will add to scheduled pods a node selector // NodeIdLabel: . // If true, NodeIdLabel must be non-empty. @@ -233,12 +255,8 @@ type PriorityClass struct { Priority int32 // If true, Armada may preempt jobs of this class to improve fairness. Preemptible bool - // Limits resources assigned to jobs of priority equal to or lower than that of this priority class. + // Limits resources assigned to jobs of this priority class. // Specifically, jobs of this priority class are only scheduled if doing so does not exceed this limit. 
- // - // For example, if priority is 10 and MaximumResourceFractionPerQueue is map[string]float64{"cpu": 0.3}, - // jobs of this priority class are not scheduled if doing so would cause the total resources assigned - // to jobs of priority 10 or lower from the same queue to exceed 30% of the total. MaximumResourceFractionPerQueue map[string]float64 // Per-pool override of MaximumResourceFractionPerQueue. // If missing for a particular pool, MaximumResourceFractionPerQueue is used instead for that pool. diff --git a/internal/armada/server/applydefaults.go b/internal/armada/server/applydefaults.go index 6ffbd42a88d..510110c7799 100644 --- a/internal/armada/server/applydefaults.go +++ b/internal/armada/server/applydefaults.go @@ -10,6 +10,21 @@ import ( armadaresource "github.com/armadaproject/armada/internal/common/resource" ) +func applyDefaultsToAnnotations(annotations map[string]string, config configuration.SchedulingConfig) { + if annotations == nil { + return + } + applyDefaultNodeUniformityLabelAnnotation(annotations, config) +} + +func applyDefaultNodeUniformityLabelAnnotation(annotations map[string]string, config configuration.SchedulingConfig) { + if _, ok := annotations[configuration.GangIdAnnotation]; ok { + if _, ok := annotations[configuration.GangNodeUniformityLabelAnnotation]; !ok { + annotations[configuration.GangNodeUniformityLabelAnnotation] = config.DefaultGangNodeUniformityLabel + } + } +} + func applyDefaultsToPodSpec(spec *v1.PodSpec, config configuration.SchedulingConfig) { if spec == nil { return diff --git a/internal/armada/server/applydefaults_test.go b/internal/armada/server/applydefaults_test.go index 036c642dfe7..62293a45b1a 100644 --- a/internal/armada/server/applydefaults_test.go +++ b/internal/armada/server/applydefaults_test.go @@ -11,6 +11,53 @@ import ( "github.com/armadaproject/armada/internal/armada/configuration" ) +func TestApplyDefaultsToAnnotations(t *testing.T) { + tests := map[string]struct { + Config configuration.SchedulingConfig + Annotations map[string]string + Expected map[string]string + }{ + "no change": { + Annotations: make(map[string]string), + Expected: make(map[string]string), + }, + "DefaultNodeUniformityLabelAnnotation no change for non-gang jobs": { + Config: configuration.SchedulingConfig{ + DefaultGangNodeUniformityLabel: "foo", + }, + Annotations: make(map[string]string), + Expected: make(map[string]string), + }, + "DefaultNodeUniformityLabelAnnotation empty default": { + Annotations: map[string]string{ + configuration.GangIdAnnotation: "bar", + }, + Expected: map[string]string{ + configuration.GangIdAnnotation: "bar", + configuration.GangNodeUniformityLabelAnnotation: "", + }, + }, + "DefaultNodeUniformityLabelAnnotation": { + Config: configuration.SchedulingConfig{ + DefaultGangNodeUniformityLabel: "foo", + }, + Annotations: map[string]string{ + configuration.GangIdAnnotation: "bar", + }, + Expected: map[string]string{ + configuration.GangIdAnnotation: "bar", + configuration.GangNodeUniformityLabelAnnotation: "foo", + }, + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + applyDefaultsToAnnotations(tc.Annotations, tc.Config) + assert.Equal(t, tc.Expected, tc.Annotations) + }) + } +} + func TestApplyDefaultsToPodSpec(t *testing.T) { tests := map[string]struct { Config configuration.SchedulingConfig diff --git a/internal/armada/server/lease.go b/internal/armada/server/lease.go index a8559ea6932..9981c971742 100644 --- a/internal/armada/server/lease.go +++ b/internal/armada/server/lease.go @@ -40,6 +40,7 
@@ import ( schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints" schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" "github.com/armadaproject/armada/internal/scheduler/database" + "github.com/armadaproject/armada/internal/scheduler/interfaces" schedulerinterfaces "github.com/armadaproject/armada/internal/scheduler/interfaces" "github.com/armadaproject/armada/internal/scheduler/nodedb" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" @@ -282,14 +283,51 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL }) } + // Map queue names to priority factor for all active queues, i.e., + // all queues for which the jobs queue has not been deleted automatically by Redis. + queues, err := q.queueRepository.GetAllQueues() + if err != nil { + return nil, err + } + priorityFactorByQueue := make(map[string]float64, len(queues)) + apiQueues := make([]*api.Queue, len(queues)) + for i, queue := range queues { + priorityFactorByQueue[queue.Name] = float64(queue.PriorityFactor) + apiQueues[i] = &api.Queue{Name: queue.Name} + } + + // Record which queues are active, i.e., have jobs either queued or running. + queuesWithJobsQueued, err := q.jobRepository.FilterActiveQueues(apiQueues) + if err != nil { + return nil, err + } + isActiveByQueueName := make(map[string]bool, len(queuesWithJobsQueued)) + for _, queue := range queuesWithJobsQueued { + isActiveByQueueName[queue.Name] = true + } + // Nodes to be considered by the scheduler. lastSeen := q.clock.Now() - nodes := make([]*schedulerobjects.Node, 0, len(req.Nodes)) - allocatedByQueueForCluster := make(map[string]schedulerobjects.QuantityByPriorityAndResourceType) + + nodeDb, err := nodedb.NewNodeDb( + q.schedulingConfig.Preemption.PriorityClasses, + q.schedulingConfig.MaxExtraNodesToConsider, + q.schedulingConfig.IndexedResources, + q.schedulingConfig.IndexedTaints, + q.schedulingConfig.IndexedNodeLabels, + ) + if err != nil { + return nil, err + } + txn := nodeDb.Txn(true) + defer txn.Abort() + + allocatedByQueueAndPriorityClassForCluster := make(map[string]schedulerobjects.QuantityByTAndResourceType[string], len(queues)) jobIdsByGangId := make(map[string]map[string]bool) gangIdByJobId := make(map[string]string) nodeIdByJobId := make(map[string]string) - for _, nodeInfo := range req.Nodes { + nodes := make([]*schedulerobjects.Node, len(req.Nodes)) + for i, nodeInfo := range req.Nodes { node, err := api.NewNodeFromNodeInfo( &nodeInfo, req.ClusterId, @@ -302,6 +340,7 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL ) continue } + nodes[i] = node jobIds := make([]string, 0, len(nodeInfo.RunIdsByState)) for jobId, jobState := range nodeInfo.RunIdsByState { @@ -314,7 +353,7 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL if err != nil { return nil, err } - receivedJobIds := make(map[string]bool) + receivedJobIds := make(map[string]bool, len(jobs)) for _, job := range jobs { receivedJobIds[job.Id] = true } @@ -332,16 +371,14 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL } // Aggregate total resources allocated by queue for this cluster. 
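The bookkeeping above replaces the old per-priority map with a nested queue -> priority class -> resource structure accumulated per cluster. A rough, hedged illustration of that shape, using plain Go maps and integer quantities rather than the real schedulerobjects types:

    package main

    import "fmt"

    // allocation mirrors the shape of the new bookkeeping:
    // queue name -> priority class name -> resource name -> quantity (arbitrary units).
    type allocation map[string]map[string]map[string]int64

    // add accumulates a job's resource requests under its queue and priority class.
    func (a allocation) add(queue, priorityClass string, requests map[string]int64) {
        byPriorityClass, ok := a[queue]
        if !ok {
            byPriorityClass = make(map[string]map[string]int64)
            a[queue] = byPriorityClass
        }
        byResource, ok := byPriorityClass[priorityClass]
        if !ok {
            byResource = make(map[string]int64)
            byPriorityClass[priorityClass] = byResource
        }
        for resource, quantity := range requests {
            byResource[resource] += quantity
        }
    }

    func main() {
        allocated := make(allocation)
        allocated.add("queue-a", "default", map[string]int64{"cpu": 2, "memory": 8})
        allocated.add("queue-a", "preemptible", map[string]int64{"cpu": 4})
        fmt.Println(allocated)
    }

The real code also tracks which queues are active (queued or running jobs) so that inactive queues can be skipped later when computing fair share.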
- allocatedByQueueForCluster = scheduler.UpdateUsage( - allocatedByQueueForCluster, - jobs, - q.schedulingConfig.Preemption.PriorityClasses, - scheduler.Add, + allocatedByQueueAndPriorityClassForCluster = updateAllocatedByQueueAndPriorityClass( + allocatedByQueueAndPriorityClassForCluster, + add, jobs, ) // Group gangs. for _, job := range jobs { - gangId, _, isGangJob, err := scheduler.GangIdAndCardinalityFromLegacySchedulerJob(job, q.schedulingConfig.Preemption.PriorityClasses) + gangId, _, isGangJob, err := scheduler.GangIdAndCardinalityFromLegacySchedulerJob(job) if err != nil { return nil, err } @@ -356,72 +393,53 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL } // Bind pods to nodes, thus ensuring resources are marked as allocated on the node. - skipNode := false - for _, job := range jobs { - node, err = nodedb.BindPodToNode( - scheduler.PodRequirementFromLegacySchedulerJob( - job, - q.schedulingConfig.Preemption.PriorityClasses, - ), - node, - ) - if err != nil { - logging.WithStacktrace(log, err).Warnf( - "skipping node %s from executor %s: failed to bind job %s to node", - nodeInfo.GetName(), req.GetClusterId(), job.Id, - ) - skipNode = true - break - } - } - if skipNode { - continue + if err := nodeDb.CreateAndInsertWithApiJobsWithTxn(txn, jobs, node); err != nil { + return nil, err } // Record which node each job is scheduled on. Necessary for gang preemption. for _, job := range jobs { nodeIdByJobId[job.Id] = node.Id } - nodes = append(nodes, node) - } - nodeDb, err := nodedb.NewNodeDb( - q.schedulingConfig.Preemption.PriorityClasses, - q.schedulingConfig.MaxExtraNodesToConsider, - q.schedulingConfig.IndexedResources, - q.schedulingConfig.IndexedTaints, - q.schedulingConfig.IndexedNodeLabels, - ) - if err != nil { - return nil, err - } - if err := nodeDb.UpsertMany(nodes); err != nil { - return nil, err + + // Record which queues have jobs running. Necessary to omit inactive queues. + for _, job := range jobs { + isActiveByQueueName[job.Queue] = true + } } - // Load executor reports for all clusters, and insert an updated report for this cluster. + txn.Commit() + + // Load allocation reports for all executors from Redis. reportsByExecutor, err := q.usageRepository.GetClusterQueueResourceUsage() if err != nil { return nil, err } - executorReport := &schedulerobjects.ClusterResourceUsageReport{ + + // Insert an updated report for the current executor, which includes information received in this lease call. 
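For the gang-grouping step above, the server keeps two indexes so that all jobs of a gang can later be found from any one of them, which gang preemption relies on. A small sketch of that bookkeeping with illustrative names, not the production helpers:

    package main

    import "fmt"

    // trackGang records gang membership for a job: which jobs belong to each gang,
    // and which gang (if any) a given job belongs to.
    func trackGang(jobIdsByGangId map[string]map[string]bool, gangIdByJobId map[string]string, jobId, gangId string) {
        if gangId == "" {
            return // not a gang job
        }
        gangIdByJobId[jobId] = gangId
        if jobIdsByGangId[gangId] == nil {
            jobIdsByGangId[gangId] = make(map[string]bool)
        }
        jobIdsByGangId[gangId][jobId] = true
    }

    func main() {
        jobIdsByGangId := make(map[string]map[string]bool)
        gangIdByJobId := make(map[string]string)
        trackGang(jobIdsByGangId, gangIdByJobId, "job-1", "gang-a")
        trackGang(jobIdsByGangId, gangIdByJobId, "job-2", "gang-a")
        trackGang(jobIdsByGangId, gangIdByJobId, "job-3", "") // not in a gang
        fmt.Println(jobIdsByGangId, gangIdByJobId)
    }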
+ currentExecutorReport := &schedulerobjects.ClusterResourceUsageReport{ Pool: req.Pool, Created: q.clock.Now(), - ResourcesByQueue: make(map[string]*schedulerobjects.QueueClusterResourceUsage), - } - for queue, allocated := range allocatedByQueueForCluster { - executorReport.ResourcesByQueue[queue] = &schedulerobjects.QueueClusterResourceUsage{ - Created: executorReport.Created, - Queue: queue, - ExecutorId: req.ClusterId, - ResourcesByPriority: allocated.DeepCopy(), + ResourcesByQueue: make(map[string]*schedulerobjects.QueueClusterResourceUsage, len(queues)), + } + for queue, allocatedByPriorityClass := range allocatedByQueueAndPriorityClassForCluster { + currentExecutorReport.ResourcesByQueue[queue] = &schedulerobjects.QueueClusterResourceUsage{ + Created: currentExecutorReport.Created, + Queue: queue, + ExecutorId: req.ClusterId, + ResourcesByPriorityClassName: armadamaps.DeepCopy(allocatedByPriorityClass), } } - reportsByExecutor[req.ClusterId] = executorReport - if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, executorReport); err != nil { + reportsByExecutor[req.ClusterId] = currentExecutorReport + + // Write the updated report into Redis to make the information available to other replicas of the server. + if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, currentExecutorReport); err != nil { return nil, errors.WithMessagef(err, "failed to update cluster usage for cluster %s", req.ClusterId) } - allocatedByQueueForPool := q.aggregateUsage(reportsByExecutor, req.Pool) - log.Infof("allocated resources per queue for pool %s before scheduling: %v", req.Pool, allocatedByQueueForPool) + + // Aggregate allocation across all clusters. + allocatedByQueueAndPriorityClassForPool := q.aggregateAllocationAcrossExecutor(reportsByExecutor, req.Pool) + log.Infof("allocated resources per queue for pool %s before scheduling: %v", req.Pool, allocatedByQueueAndPriorityClassForPool) // Store executor details in Redis so they can be used by submit checks and the new scheduler. if err := q.executorRepository.StoreExecutor(ctx, &schedulerobjects.Executor{ @@ -429,35 +447,13 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL Pool: req.Pool, Nodes: nodes, MinimumJobSize: schedulerobjects.ResourceList{Resources: req.MinimumJobSize}, - LastUpdateTime: time.Now(), + LastUpdateTime: q.clock.Now(), }); err != nil { // This is not fatal; we can still schedule if it doesn't happen. log.WithError(err).Warnf("could not store executor details for cluster %s", req.ClusterId) } - // Map queue names to priority factor for all active queues, i.e., - // all queues for which the jobs queue has not been deleted automatically by Redis. - queues, err := q.queueRepository.GetAllQueues() - if err != nil { - return nil, err - } - priorityFactorByQueue := make(map[string]float64, len(queues)) - apiQueues := make([]*api.Queue, len(queues)) - for i, queue := range queues { - priorityFactorByQueue[queue.Name] = float64(queue.PriorityFactor) - apiQueues[i] = &api.Queue{Name: queue.Name} - } - activeQueues, err := q.jobRepository.FilterActiveQueues(apiQueues) - if err != nil { - return nil, err - } - priorityFactorByActiveQueue := make(map[string]float64, len(activeQueues)) - for _, queue := range activeQueues { - priorityFactorByActiveQueue[queue.Name] = priorityFactorByQueue[queue.Name] - } - - // Give Schedule() a 3 second shorter deadline than ctx, - // to give it a chance to finish up before ctx is cancelled. 
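The deadline handling referenced in the comment above is plain context arithmetic: derive a context whose deadline is three seconds earlier than the parent's, so the scheduling call can finish up before the parent expires. A minimal runnable sketch of that pattern:

    package main

    import (
        "context"
        "fmt"
        "time"
    )

    func main() {
        ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
        defer cancel()

        // Derive a context whose deadline is 3 seconds earlier than ctx's, so the
        // scheduling call has a chance to finish up before ctx itself expires.
        if deadline, ok := ctx.Deadline(); ok {
            var cancelEarly context.CancelFunc
            ctx, cancelEarly = context.WithDeadline(ctx, deadline.Add(-3*time.Second))
            defer cancelEarly()
        }

        if deadline, ok := ctx.Deadline(); ok {
            fmt.Println("time until effective deadline:", time.Until(deadline).Round(time.Second))
        }
    }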
+ // Give Schedule() a 3 second shorter deadline than ctx to give it a chance to finish up before ctx deadline. if deadline, ok := ctx.Deadline(); ok { var cancel context.CancelFunc ctx, cancel = context.WithDeadline(ctx, deadline.Add(-3*time.Second)) @@ -472,8 +468,19 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL q.schedulingConfig.ResourceScarcity, schedulerobjects.ResourceList{Resources: totalCapacity}, ) + if q.schedulingConfig.FairnessModel == configuration.DominantResourceFairness { + sctx.EnableDominantResourceFairness(q.schedulingConfig.DominantResourceFairnessResourcesToConsider) + } for queue, priorityFactor := range priorityFactorByQueue { - if err := sctx.AddQueueSchedulingContext(queue, priorityFactor, allocatedByQueueForPool[queue]); err != nil { + if !isActiveByQueueName[queue] { + // To ensure fair share is computed only from active queues, i.e., queues with jobs queued or running. + continue + } + var weight float64 = 1 + if priorityFactor > 0 { + weight = 1 / priorityFactor + } + if err := sctx.AddQueueSchedulingContext(queue, weight, allocatedByQueueAndPriorityClassForPool[queue]); err != nil { return nil, err } } @@ -488,6 +495,7 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL constraints, q.schedulingConfig.Preemption.NodeEvictionProbability, q.schedulingConfig.Preemption.NodeOversubscriptionEvictionProbability, + q.schedulingConfig.Preemption.ProtectedFractionOfFairShare, &SchedulerJobRepositoryAdapter{ r: q.jobRepository, }, @@ -629,38 +637,37 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL } // Update resource cluster report to account for preempted/leased jobs and write it to Redis. - allocatedByQueueForCluster = scheduler.UpdateUsage( - allocatedByQueueForCluster, - result.PreemptedJobs, - q.schedulingConfig.Preemption.PriorityClasses, - scheduler.Subtract, + allocatedByQueueAndPriorityClassForCluster = updateAllocatedByQueueAndPriorityClass( + allocatedByQueueAndPriorityClassForCluster, + subtract, result.PreemptedJobs, ) - for queue, m := range allocatedByQueueForCluster { + for queue, m := range allocatedByQueueAndPriorityClassForCluster { // Any quantity in the negative indicates a resource accounting problem. 
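One detail from the hunk above worth spelling out: the queue's configured priority factor is inverted into the weight passed to AddQueueSchedulingContext, so a larger priority factor yields a smaller fair share, and a non-positive factor falls back to a weight of 1. A hedged sketch of just that mapping (illustrative helper name):

    package main

    import "fmt"

    // weightFromPriorityFactor converts a queue's priority factor into the weight used
    // for fair-share computation: higher factor -> lower weight, with 1 as the default.
    func weightFromPriorityFactor(priorityFactor float64) float64 {
        weight := 1.0
        if priorityFactor > 0 {
            weight = 1 / priorityFactor
        }
        return weight
    }

    func main() {
        for _, factor := range []float64{0, 0.5, 1, 2, 10} {
            fmt.Printf("priorityFactor=%.1f -> weight=%.2f\n", factor, weightFromPriorityFactor(factor))
        }
    }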
- if !m.IsStrictlyNonNegative() { - log.Errorf("unexpected negative resource quantity for queue %s: %v", queue, m) + for _, rl := range m { + if !rl.IsStrictlyNonNegative() { + return nil, errors.Errorf("unexpected negative resource quantity for queue %s: %v", queue, m) + } } } - allocatedByQueueForCluster = scheduler.UpdateUsage( - allocatedByQueueForCluster, - successfullyLeasedApiJobs, - q.schedulingConfig.Preemption.PriorityClasses, - scheduler.Add, + allocatedByQueueAndPriorityClassForCluster = updateAllocatedByQueueAndPriorityClass( + allocatedByQueueAndPriorityClassForCluster, + add, successfullyLeasedApiJobs, ) - executorReport.Created = q.clock.Now() - for queue, usage := range allocatedByQueueForCluster { - executorReport.ResourcesByQueue[queue] = &schedulerobjects.QueueClusterResourceUsage{ - Created: executorReport.Created, - Queue: queue, - ExecutorId: req.ClusterId, - ResourcesByPriority: usage.DeepCopy(), + currentExecutorReport.Created = q.clock.Now() + for queue, usage := range allocatedByQueueAndPriorityClassForCluster { + currentExecutorReport.ResourcesByQueue[queue] = &schedulerobjects.QueueClusterResourceUsage{ + Created: currentExecutorReport.Created, + Queue: queue, + ExecutorId: req.ClusterId, + ResourcesByPriorityClassName: armadamaps.DeepCopy(usage), } } - if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, executorReport); err != nil { + if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, currentExecutorReport); err != nil { logging.WithStacktrace(log, err).Errorf("failed to update cluster usage") } - allocatedByQueueForPool = q.aggregateUsage(reportsByExecutor, req.Pool) - log.Infof("allocated resources per queue for pool %s after scheduling: %v", req.Pool, allocatedByQueueForPool) + + allocatedByQueueAndPriorityClassForPool = q.aggregateAllocationAcrossExecutor(reportsByExecutor, req.Pool) + log.Infof("allocated resources per queue for pool %s after scheduling: %v", req.Pool, allocatedByQueueAndPriorityClassForPool) // Optionally set node id selectors on scheduled jobs. if q.schedulingConfig.Preemption.SetNodeIdSelector { @@ -742,31 +749,69 @@ func (q *AggregatedQueueServer) getJobs(ctx context.Context, req *api.StreamingL return successfullyLeasedApiJobs, nil } -// aggregateUsage Creates a map of resource usage first by cluster and then by queue. -// Clusters in pools other than pool are excluded. 
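The accounting guard above is also tightened from a log line to a hard error. As a rough, simplified illustration of what a strictly-non-negative check over a per-resource map means (plain float64 quantities instead of the schedulerobjects resource lists):

    package main

    import "fmt"

    // isStrictlyNonNegative reports whether every quantity in the resource list is >= 0.
    // A negative value would mean more resources were subtracted (e.g. on preemption)
    // than were ever added, i.e. a resource accounting bug.
    func isStrictlyNonNegative(resources map[string]float64) bool {
        for _, quantity := range resources {
            if quantity < 0 {
                return false
            }
        }
        return true
    }

    func main() {
        fmt.Println(isStrictlyNonNegative(map[string]float64{"cpu": 2, "memory": 8}))  // true
        fmt.Println(isStrictlyNonNegative(map[string]float64{"cpu": -1, "memory": 8})) // false
    }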
-func (q *AggregatedQueueServer) aggregateUsage(reportsByCluster map[string]*schedulerobjects.ClusterResourceUsageReport, pool string) map[string]schedulerobjects.QuantityByPriorityAndResourceType { - const activeClusterExpiry = 10 * time.Minute +type addOrSubtract int + +const ( + add addOrSubtract = iota + subtract +) + +func updateAllocatedByQueueAndPriorityClass[T interfaces.LegacySchedulerJob]( + allocatedByQueueAndPriorityClass map[string]schedulerobjects.QuantityByTAndResourceType[string], + op addOrSubtract, + jobs []T, +) map[string]schedulerobjects.QuantityByTAndResourceType[string] { + if allocatedByQueueAndPriorityClass == nil { + allocatedByQueueAndPriorityClass = make(map[string]schedulerobjects.QuantityByTAndResourceType[string], 256) + } + for _, job := range jobs { + allocatedByPriorityClassName := allocatedByQueueAndPriorityClass[job.GetQueue()] + if allocatedByPriorityClassName == nil { + allocatedByPriorityClassName = make(map[string]schedulerobjects.ResourceList) + allocatedByQueueAndPriorityClass[job.GetQueue()] = allocatedByPriorityClassName + } + allocated := allocatedByPriorityClassName[job.GetPriorityClassName()] + if op == add { + allocated.AddV1ResourceList(job.GetResourceRequirements().Requests) + } else if op == subtract { + allocated.SubV1ResourceList(job.GetResourceRequirements().Requests) + } else { + panic(fmt.Sprintf("unknown op %d", op)) + } + allocatedByPriorityClassName[job.GetPriorityClassName()] = allocated + } + return allocatedByQueueAndPriorityClass +} + +func (q *AggregatedQueueServer) aggregateAllocationAcrossExecutor(reportsByExecutor map[string]*schedulerobjects.ClusterResourceUsageReport, pool string) map[string]schedulerobjects.QuantityByTAndResourceType[string] { now := q.clock.Now() - aggregatedUsageByQueue := make(map[string]schedulerobjects.QuantityByPriorityAndResourceType) - for _, clusterReport := range reportsByCluster { - if clusterReport.Pool != pool { - // Separate resource accounting per pool. + allocatedByQueueAndPriorityClass := make(map[string]schedulerobjects.QuantityByTAndResourceType[string]) + for _, executorReport := range reportsByExecutor { + if executorReport.Pool != pool { + // Only consider executors in the specified pool. continue } - if !clusterReport.Created.Add(activeClusterExpiry).After(now) { - // Stale report; omit. - continue + if q.schedulingConfig.ExecutorTimeout != 0 { + reportAge := now.Sub(executorReport.Created) + if reportAge > q.schedulingConfig.ExecutorTimeout { + // Stale report; omit. 
+ continue + } } - for queue, report := range clusterReport.ResourcesByQueue { - quantityByPriorityAndResourceType, ok := aggregatedUsageByQueue[queue] - if !ok { - quantityByPriorityAndResourceType = make(schedulerobjects.QuantityByPriorityAndResourceType) - aggregatedUsageByQueue[queue] = quantityByPriorityAndResourceType + for queue, queueReport := range executorReport.ResourcesByQueue { + allocatedByPriorityClass := allocatedByQueueAndPriorityClass[queue] + if allocatedByPriorityClass == nil { + allocatedByPriorityClass = make(map[string]schedulerobjects.ResourceList) + allocatedByQueueAndPriorityClass[queue] = allocatedByPriorityClass + } + for priorityClassName, allocated := range queueReport.ResourcesByPriorityClassName { + rl := allocatedByPriorityClass[priorityClassName] + rl.Add(allocated) + allocatedByPriorityClass[priorityClassName] = rl } - quantityByPriorityAndResourceType.Add(report.ResourcesByPriority) } } - return aggregatedUsageByQueue + return allocatedByQueueAndPriorityClass } func (q *AggregatedQueueServer) decompressJobOwnershipGroups(jobs []*api.Job) error { diff --git a/internal/armada/server/submit.go b/internal/armada/server/submit.go index 0acd6969582..ca444ff3099 100644 --- a/internal/armada/server/submit.go +++ b/internal/armada/server/submit.go @@ -794,6 +794,7 @@ func (server *SubmitServer) createJobsObjects(request *api.JobSubmitRequest, own namespace = "default" } fillContainerRequestsAndLimits(podSpec.Containers) + applyDefaultsToAnnotations(item.Annotations, *server.schedulingConfig) applyDefaultsToPodSpec(podSpec, *server.schedulingConfig) if err := validation.ValidatePodSpec(podSpec, server.schedulingConfig); err != nil { return nil, errors.Errorf("[createJobs] error validating the %d-th job of job set %s: %v", i, request.JobSetId, err) diff --git a/internal/common/database/lookout/jobstates.go b/internal/common/database/lookout/jobstates.go index 9ba1ce54f31..20ea463dde5 100644 --- a/internal/common/database/lookout/jobstates.go +++ b/internal/common/database/lookout/jobstates.go @@ -51,6 +51,18 @@ const ( ) var ( + // JobStates is an ordered list of states + JobStates = []JobState{ + JobQueued, + JobLeased, + JobPending, + JobRunning, + JobSucceeded, + JobFailed, + JobCancelled, + JobPreempted, + } + JobStateMap = map[int]JobState{ JobLeasedOrdinal: JobLeased, JobQueuedOrdinal: JobQueued, diff --git a/internal/common/validation/job.go b/internal/common/validation/job.go index fcc25f1b825..5b50f6d8715 100644 --- a/internal/common/validation/job.go +++ b/internal/common/validation/job.go @@ -12,8 +12,7 @@ import ( ) func ValidateApiJobs(jobs []*api.Job, config configuration.SchedulingConfig) error { - err := validateGangs(jobs) - if err != nil { + if err := validateGangs(jobs); err != nil { return err } for _, job := range jobs { @@ -26,13 +25,15 @@ func ValidateApiJobs(jobs []*api.Job, config configuration.SchedulingConfig) err func validateGangs(jobs []*api.Job) error { gangDetailsByGangId := make(map[string]struct { - actualCardinality int - expectedCardinality int - expectedPriorityClassName string + actualCardinality int + expectedCardinality int + expectedPriorityClassName string + expectedNodeUniformityLabel string }) for i, job := range jobs { annotations := job.Annotations gangId, gangCardinality, isGangJob, err := scheduler.GangIdAndCardinalityFromAnnotations(annotations) + nodeUniformityLabel := annotations[configuration.GangNodeUniformityLabelAnnotation] if err != nil { return errors.WithMessagef(err, "%d-th job with id %s in gang %s", i, 
job.Id, gangId) } @@ -56,6 +57,12 @@ func validateGangs(jobs []*api.Job) error { i, job.Id, gangId, details.expectedPriorityClassName, podSpec.PriorityClassName, ) } + if nodeUniformityLabel != details.expectedNodeUniformityLabel { + return errors.Errorf( + "inconsistent nodeUniformityLabel for %d-th job with id %s in gang %s: expected %s but got %s", + i, job.Id, gangId, details.expectedNodeUniformityLabel, nodeUniformityLabel, + ) + } details.actualCardinality++ gangDetailsByGangId[gangId] = details } else { @@ -64,6 +71,7 @@ func validateGangs(jobs []*api.Job) error { if podSpec != nil { details.expectedPriorityClassName = podSpec.PriorityClassName } + details.expectedNodeUniformityLabel = nodeUniformityLabel gangDetailsByGangId[gangId] = details } } diff --git a/internal/common/validation/job_test.go b/internal/common/validation/job_test.go index 960b74ba2bb..08b54a9c719 100644 --- a/internal/common/validation/job_test.go +++ b/internal/common/validation/job_test.go @@ -325,7 +325,8 @@ func TestValidateGangs(t *testing.T) { }, { Annotations: map[string]string{ - configuration.GangIdAnnotation: "bar", + configuration.GangIdAnnotation: "bar", + configuration.GangCardinalityAnnotation: strconv.Itoa(2), }, PodSpec: &v1.PodSpec{ PriorityClassName: "zab", @@ -334,6 +335,27 @@ func TestValidateGangs(t *testing.T) { }, ExpectSuccess: false, }, + "inconsistent NodeUniformityLabel": { + Jobs: []*api.Job{ + { + Annotations: map[string]string{ + configuration.GangIdAnnotation: "bar", + configuration.GangCardinalityAnnotation: strconv.Itoa(2), + configuration.GangNodeUniformityLabelAnnotation: "foo", + }, + PodSpec: &v1.PodSpec{}, + }, + { + Annotations: map[string]string{ + configuration.GangIdAnnotation: "bar", + configuration.GangCardinalityAnnotation: strconv.Itoa(2), + configuration.GangNodeUniformityLabelAnnotation: "bar", + }, + PodSpec: &v1.PodSpec{}, + }, + }, + ExpectSuccess: false, + }, } for name, tc := range tests { t.Run(name, func(t *testing.T) { diff --git a/internal/executor/application.go b/internal/executor/application.go index 47c9f02dd33..d7ad549c47c 100644 --- a/internal/executor/application.go +++ b/internal/executor/application.go @@ -189,9 +189,11 @@ func setupExecutorApiComponents( jobRunState, submitter, etcdHealthMonitor) - podIssueService := service.NewPodIssueService( + podIssueService := service.NewIssueHandler( + jobRunState, clusterContext, eventReporter, + config.Kubernetes.StateChecks, pendingPodChecker, config.Kubernetes.StuckTerminatingPodExpiry) diff --git a/internal/executor/configuration/types.go b/internal/executor/configuration/types.go index 04f7ccfa482..4798f29710a 100644 --- a/internal/executor/configuration/types.go +++ b/internal/executor/configuration/types.go @@ -26,6 +26,17 @@ type PodDefaults struct { Ingress *IngressConfiguration } +type StateChecksConfiguration struct { + // Once a pod is submitted to kubernetes, this is how long we'll wait for it to appear in the kubernetes informer state + // If the pod hasn't appeared after this duration, it is considered missing + DeadlineForSubmittedPodConsideredMissing time.Duration + // Once the executor has seen a pod appear on the cluster, it considers that run Active + // If we get into a state where there is no longer a pod backing that Active run, this is how long we'll wait before we consider the pod missing + // The most likely cause of this is actually a bug in the executors processing of the kubernetes state + // However without it - we can have runs get indefinitely stuck as Active with no 
backing pod + DeadlineForActivePodConsideredMissing time.Duration +} + type IngressConfiguration struct { HostnameSuffix string CertNameSuffix string @@ -54,6 +65,7 @@ type KubernetesConfiguration struct { MaxTerminatedPods int MinimumJobSize armadaresource.ComputeResources PodDefaults *PodDefaults + StateChecks StateChecksConfiguration PendingPodChecks *podchecks.Checks FatalPodSubmissionErrors []string // Minimum amount of resources marked as allocated to non-Armada pods on each node. diff --git a/internal/executor/job/job_run_state_store.go b/internal/executor/job/job_run_state_store.go index 421d650e7d8..2752ac5bfb3 100644 --- a/internal/executor/job/job_run_state_store.go +++ b/internal/executor/job/job_run_state_store.go @@ -51,12 +51,14 @@ func NewJobRunStateStore(clusterContext context.ClusterContext) *JobRunStateStor return } - stateStore.reportRunActive(pod) + if !util.IsPodFinishedAndReported(pod) { + stateStore.reportRunActive(pod) + } }, }) // On start up, make sure our state matches current k8s state - err := stateStore.reconcileStateWithKubernetes() + err := stateStore.initialiseStateFromKubernetes() if err != nil { panic(err) } @@ -75,7 +77,7 @@ func NewJobRunStateStoreWithInitialState(initialJobRuns []*RunState) *JobRunStat return stateStore } -func (stateStore *JobRunStateStore) reconcileStateWithKubernetes() error { +func (stateStore *JobRunStateStore) initialiseStateFromKubernetes() error { pods, err := stateStore.clusterContext.GetAllPods() if err != nil { return err @@ -84,7 +86,9 @@ func (stateStore *JobRunStateStore) reconcileStateWithKubernetes() error { return !util.IsLegacyManagedPod(pod) }) for _, pod := range pods { - stateStore.reportRunActive(pod) + if !util.IsPodFinishedAndReported(pod) { + stateStore.reportRunActive(pod) + } } return nil diff --git a/internal/executor/job/job_run_state_store_test.go b/internal/executor/job/job_run_state_store_test.go index 9092ffa90d9..da29c9a4f7f 100644 --- a/internal/executor/job/job_run_state_store_test.go +++ b/internal/executor/job/job_run_state_store_test.go @@ -4,6 +4,7 @@ import ( "fmt" "sort" "testing" + "time" "github.com/stretchr/testify/assert" v1 "k8s.io/api/core/v1" @@ -23,7 +24,7 @@ var defaultRunInfoMeta = &RunMeta{ JobSet: "job-set-1", } -func TestOnStartUp_ReconcilesWithKubernetes(t *testing.T) { +func TestOnStartUp_ReconcilesWithKubernetes_ActivePod(t *testing.T) { existingPod := createPod() jobRunStateManager, _ := setup(t, []*v1.Pod{existingPod}) @@ -38,6 +39,18 @@ func TestOnStartUp_ReconcilesWithKubernetes(t *testing.T) { assert.Equal(t, allKnownJobRuns[0].Phase, Active) } +func TestOnStartUp_ReconcilesWithKubernetes_IgnoresDonePods(t *testing.T) { + donePod := createPod() + donePod.Status.Phase = v1.PodSucceeded + donePod.Annotations[domain.JobDoneAnnotation] = "true" + donePod.Annotations[string(donePod.Status.Phase)] = fmt.Sprintf("%s", time.Now()) + + jobRunStateManager, _ := setup(t, []*v1.Pod{donePod}) + allKnownJobRuns := jobRunStateManager.GetAll() + + assert.Len(t, allKnownJobRuns, 0) +} + func TestReportRunLeased(t *testing.T) { job := &SubmitJob{ Meta: SubmitJobMeta{ diff --git a/internal/executor/reporter/job_event_reporter.go b/internal/executor/reporter/job_event_reporter.go index 1ae228ed4c3..88c1091e002 100644 --- a/internal/executor/reporter/job_event_reporter.go +++ b/internal/executor/reporter/job_event_reporter.go @@ -169,6 +169,7 @@ func (eventReporter *JobEventReporter) reportStatusUpdate(old *v1.Pod, new *v1.P // Don't report status change for pods Armada is deleting // This 
prevents reporting JobFailed when we delete a pod - for example due to cancellation if util.IsMarkedForDeletion(new) { + log.Infof("not sending event to report pod %s moving into phase %s as pod is marked for deletion", new.Name, new.Status.Phase) return } eventReporter.reportCurrentStatus(new) diff --git a/internal/executor/service/pod_issue_handler.go b/internal/executor/service/pod_issue_handler.go index 8d6b6d99200..8d15d26bc84 100644 --- a/internal/executor/service/pod_issue_handler.go +++ b/internal/executor/service/pod_issue_handler.go @@ -8,67 +8,89 @@ import ( log "github.com/sirupsen/logrus" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/clock" "k8s.io/client-go/tools/cache" + "github.com/armadaproject/armada/internal/executor/configuration" executorContext "github.com/armadaproject/armada/internal/executor/context" + "github.com/armadaproject/armada/internal/executor/job" "github.com/armadaproject/armada/internal/executor/podchecks" "github.com/armadaproject/armada/internal/executor/reporter" "github.com/armadaproject/armada/internal/executor/util" "github.com/armadaproject/armada/pkg/api" ) -type IssueType int +type podIssueType int const ( - UnableToSchedule IssueType = iota + UnableToSchedule podIssueType = iota StuckStartingUp StuckTerminating ExternallyDeleted + ErrorDuringIssueHandling ) type podIssue struct { // A copy of the pod when an issue was detected OriginalPodState *v1.Pod - JobId string - RunId string Message string Retryable bool - Reported bool DeletionRequested bool - Type IssueType + Type podIssueType Cause api.Cause } +type reconciliationIssue struct { + InitialDetectionTime time.Time + OriginalRunState *job.RunState +} + type issue struct { CurrentPodState *v1.Pod - Issue *podIssue + RunIssue *runIssue +} + +type runIssue struct { + JobId string + RunId string + PodIssue *podIssue + ReconciliationIssue *reconciliationIssue + Reported bool } -type PodIssueService struct { +type IssueHandler struct { clusterContext executorContext.ClusterContext eventReporter reporter.EventReporter pendingPodChecker podchecks.PodChecker + stateChecksConfig configuration.StateChecksConfiguration stuckTerminatingPodExpiry time.Duration // JobRunId -> PodIssue - knownPodIssues map[string]*podIssue + knownPodIssues map[string]*runIssue podIssueMutex sync.Mutex + jobRunState job.RunStateStore + clock clock.Clock } -func NewPodIssueService( +func NewIssueHandler( + jobRunState job.RunStateStore, clusterContext executorContext.ClusterContext, eventReporter reporter.EventReporter, + stateChecksConfig configuration.StateChecksConfiguration, pendingPodChecker podchecks.PodChecker, stuckTerminatingPodExpiry time.Duration, -) *PodIssueService { - podIssueService := &PodIssueService{ +) *IssueHandler { + issueHandler := &IssueHandler{ + jobRunState: jobRunState, clusterContext: clusterContext, eventReporter: eventReporter, pendingPodChecker: pendingPodChecker, + stateChecksConfig: stateChecksConfig, stuckTerminatingPodExpiry: stuckTerminatingPodExpiry, - knownPodIssues: map[string]*podIssue{}, + knownPodIssues: map[string]*runIssue{}, podIssueMutex: sync.Mutex{}, + clock: clock.RealClock{}, } clusterContext.AddPodEventHandler(cache.ResourceEventHandlerFuncs{ @@ -78,20 +100,20 @@ func NewPodIssueService( log.Errorf("Failed to process pod event due to it being an unexpected type. 
Failed to process %+v", obj) return } - podIssueService.handleDeletedPod(pod) + issueHandler.handleDeletedPod(pod) }, }) - return podIssueService + return issueHandler } -func (p *PodIssueService) registerIssue(issue *podIssue) { +func (p *IssueHandler) registerIssue(issue *runIssue) { p.podIssueMutex.Lock() defer p.podIssueMutex.Unlock() runId := issue.RunId if runId == "" { - log.Warnf("Not registering an issue for job %s (%s) as run id was empty", issue.JobId, issue.OriginalPodState.Name) + log.Warnf("Not registering an issue for job %s as run id was empty", issue.JobId) return } _, exists := p.knownPodIssues[issue.RunId] @@ -102,18 +124,18 @@ func (p *PodIssueService) registerIssue(issue *podIssue) { } } -func (p *PodIssueService) markIssuesResolved(issue *podIssue) { +func (p *IssueHandler) markIssuesResolved(issue *runIssue) { p.podIssueMutex.Lock() defer p.podIssueMutex.Unlock() delete(p.knownPodIssues, issue.RunId) } -func (p *PodIssueService) markIssueReported(issue *podIssue) { +func (p *IssueHandler) markIssueReported(issue *runIssue) { issue.Reported = true } -func (p *PodIssueService) HandlePodIssues() { +func (p *IssueHandler) HandlePodIssues() { managedPods, err := p.clusterContext.GetBatchPods() if err != nil { log.WithError(err).Errorf("unable to handle pod issus as failed to load pods") @@ -122,26 +144,29 @@ func (p *PodIssueService) HandlePodIssues() { return !util.IsLegacyManagedPod(pod) }) p.detectPodIssues(managedPods) + p.detectReconciliationIssues(managedPods) ctx, cancel := context.WithTimeout(context.Background(), time.Minute*2) defer cancel() - p.handleKnownPodIssues(ctx, managedPods) + p.handleKnownIssues(ctx, managedPods) } -func (p *PodIssueService) detectPodIssues(allManagedPods []*v1.Pod) { +func (p *IssueHandler) detectPodIssues(allManagedPods []*v1.Pod) { for _, pod := range allManagedPods { - if pod.DeletionTimestamp != nil && pod.DeletionTimestamp.Add(p.stuckTerminatingPodExpiry).Before(time.Now()) { + if pod.DeletionTimestamp != nil && pod.DeletionTimestamp.Add(p.stuckTerminatingPodExpiry).Before(p.clock.Now()) { // pod is stuck in terminating phase, this sometimes happen on node failure // it is safer to produce failed event than retrying as the job might have run already issue := &podIssue{ OriginalPodState: pod.DeepCopy(), - JobId: util.ExtractJobId(pod), - RunId: util.ExtractJobRunId(pod), Message: "pod stuck in terminating phase, this might be due to platform problems", Retryable: false, Type: StuckTerminating, } - p.registerIssue(issue) + p.registerIssue(&runIssue{ + JobId: util.ExtractJobId(pod), + RunId: util.ExtractJobRunId(pod), + PodIssue: issue, + }) } else if pod.Status.Phase == v1.PodUnknown || pod.Status.Phase == v1.PodPending { podEvents, err := p.clusterContext.GetPodEvents(pod) @@ -155,7 +180,7 @@ func (p *PodIssueService) detectPodIssues(allManagedPods []*v1.Pod) { continue } - action, cause, podCheckMessage := p.pendingPodChecker.GetAction(pod, podEvents, time.Now().Sub(lastStateChange)) + action, cause, podCheckMessage := p.pendingPodChecker.GetAction(pod, podEvents, p.clock.Now().Sub(lastStateChange)) if action != podchecks.ActionWait { retryable := action == podchecks.ActionRetry @@ -169,25 +194,27 @@ func (p *PodIssueService) detectPodIssues(allManagedPods []*v1.Pod) { issue := &podIssue{ OriginalPodState: pod.DeepCopy(), - JobId: util.ExtractJobId(pod), - RunId: util.ExtractJobRunId(pod), Message: message, Retryable: retryable, Type: podIssueType, } - p.registerIssue(issue) + p.registerIssue(&runIssue{ + JobId: 
util.ExtractJobId(pod), + RunId: util.ExtractJobRunId(pod), + PodIssue: issue, + }) } } } } -func (p *PodIssueService) handleKnownPodIssues(ctx context.Context, allManagedPods []*v1.Pod) { +func (p *IssueHandler) handleKnownIssues(ctx context.Context, allManagedPods []*v1.Pod) { // Make issues from pods + issues issues := createIssues(allManagedPods, p.knownPodIssues) - util.ProcessItemsWithThreadPool(ctx, 20, issues, p.handlePodIssue) + util.ProcessItemsWithThreadPool(ctx, 20, issues, p.handleRunIssue) } -func createIssues(managedPods []*v1.Pod, podIssues map[string]*podIssue) []*issue { +func createIssues(managedPods []*v1.Pod, runIssues map[string]*runIssue) []*issue { podsByRunId := make(map[string]*v1.Pod, len(managedPods)) for _, pod := range managedPods { @@ -199,25 +226,40 @@ func createIssues(managedPods []*v1.Pod, podIssues map[string]*podIssue) []*issu } } - result := make([]*issue, 0, len(podIssues)) + result := make([]*issue, 0, len(runIssues)) - for _, podIssue := range podIssues { - relatedPod := podsByRunId[podIssue.RunId] - result = append(result, &issue{CurrentPodState: relatedPod, Issue: podIssue}) + for _, runIssue := range runIssues { + relatedPod := podsByRunId[runIssue.RunId] + result = append(result, &issue{CurrentPodState: relatedPod, RunIssue: runIssue}) } return result } -func (p *PodIssueService) handlePodIssue(issue *issue) { +func (p *IssueHandler) handleRunIssue(issue *issue) { + if issue == nil || issue.RunIssue == nil { + log.Warnf("issue found with missing issue details") + return + } + if issue.RunIssue.PodIssue != nil { + p.handlePodIssue(issue) + } else if issue.RunIssue.ReconciliationIssue != nil { + p.handleReconciliationIssue(issue) + } else { + log.Warnf("issue found with no issue details set for job %s run %s", issue.RunIssue.JobId, issue.RunIssue.RunId) + p.markIssuesResolved(issue.RunIssue) + } +} + +func (p *IssueHandler) handlePodIssue(issue *issue) { hasSelfResolved := hasPodIssueSelfResolved(issue) if hasSelfResolved { - log.Infof("Issue for job %s run %s has self resolved", issue.Issue.JobId, issue.Issue.RunId) - p.markIssuesResolved(issue.Issue) + log.Infof("Issue for job %s run %s has self resolved", issue.RunIssue.JobId, issue.RunIssue.RunId) + p.markIssuesResolved(issue.RunIssue) return } - if issue.Issue.Retryable { + if issue.RunIssue.PodIssue.Retryable { p.handleRetryableJobIssue(issue) } else { p.handleNonRetryableJobIssue(issue) @@ -229,32 +271,32 @@ func (p *PodIssueService) handlePodIssue(issue *issue) { // - Report JobFailedEvent // // Once that is done we are free to cleanup the pod -func (p *PodIssueService) handleNonRetryableJobIssue(issue *issue) { - if !issue.Issue.Reported { - log.Infof("Non-retryable issue detected for job %s run %s - %s", issue.Issue.JobId, issue.Issue.RunId, issue.Issue.Message) - message := issue.Issue.Message +func (p *IssueHandler) handleNonRetryableJobIssue(issue *issue) { + if !issue.RunIssue.Reported { + log.Infof("Non-retryable issue detected for job %s run %s - %s", issue.RunIssue.JobId, issue.RunIssue.RunId, issue.RunIssue.PodIssue.Message) + message := issue.RunIssue.PodIssue.Message events := make([]reporter.EventMessage, 0, 2) - if issue.Issue.Type == StuckStartingUp || issue.Issue.Type == UnableToSchedule { - unableToScheduleEvent := reporter.CreateJobUnableToScheduleEvent(issue.Issue.OriginalPodState, message, p.clusterContext.GetClusterId()) - events = append(events, reporter.EventMessage{Event: unableToScheduleEvent, JobRunId: issue.Issue.RunId}) + if issue.RunIssue.PodIssue.Type == 
StuckStartingUp || issue.RunIssue.PodIssue.Type == UnableToSchedule { + unableToScheduleEvent := reporter.CreateJobUnableToScheduleEvent(issue.RunIssue.PodIssue.OriginalPodState, message, p.clusterContext.GetClusterId()) + events = append(events, reporter.EventMessage{Event: unableToScheduleEvent, JobRunId: issue.RunIssue.RunId}) } - failedEvent := reporter.CreateSimpleJobFailedEvent(issue.Issue.OriginalPodState, message, p.clusterContext.GetClusterId(), issue.Issue.Cause) - events = append(events, reporter.EventMessage{Event: failedEvent, JobRunId: issue.Issue.RunId}) + failedEvent := reporter.CreateSimpleJobFailedEvent(issue.RunIssue.PodIssue.OriginalPodState, message, p.clusterContext.GetClusterId(), issue.RunIssue.PodIssue.Cause) + events = append(events, reporter.EventMessage{Event: failedEvent, JobRunId: issue.RunIssue.RunId}) err := p.eventReporter.Report(events) if err != nil { - log.Errorf("Failed to report failed event for job %s because %s", issue.Issue.JobId, err) + log.Errorf("Failed to report failed event for job %s because %s", issue.RunIssue.JobId, err) return } - p.markIssueReported(issue.Issue) + p.markIssueReported(issue.RunIssue) } if issue.CurrentPodState != nil { p.clusterContext.DeletePods([]*v1.Pod{issue.CurrentPodState}) - issue.Issue.DeletionRequested = true + issue.RunIssue.PodIssue.DeletionRequested = true } else { - p.markIssuesResolved(issue.Issue) + p.markIssuesResolved(issue.RunIssue) } } @@ -262,76 +304,82 @@ func (p *PodIssueService) handleNonRetryableJobIssue(issue *issue) { // - Report JobUnableToScheduleEvent // - Report JobReturnLeaseEvent // -// Special consideration must be taken that most of these pods are somewhat "stuck" in pending. -// So can transition to Running/Completed/Failed in the middle of this -// We must not return the lease if the pod state changes - as likely it has become "unstuck" -func (p *PodIssueService) handleRetryableJobIssue(issue *issue) { - if !issue.Issue.Reported { - log.Infof("Retryable issue detected for job %s run %s - %s", issue.Issue.JobId, issue.Issue.RunId, issue.Issue.Message) - if issue.Issue.Type == StuckStartingUp || issue.Issue.Type == UnableToSchedule { - event := reporter.CreateJobUnableToScheduleEvent(issue.Issue.OriginalPodState, issue.Issue.Message, p.clusterContext.GetClusterId()) - err := p.eventReporter.Report([]reporter.EventMessage{{Event: event, JobRunId: issue.Issue.RunId}}) +// If the pod becomes Running/Completed/Failed in the middle of being deleted - swap this issue to a nonRetryableIssue where it will be Failed +func (p *IssueHandler) handleRetryableJobIssue(issue *issue) { + if !issue.RunIssue.Reported { + log.Infof("Retryable issue detected for job %s run %s - %s", issue.RunIssue.JobId, issue.RunIssue.RunId, issue.RunIssue.PodIssue.Message) + if issue.RunIssue.PodIssue.Type == StuckStartingUp || issue.RunIssue.PodIssue.Type == UnableToSchedule { + event := reporter.CreateJobUnableToScheduleEvent(issue.RunIssue.PodIssue.OriginalPodState, issue.RunIssue.PodIssue.Message, p.clusterContext.GetClusterId()) + err := p.eventReporter.Report([]reporter.EventMessage{{Event: event, JobRunId: issue.RunIssue.RunId}}) if err != nil { log.Errorf("Failure to report stuck pod event %+v because %s", event, err) return } } - p.markIssueReported(issue.Issue) + p.markIssueReported(issue.RunIssue) } if issue.CurrentPodState != nil { - // TODO consider moving this to a synchronous call - but long termination periods would need to be handled + if issue.CurrentPodState.Status.Phase != v1.PodPending { + 
p.markIssuesResolved(issue.RunIssue) + if issue.RunIssue.PodIssue.DeletionRequested { + p.registerIssue(&runIssue{ + JobId: issue.RunIssue.JobId, + RunId: issue.RunIssue.RunId, + PodIssue: &podIssue{ + OriginalPodState: issue.RunIssue.PodIssue.OriginalPodState, + Message: "Pod unexpectedly started up after delete was called", + Retryable: false, + DeletionRequested: false, + Type: ErrorDuringIssueHandling, + Cause: api.Cause_Error, + }, + }) + } + return + } + err := p.clusterContext.DeletePodWithCondition(issue.CurrentPodState, func(pod *v1.Pod) bool { return pod.Status.Phase == v1.PodPending }, true) if err != nil { - log.Errorf("Failed to delete pod of running job %s because %s", issue.Issue.JobId, err) + log.Errorf("Failed to delete pod of running job %s because %s", issue.RunIssue.JobId, err) return } else { - issue.Issue.DeletionRequested = true + issue.RunIssue.PodIssue.DeletionRequested = true } } else { // TODO // When we have our own internal state - we don't need to wait for the pod deletion to complete // We can just mark is to delete in our state and return the lease - jobRunAttempted := issue.Issue.Type != UnableToSchedule - returnLeaseEvent := reporter.CreateReturnLeaseEvent(issue.Issue.OriginalPodState, issue.Issue.Message, p.clusterContext.GetClusterId(), jobRunAttempted) - err := p.eventReporter.Report([]reporter.EventMessage{{Event: returnLeaseEvent, JobRunId: issue.Issue.RunId}}) + jobRunAttempted := issue.RunIssue.PodIssue.Type != UnableToSchedule + returnLeaseEvent := reporter.CreateReturnLeaseEvent(issue.RunIssue.PodIssue.OriginalPodState, issue.RunIssue.PodIssue.Message, p.clusterContext.GetClusterId(), jobRunAttempted) + err := p.eventReporter.Report([]reporter.EventMessage{{Event: returnLeaseEvent, JobRunId: issue.RunIssue.RunId}}) if err != nil { - log.Errorf("Failed to return lease for job %s because %s", issue.Issue.JobId, err) + log.Errorf("Failed to return lease for job %s because %s", issue.RunIssue.JobId, err) return } - p.markIssuesResolved(issue.Issue) + p.markIssuesResolved(issue.RunIssue) } } func hasPodIssueSelfResolved(issue *issue) bool { - if issue == nil || issue.Issue == nil { + if issue == nil || issue.RunIssue == nil || issue.RunIssue.PodIssue == nil { return true } - isStuckStartingUpAndResolvable := issue.Issue.Type == StuckStartingUp && - (issue.Issue.Retryable || (!issue.Issue.Retryable && !issue.Issue.Reported)) - if issue.Issue.Type == UnableToSchedule || isStuckStartingUpAndResolvable { + isStuckStartingUpAndResolvable := issue.RunIssue.PodIssue.Type == StuckStartingUp && + (issue.RunIssue.PodIssue.Retryable || (!issue.RunIssue.PodIssue.Retryable && !issue.RunIssue.Reported)) + if issue.RunIssue.PodIssue.Type == UnableToSchedule || isStuckStartingUpAndResolvable { // If pod has disappeared - don't consider it resolved as we still need to report the issue if issue.CurrentPodState == nil { return false } - // Pod has completed - no need to report any issues - if util.IsInTerminalState(issue.CurrentPodState) { - return true - } - - // Pod has started running, and we haven't requested deletion - let it continue - if issue.CurrentPodState.Status.Phase == v1.PodRunning && !issue.Issue.DeletionRequested { + // Pod has started up and we haven't tried to delete the pod yet - so resolve the issue + if issue.CurrentPodState.Status.Phase != v1.PodPending && !issue.RunIssue.PodIssue.DeletionRequested { return true } - // TODO There is an edge case here where the pod has started running but we have requested deletion - // Without a proper state model, 
we can't easily handle this correctly - // Ideally we'd see if it completes or deletes first and report it accordingly - // If it completes first - do nothing - // If it deletes first - report JobFailed (as we accidentally deleted it during the run) } return false @@ -344,19 +392,107 @@ func createStuckPodMessage(retryable bool, originalMessage string) string { return fmt.Sprintf("Unable to schedule pod with unrecoverable problem, Armada will not retry.\n%s", originalMessage) } -func (p *PodIssueService) handleDeletedPod(pod *v1.Pod) { +func (p *IssueHandler) handleDeletedPod(pod *v1.Pod) { jobId := util.ExtractJobId(pod) if jobId != "" { isUnexpectedDeletion := !util.IsMarkedForDeletion(pod) && !util.IsPodFinishedAndReported(pod) if isUnexpectedDeletion { - p.registerIssue(&podIssue{ - OriginalPodState: pod, - JobId: jobId, - RunId: util.ExtractJobRunId(pod), - Message: "Pod was unexpectedly deleted", - Retryable: false, - Reported: false, - Type: ExternallyDeleted, + p.registerIssue(&runIssue{ + JobId: jobId, + RunId: util.ExtractJobRunId(pod), + PodIssue: &podIssue{ + OriginalPodState: pod, + Message: "Pod was unexpectedly deleted", + Retryable: false, + Type: ExternallyDeleted, + }, + }) + } + } +} + +func (p *IssueHandler) handleReconciliationIssue(issue *issue) { + if issue.RunIssue.ReconciliationIssue == nil { + log.Warnf("unexpected trying to process an issue as a reconciliation issue for job %s run %s", issue.RunIssue.JobId, issue.RunIssue.RunId) + p.markIssuesResolved(issue.RunIssue) + return + } + + currentRunState := p.jobRunState.Get(issue.RunIssue.RunId) + if currentRunState == nil { + // No run for the run id - so there isn't a reconciliation issue + p.markIssuesResolved(issue.RunIssue) + return + } + + if issue.CurrentPodState != nil { + p.markIssuesResolved(issue.RunIssue) + return + } + + if issue.RunIssue.ReconciliationIssue.OriginalRunState.Phase != currentRunState.Phase || currentRunState.CancelRequested || currentRunState.PreemptionRequested { + // State of the run has changed - resolve + // If there is still an issue, it'll be re-detected + p.markIssuesResolved(issue.RunIssue) + return + } + + timeSinceInitialDetection := p.clock.Now().Sub(issue.RunIssue.ReconciliationIssue.InitialDetectionTime) + + // If there is an active run and the associated pod has been missing for more than a given time period, report the run as failed + if currentRunState.Phase == job.Active && timeSinceInitialDetection > p.stateChecksConfig.DeadlineForActivePodConsideredMissing { + log.Infof("Pod missing for active run detected for job %s run %s", issue.RunIssue.JobId, issue.RunIssue.RunId) + + event := &api.JobFailedEvent{ + JobId: currentRunState.Meta.JobId, + JobSetId: currentRunState.Meta.JobSet, + Queue: currentRunState.Meta.Queue, + Created: p.clock.Now(), + ClusterId: p.clusterContext.GetClusterId(), + Reason: fmt.Sprintf("Pod is unexpectedly missing in Kubernetes"), + Cause: api.Cause_Error, + } + + err := p.eventReporter.Report([]reporter.EventMessage{{Event: event, JobRunId: issue.RunIssue.RunId}}) + if err != nil { + log.Errorf("Failure to report failed event %+v because %s", event, err) + return + } + + p.markIssueReported(issue.RunIssue) + p.markIssuesResolved(issue.RunIssue) + } else if currentRunState.Phase == job.SuccessfulSubmission && timeSinceInitialDetection > p.stateChecksConfig.DeadlineForSubmittedPodConsideredMissing { + // If a pod hasn't shown up after a successful submission for a given time period, delete it from the run state + // This will cause it to be 
re-leased and submitted again + // If the issue is we are out of sync with kubernetes, the second submission will fail and kill the job + p.jobRunState.Delete(currentRunState.Meta.RunId) + p.markIssuesResolved(issue.RunIssue) + } +} + +func (p *IssueHandler) detectReconciliationIssues(pods []*v1.Pod) { + runs := p.jobRunState.GetAllWithFilter(func(state *job.RunState) bool { + return (state.Phase == job.Active || state.Phase == job.SuccessfulSubmission) && !state.CancelRequested && !state.PreemptionRequested + }) + + runIdsToPod := make(map[string]*v1.Pod, len(pods)) + for _, pod := range pods { + runId := util.ExtractJobRunId(pod) + if runId != "" { + runIdsToPod[runId] = pod + } + } + + for _, run := range runs { + _, present := runIdsToPod[run.Meta.RunId] + if !present { + p.registerIssue(&runIssue{ + JobId: run.Meta.JobId, + RunId: run.Meta.RunId, + ReconciliationIssue: &reconciliationIssue{ + InitialDetectionTime: p.clock.Now(), + OriginalRunState: run.DeepCopy(), + }, }) } } diff --git a/internal/executor/service/pod_issue_handler_test.go b/internal/executor/service/pod_issue_handler_test.go index bab9ea8bb2c..ccb8226d43d 100644 --- a/internal/executor/service/pod_issue_handler_test.go +++ b/internal/executor/service/pod_issue_handler_test.go @@ -6,8 +6,11 @@ import ( "github.com/stretchr/testify/assert" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/clock" + "github.com/armadaproject/armada/internal/executor/configuration" fakecontext "github.com/armadaproject/armada/internal/executor/context/fake" + "github.com/armadaproject/armada/internal/executor/job" "github.com/armadaproject/armada/internal/executor/reporter" "github.com/armadaproject/armada/internal/executor/reporter/mocks" "github.com/armadaproject/armada/internal/executor/util" @@ -15,7 +18,7 @@ import ( ) func TestPodIssueService_DoesNothingIfNoPodsAreFound(t *testing.T) { - podIssueService, _, eventsReporter := setupTestComponents() + podIssueService, _, _, eventsReporter := setupTestComponents([]*job.RunState{}) podIssueService.HandlePodIssues() @@ -23,7 +26,7 @@ func TestPodIssueService_DoesNothingIfNoPodsAreFound(t *testing.T) { } func TestPodIssueService_DoesNothingIfNoStuckPodsAreFound(t *testing.T) { - podIssueService, fakeClusterContext, eventsReporter := setupTestComponents() + podIssueService, _, fakeClusterContext, eventsReporter := setupTestComponents([]*job.RunState{}) runningPod := makeRunningPod(false) addPod(t, fakeClusterContext, runningPod) @@ -35,7 +38,7 @@ func TestPodIssueService_DoesNothingIfNoStuckPodsAreFound(t *testing.T) { } func TestPodIssueService_DeletesPodAndReportsFailed_IfStuckAndUnretryable(t *testing.T) { - podIssueService, fakeClusterContext, eventsReporter := setupTestComponents() + podIssueService, _, fakeClusterContext, eventsReporter := setupTestComponents([]*job.RunState{}) unretryableStuckPod := makeUnretryableStuckPod(false) addPod(t, fakeClusterContext, unretryableStuckPod) @@ -54,7 +57,7 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfStuckAndUnretryable(t *tes } func TestPodIssueService_DeletesPodAndReportsFailed_IfStuckTerminating(t *testing.T) { - podIssueService, fakeClusterContext, eventsReporter := setupTestComponents() + podIssueService, _, fakeClusterContext, eventsReporter := setupTestComponents([]*job.RunState{}) terminatingPod := makeTerminatingPod(false) addPod(t, fakeClusterContext, terminatingPod) @@ -70,7 +73,7 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfStuckTerminating(t *testin } func 
TestPodIssueService_DeletesPodAndReportsLeaseReturned_IfRetryableStuckPod(t *testing.T) { - podIssueService, fakeClusterContext, eventsReporter := setupTestComponents() + podIssueService, _, fakeClusterContext, eventsReporter := setupTestComponents([]*job.RunState{}) retryableStuckPod := makeRetryableStuckPod(false) addPod(t, fakeClusterContext, retryableStuckPod) @@ -94,8 +97,39 @@ func TestPodIssueService_DeletesPodAndReportsLeaseReturned_IfRetryableStuckPod(t assert.True(t, ok) } +func TestPodIssueService_DeletesPodAndReportsFailed_IfRetryableStuckPodStartsUpAfterDeletionCalled(t *testing.T) { + podIssueService, _, fakeClusterContext, eventsReporter := setupTestComponents([]*job.RunState{}) + retryableStuckPod := makeRetryableStuckPod(false) + addPod(t, fakeClusterContext, retryableStuckPod) + + podIssueService.HandlePodIssues() + + // Reports UnableToSchedule + assert.Len(t, eventsReporter.ReceivedEvents, 1) + _, ok := eventsReporter.ReceivedEvents[0].Event.(*api.JobUnableToScheduleEvent) + assert.True(t, ok) + + // Reset events, and add pod back as running + eventsReporter.ReceivedEvents = []reporter.EventMessage{} + retryableStuckPod.Status.Phase = v1.PodRunning + addPod(t, fakeClusterContext, retryableStuckPod) + + // Detects pod is now unexpectedly running and marks it non-retryable + podIssueService.HandlePodIssues() + assert.Len(t, eventsReporter.ReceivedEvents, 0) + assert.Len(t, getActivePods(t, fakeClusterContext), 1) + + // Now processes the issue as non-retryable and fails the pod + podIssueService.HandlePodIssues() + assert.Len(t, getActivePods(t, fakeClusterContext), 0) + + assert.Len(t, eventsReporter.ReceivedEvents, 1) + _, ok = eventsReporter.ReceivedEvents[0].Event.(*api.JobFailedEvent) + assert.True(t, ok) +} + func TestPodIssueService_ReportsFailed_IfDeletedExternally(t *testing.T) { - podIssueService, fakeClusterContext, eventsReporter := setupTestComponents() + podIssueService, _, fakeClusterContext, eventsReporter := setupTestComponents([]*job.RunState{}) runningPod := makeRunningPod(false) fakeClusterContext.SimulateDeletionEvent(runningPod) @@ -108,17 +142,108 @@ func TestPodIssueService_ReportsFailed_IfDeletedExternally(t *testing.T) { assert.Equal(t, failedEvent.JobId, util.ExtractJobId(runningPod)) } -func setupTestComponents() (*PodIssueService, *fakecontext.SyncFakeClusterContext, *mocks.FakeEventReporter) { +func TestPodIssueService_ReportsFailed_IfPodOfActiveRunGoesMissing(t *testing.T) { + baseTime := time.Now() + fakeClock := clock.NewFakeClock(baseTime) + podIssueService, _, _, eventsReporter := setupTestComponents([]*job.RunState{createRunState("job-1", "run-1", job.Active)}) + podIssueService.clock = fakeClock + + podIssueService.HandlePodIssues() + // Nothing should happen, until the issue has been seen for a configured amount of time + assert.Len(t, eventsReporter.ReceivedEvents, 0) + + fakeClock.SetTime(baseTime.Add(10 * time.Minute)) + podIssueService.HandlePodIssues() + // Reports Failed + assert.Len(t, eventsReporter.ReceivedEvents, 1) + failedEvent, ok := eventsReporter.ReceivedEvents[0].Event.(*api.JobFailedEvent) + assert.True(t, ok) + assert.Equal(t, failedEvent.JobId, "job-1") +} + +func TestPodIssueService_DoesNothing_IfMissingPodOfActiveRunReturns(t *testing.T) { + baseTime := time.Now() + fakeClock := clock.NewFakeClock(baseTime) + runningPod := makeRunningPod(false) + runState := createRunState(util.ExtractJobId(runningPod), util.ExtractJobRunId(runningPod), job.Active) + podIssueService, _, fakeClusterContext, eventsReporter := 
setupTestComponents([]*job.RunState{runState}) + podIssueService.clock = fakeClock + + podIssueService.HandlePodIssues() + // Nothing should happen, until the issue has been seen for a configured amount of time + assert.Len(t, eventsReporter.ReceivedEvents, 0) + + addPod(t, fakeClusterContext, runningPod) + fakeClock.SetTime(baseTime.Add(10 * time.Minute)) + podIssueService.HandlePodIssues() + assert.Len(t, eventsReporter.ReceivedEvents, 0) +} + +func TestPodIssueService_DeleteRunFromRunState_IfSubmittedPodNeverAppears(t *testing.T) { + baseTime := time.Now() + fakeClock := clock.NewFakeClock(baseTime) + podIssueService, runStateStore, _, eventsReporter := setupTestComponents([]*job.RunState{createRunState("job-1", "run-1", job.SuccessfulSubmission)}) + podIssueService.clock = fakeClock + + podIssueService.HandlePodIssues() + // Nothing should happen, until the issue has been seen for a configured amount of time + assert.Len(t, eventsReporter.ReceivedEvents, 0) + assert.Len(t, runStateStore.GetAll(), 1) + + fakeClock.SetTime(baseTime.Add(20 * time.Minute)) + podIssueService.HandlePodIssues() + assert.Len(t, eventsReporter.ReceivedEvents, 0) + // Pod has been missing for greater than configured period, run should get deleted + assert.Len(t, runStateStore.GetAll(), 0) +} + +func TestPodIssueService_DoesNothing_IfSubmittedPodAppears(t *testing.T) { + baseTime := time.Now() + fakeClock := clock.NewFakeClock(baseTime) + runningPod := makeRunningPod(false) + runState := createRunState(util.ExtractJobId(runningPod), util.ExtractJobRunId(runningPod), job.SuccessfulSubmission) + podIssueService, runStateStore, fakeClusterContext, eventsReporter := setupTestComponents([]*job.RunState{runState}) + podIssueService.clock = fakeClock + + podIssueService.HandlePodIssues() + // Nothing should happen, until the issue has been seen for a configured amount of time + assert.Len(t, eventsReporter.ReceivedEvents, 0) + assert.Len(t, runStateStore.GetAll(), 1) + + addPod(t, fakeClusterContext, runningPod) + fakeClock.SetTime(baseTime.Add(20 * time.Minute)) + podIssueService.HandlePodIssues() + assert.Len(t, runStateStore.GetAll(), 1) +} + +func setupTestComponents(initialRunState []*job.RunState) (*IssueHandler, *job.JobRunStateStore, *fakecontext.SyncFakeClusterContext, *mocks.FakeEventReporter) { fakeClusterContext := fakecontext.NewSyncFakeClusterContext() eventReporter := mocks.NewFakeEventReporter() pendingPodChecker := makePodChecker() - - podIssueHandler := NewPodIssueService( + runStateStore := job.NewJobRunStateStoreWithInitialState(initialRunState) + stateChecksConfig := configuration.StateChecksConfiguration{ + DeadlineForSubmittedPodConsideredMissing: time.Minute * 15, + DeadlineForActivePodConsideredMissing: time.Minute * 5, + } + + podIssueHandler := NewIssueHandler( + runStateStore, fakeClusterContext, eventReporter, + stateChecksConfig, pendingPodChecker, time.Minute*3, ) - return podIssueHandler, fakeClusterContext, eventReporter + return podIssueHandler, runStateStore, fakeClusterContext, eventReporter +} + +func createRunState(jobId string, runId string, phase job.RunPhase) *job.RunState { + return &job.RunState{ + Phase: phase, + Meta: &job.RunMeta{ + JobId: jobId, + RunId: runId, + }, + } } diff --git a/internal/executor/utilisation/cluster_utilisation.go b/internal/executor/utilisation/cluster_utilisation.go index d6893cf7487..9c7f544b311 100644 --- a/internal/executor/utilisation/cluster_utilisation.go +++ b/internal/executor/utilisation/cluster_utilisation.go @@ -281,8 +281,8 @@ func 
groupPodsByNodes(pods []*v1.Pod) map[string][]*v1.Pod { return podsByNodes } -func allocatedByPriorityAndResourceTypeFromPods(pods []*v1.Pod) schedulerobjects.QuantityByPriorityAndResourceType { - rv := make(schedulerobjects.QuantityByPriorityAndResourceType) +func allocatedByPriorityAndResourceTypeFromPods(pods []*v1.Pod) schedulerobjects.QuantityByTAndResourceType[int32] { + rv := make(schedulerobjects.QuantityByTAndResourceType[int32]) for _, pod := range pods { var priority int32 = 0 if pod.Spec.Priority != nil { diff --git a/internal/lookout/ui/package.json b/internal/lookout/ui/package.json index 969aba79a98..34ee7b8898e 100644 --- a/internal/lookout/ui/package.json +++ b/internal/lookout/ui/package.json @@ -48,6 +48,7 @@ "eslint-plugin-import": "^2.23.3", "eslint-plugin-prettier": "^3.4.0", "eslint-plugin-react": "^7.31.11", + "jest-junit": "^16.0.0", "js-yaml": "^4.0.0", "notistack": "^2.0.8", "prettier": "^2.3.0", diff --git a/internal/lookout/ui/yarn.lock b/internal/lookout/ui/yarn.lock index 2c5239df332..18267034938 100644 --- a/internal/lookout/ui/yarn.lock +++ b/internal/lookout/ui/yarn.lock @@ -7044,6 +7044,16 @@ jest-jasmine2@^27.5.1: pretty-format "^27.5.1" throat "^6.0.1" +jest-junit@^16.0.0: + version "16.0.0" + resolved "https://registry.yarnpkg.com/jest-junit/-/jest-junit-16.0.0.tgz#d838e8c561cf9fdd7eb54f63020777eee4136785" + integrity sha512-A94mmw6NfJab4Fg/BlvVOUXzXgF0XIH6EmTgJ5NDPp4xoKq0Kr7sErb+4Xs9nZvu58pJojz5RFGpqnZYJTrRfQ== + dependencies: + mkdirp "^1.0.4" + strip-ansi "^6.0.1" + uuid "^8.3.2" + xml "^1.0.1" + jest-leak-detector@^27.5.1: version "27.5.1" resolved "https://registry.yarnpkg.com/jest-leak-detector/-/jest-leak-detector-27.5.1.tgz#6ec9d54c3579dd6e3e66d70e3498adf80fde3fb8" @@ -7876,6 +7886,11 @@ mitt@^2.1.0: resolved "https://registry.yarnpkg.com/mitt/-/mitt-2.1.0.tgz#f740577c23176c6205b121b2973514eade1b2230" integrity sha512-ILj2TpLiysu2wkBbWjAmww7TkZb65aiQO+DkVdUTBpBXq+MHYiETENkKFMtsJZX1Lf4pe4QOrTSjIfUwN5lRdg== +mkdirp@^1.0.4: + version "1.0.4" + resolved "https://registry.yarnpkg.com/mkdirp/-/mkdirp-1.0.4.tgz#3eb5ed62622756d79a5f0e2a221dfebad75c2f7e" + integrity sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw== + mkdirp@~0.5.1: version "0.5.6" resolved "https://registry.yarnpkg.com/mkdirp/-/mkdirp-0.5.6.tgz#7def03d2432dcae4ba1d611445c48396062255f6" @@ -10969,6 +10984,11 @@ xml-name-validator@^3.0.0: resolved "https://registry.yarnpkg.com/xml-name-validator/-/xml-name-validator-3.0.0.tgz#6ae73e06de4d8c6e47f9fb181f78d648ad457c6a" integrity sha512-A5CUptxDsvxKJEU3yO6DuWBSJz/qizqzJKOMIfUJHETbBw/sFaDxgd6fxm1ewUaM0jZ444Fc5vC5ROYurg/4Pw== +xml@^1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/xml/-/xml-1.0.1.tgz#78ba72020029c5bc87b8a81a3cfcd74b4a2fc1e5" + integrity sha512-huCv9IH9Tcf95zuYCsQraZtWnJvBtLVE0QHMOs8bWyZAFZNDcYjsPq1nEx8jKA9y+Beo9v+7OBPRisQTjinQMw== + xmlchars@^2.2.0: version "2.2.0" resolved "https://registry.yarnpkg.com/xmlchars/-/xmlchars-2.2.0.tgz#060fe1bcb7f9c76fe2a17db86a9bc3ab894210cb" diff --git a/internal/lookoutv2/conversions/convert_test.go b/internal/lookoutv2/conversions/convert_test.go index 9d5649156ac..32130e63ff3 100644 --- a/internal/lookoutv2/conversions/convert_test.go +++ b/internal/lookoutv2/conversions/convert_test.go @@ -86,16 +86,22 @@ var ( } swaggerGroup = &models.Group{ - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "averageTimeInState": "3d", + "state": map[string]int{ + "QUEUED": 321, + }, }, Count: 1000, Name: "queue-1", } group = 
&model.JobGroup{ - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "averageTimeInState": "3d", + "state": map[string]int{ + "QUEUED": 321, + }, }, Count: 1000, Name: "queue-1", diff --git a/internal/lookoutv2/gen/models/group.go b/internal/lookoutv2/gen/models/group.go index 71adda73be1..25c8d68892a 100644 --- a/internal/lookoutv2/gen/models/group.go +++ b/internal/lookoutv2/gen/models/group.go @@ -21,7 +21,7 @@ type Group struct { // aggregates // Required: true - Aggregates map[string]string `json:"aggregates"` + Aggregates map[string]interface{} `json:"aggregates"` // count // Required: true @@ -61,6 +61,14 @@ func (m *Group) validateAggregates(formats strfmt.Registry) error { return err } + for k := range m.Aggregates { + + if err := validate.Required("aggregates"+"."+k, "body", m.Aggregates[k]); err != nil { + return err + } + + } + return nil } diff --git a/internal/lookoutv2/gen/restapi/embedded_spec.go b/internal/lookoutv2/gen/restapi/embedded_spec.go index 629d7b45500..c57b6290da2 100644 --- a/internal/lookoutv2/gen/restapi/embedded_spec.go +++ b/internal/lookoutv2/gen/restapi/embedded_spec.go @@ -426,7 +426,7 @@ func init() { "aggregates": { "type": "object", "additionalProperties": { - "type": "string" + "type": "object" }, "x-nullable": false }, @@ -1082,7 +1082,7 @@ func init() { "aggregates": { "type": "object", "additionalProperties": { - "type": "string" + "type": "object" }, "x-nullable": false }, diff --git a/internal/lookoutv2/model/model.go b/internal/lookoutv2/model/model.go index 349541d54cb..0d22f87ec3c 100644 --- a/internal/lookoutv2/model/model.go +++ b/internal/lookoutv2/model/model.go @@ -53,7 +53,7 @@ type Run struct { } type JobGroup struct { - Aggregates map[string]string + Aggregates map[string]interface{} Count int64 Name string } diff --git a/internal/lookoutv2/repository/aggregates.go b/internal/lookoutv2/repository/aggregates.go new file mode 100644 index 00000000000..ad7c1386dba --- /dev/null +++ b/internal/lookoutv2/repository/aggregates.go @@ -0,0 +1,133 @@ +package repository + +import ( + "fmt" + + "github.com/pkg/errors" + + "github.com/armadaproject/armada/internal/common/database/lookout" + "github.com/armadaproject/armada/internal/common/util" + "github.com/armadaproject/armada/internal/lookoutv2/model" +) + +type QueryAggregator interface { + AggregateSql() (string, error) +} + +type SqlFunctionAggregator struct { + queryCol *queryColumn + sqlFunction string +} + +func NewSqlFunctionAggregator(queryCol *queryColumn, fn string) *SqlFunctionAggregator { + return &SqlFunctionAggregator{ + queryCol: queryCol, + sqlFunction: fn, + } +} + +func (qa *SqlFunctionAggregator) aggregateColName() string { + return qa.queryCol.name +} + +func (qa *SqlFunctionAggregator) AggregateSql() (string, error) { + return fmt.Sprintf("%s(%s.%s) AS %s", qa.sqlFunction, qa.queryCol.abbrev, qa.queryCol.name, qa.aggregateColName()), nil +} + +type StateCountAggregator struct { + queryCol *queryColumn + stateString string +} + +func NewStateCountAggregator(queryCol *queryColumn, stateString string) *StateCountAggregator { + return &StateCountAggregator{ + queryCol: queryCol, + stateString: stateString, + } +} + +func (qa *StateCountAggregator) aggregateColName() string { + return fmt.Sprintf("%s_%s", qa.queryCol.name, qa.stateString) +} + +func (qa *StateCountAggregator) AggregateSql() (string, error) { + stateInt, ok := lookout.JobStateOrdinalMap[lookout.JobState(qa.stateString)] + if !ok { + return "", errors.Errorf("state %s does not exist", 
qa.stateString) + } + return fmt.Sprintf( + "SUM(CASE WHEN %s.%s = %d THEN 1 ELSE 0 END) AS %s", + qa.queryCol.abbrev, qa.queryCol.name, stateInt, qa.aggregateColName(), + ), nil +} + +func GetAggregatorsForColumn(queryCol *queryColumn, aggregateType AggregateType, filters []*model.Filter) ([]QueryAggregator, error) { + switch aggregateType { + case Max: + return []QueryAggregator{NewSqlFunctionAggregator(queryCol, "MAX")}, nil + case Average: + return []QueryAggregator{NewSqlFunctionAggregator(queryCol, "AVG")}, nil + case StateCounts: + states := GetStatesForFilter(filters) + aggregators := make([]QueryAggregator, len(states)) + for i, state := range states { + aggregators[i] = NewStateCountAggregator(queryCol, state) + } + return aggregators, nil + default: + return nil, errors.Errorf("cannot determine aggregate type: %v", aggregateType) + } +} + +// GetStatesForFilter returns a list of states as string if filter for state exists +// Will always return the states in the same order, irrespective of the ordering of the states in the filter +func GetStatesForFilter(filters []*model.Filter) []string { + var stateFilter *model.Filter + for _, f := range filters { + if f.Field == stateField { + stateFilter = f + } + } + allStates := util.Map(lookout.JobStates, func(jobState lookout.JobState) string { return string(jobState) }) + if stateFilter == nil { + // If no state filter is specified, use all states + return allStates + } + + switch stateFilter.Match { + case model.MatchExact: + return []string{fmt.Sprintf("%s", stateFilter.Value)} + case model.MatchAnyOf: + strSlice, err := toStringSlice(stateFilter.Value) + if err != nil { + return allStates + } + stateStringSet := util.StringListToSet(strSlice) + // Ensuring they are in the same order + var finalStates []string + for _, state := range allStates { + if _, ok := stateStringSet[state]; ok { + finalStates = append(finalStates, state) + } + } + return finalStates + default: + return allStates + } +} + +func toStringSlice(val interface{}) ([]string, error) { + switch v := val.(type) { + case []string: + return v, nil + case []interface{}: + result := make([]string, len(v)) + for i := 0; i < len(v); i++ { + str := fmt.Sprintf("%v", v[i]) + result[i] = str + } + return result, nil + default: + return nil, errors.Errorf("failed to convert interface to string slice: %v of type %T", val, val) + } +} diff --git a/internal/lookoutv2/repository/fieldparser.go b/internal/lookoutv2/repository/fieldparser.go new file mode 100644 index 00000000000..e8ddde0996b --- /dev/null +++ b/internal/lookoutv2/repository/fieldparser.go @@ -0,0 +1,122 @@ +package repository + +import ( + "fmt" + "math" + "time" + + "github.com/jackc/pgtype" + "github.com/pkg/errors" + + "github.com/armadaproject/armada/internal/common/database/lookout" + "github.com/armadaproject/armada/internal/lookoutv2/model" +) + +type FieldParser interface { + GetField() string + GetVariableRef() interface{} + ParseValue() (interface{}, error) +} + +type LastTransitionTimeParser struct { + variable pgtype.Numeric +} + +func (fp *LastTransitionTimeParser) GetField() string { + return lastTransitionTimeField +} + +func (fp *LastTransitionTimeParser) GetVariableRef() interface{} { + return &fp.variable +} + +func (fp *LastTransitionTimeParser) ParseValue() (interface{}, error) { + var dst float64 + err := fp.variable.AssignTo(&dst) + if err != nil { + return "", err + } + t := time.Unix(int64(math.Round(dst)), 0) + return t.Format(time.RFC3339), nil +} + +type TimeParser struct { + field string + 
variable time.Time +} + +func (fp *TimeParser) GetField() string { + return fp.field +} + +func (fp *TimeParser) GetVariableRef() interface{} { + return &fp.variable +} + +func (fp *TimeParser) ParseValue() (interface{}, error) { + return fp.variable.Format(time.RFC3339), nil +} + +type StateParser struct { + variable int16 +} + +func (fp *StateParser) GetField() string { + return stateField +} + +func (fp *StateParser) GetVariableRef() interface{} { + return &fp.variable +} + +func (fp *StateParser) ParseValue() (interface{}, error) { + state, ok := lookout.JobStateMap[int(fp.variable)] + if !ok { + return "", errors.Errorf("state not found: %d", fp.variable) + } + return string(state), nil +} + +type BasicParser[T any] struct { + field string + variable T +} + +func (fp *BasicParser[T]) GetField() string { + return fp.field +} + +func (fp *BasicParser[T]) GetVariableRef() interface{} { + return &fp.variable +} + +func (fp *BasicParser[T]) ParseValue() (interface{}, error) { + return fp.variable, nil +} + +func ParserForGroup(field string) FieldParser { + switch field { + case stateField: + return &StateParser{} + default: + return &BasicParser[string]{field: field} + } +} + +func ParsersForAggregate(field string, filters []*model.Filter) ([]FieldParser, error) { + var parsers []FieldParser + switch field { + case lastTransitionTimeField: + parsers = append(parsers, &LastTransitionTimeParser{}) + case submittedField: + parsers = append(parsers, &TimeParser{field: submittedField}) + case stateField: + states := GetStatesForFilter(filters) + for _, state := range states { + parsers = append(parsers, &BasicParser[int]{field: fmt.Sprintf("%s%s", stateAggregatePrefix, state)}) + } + default: + return nil, errors.Errorf("no aggregate found for field %s", field) + } + return parsers, nil +} diff --git a/internal/lookoutv2/repository/groupjobs.go b/internal/lookoutv2/repository/groupjobs.go index 1988e4a31ce..f8fe0b37206 100644 --- a/internal/lookoutv2/repository/groupjobs.go +++ b/internal/lookoutv2/repository/groupjobs.go @@ -2,16 +2,14 @@ package repository import ( "context" - "math" - "time" + "fmt" + "strings" - "github.com/jackc/pgtype" "github.com/jackc/pgx/v4" "github.com/jackc/pgx/v4/pgxpool" "github.com/pkg/errors" "github.com/armadaproject/armada/internal/common/database" - "github.com/armadaproject/armada/internal/common/database/lookout" "github.com/armadaproject/armada/internal/common/util" "github.com/armadaproject/armada/internal/lookoutv2/model" ) @@ -39,15 +37,7 @@ type SqlGroupJobsRepository struct { lookoutTables *LookoutTables } -type scanVarInit func() interface{} - -type parserFn func(interface{}) (string, error) - -type scanContext struct { - field string - varInit scanVarInit - parser parserFn -} +const stateAggregatePrefix = "state_" func NewSqlGroupJobsRepository(db *pgxpool.Pool) *SqlGroupJobsRepository { return &SqlGroupJobsRepository{ @@ -95,7 +85,7 @@ func (r *SqlGroupJobsRepository) GroupBy( if err != nil { return err } - groups, err = rowsToGroups(groupRows, groupedField, aggregates) + groups, err = rowsToGroups(groupRows, groupedField, aggregates, filters) return err }) if err != nil { @@ -108,10 +98,10 @@ func (r *SqlGroupJobsRepository) GroupBy( }, nil } -func rowsToGroups(rows pgx.Rows, groupedField *model.GroupedField, aggregates []string) ([]*model.JobGroup, error) { +func rowsToGroups(rows pgx.Rows, groupedField *model.GroupedField, aggregates []string, filters []*model.Filter) ([]*model.JobGroup, error) { var groups []*model.JobGroup for rows.Next() { - 
jobGroup, err := scanGroup(rows, groupedField.Field, aggregates) + jobGroup, err := scanGroup(rows, groupedField.Field, aggregates, filters) if err != nil { return nil, err } @@ -120,143 +110,59 @@ func rowsToGroups(rows pgx.Rows, groupedField *model.GroupedField, aggregates [] return groups, nil } -func scanGroup(rows pgx.Rows, field string, aggregates []string) (*model.JobGroup, error) { - groupScanContext, err := groupScanContextForField(field) - if err != nil { - return nil, err - } - group := groupScanContext.varInit() +func scanGroup(rows pgx.Rows, field string, aggregates []string, filters []*model.Filter) (*model.JobGroup, error) { + groupParser := ParserForGroup(field) var count int64 - - scanContexts := make([]*scanContext, len(aggregates)) - aggregateVars := make([]interface{}, len(aggregates)) - for i, aggregate := range aggregates { - sc, err := aggregateScanContextForField(aggregate) + var aggregateParsers []FieldParser + for _, aggregate := range aggregates { + parsers, err := ParsersForAggregate(aggregate, filters) if err != nil { return nil, err } - aggregateVars[i] = sc.varInit() - scanContexts[i] = sc + aggregateParsers = append(aggregateParsers, parsers...) } - aggregateRefs := make([]interface{}, len(aggregates)) - for i := 0; i < len(aggregates); i++ { - aggregateRefs[i] = &aggregateVars[i] + aggregateRefs := make([]interface{}, len(aggregateParsers)) + for i, parser := range aggregateParsers { + aggregateRefs[i] = parser.GetVariableRef() } - varAddresses := util.Concat([]interface{}{&group, &count}, aggregateRefs) - err = rows.Scan(varAddresses...) + varAddresses := util.Concat([]interface{}{groupParser.GetVariableRef(), &count}, aggregateRefs) + err := rows.Scan(varAddresses...) if err != nil { return nil, err } - parsedGroup, err := groupScanContext.parser(group) + parsedGroup, err := groupParser.ParseValue() if err != nil { return nil, err } - aggregatesMap := make(map[string]string) - for i, sc := range scanContexts { - val := aggregateVars[i] - parsedVal, err := sc.parser(val) + aggregatesMap := make(map[string]interface{}) + for _, parser := range aggregateParsers { + val, err := parser.ParseValue() if err != nil { - return nil, errors.Wrapf(err, "failed to parse value for field %s", sc.field) + return nil, errors.Wrapf(err, "failed to parse value for field %s", parser.GetField()) + } + if strings.HasPrefix(parser.GetField(), stateAggregatePrefix) { + singleStateCount, ok := val.(int) + if !ok { + return nil, errors.Errorf("failed to parse value for state aggregate: cannot convert value to int: %v: %T", singleStateCount, singleStateCount) + } + stateCountsVal, ok := aggregatesMap[stateField] + if !ok { + stateCountsVal = map[string]int{} + aggregatesMap[stateField] = stateCountsVal + } + stateCounts, ok := stateCountsVal.(map[string]int) + if !ok { + return nil, errors.Errorf("failed to parse value for state aggregate: cannot cast state counts to map") + } + state := parser.GetField()[len(stateAggregatePrefix):] + stateCounts[state] = singleStateCount + } else { + aggregatesMap[parser.GetField()] = val } - aggregatesMap[sc.field] = parsedVal } return &model.JobGroup{ - Name: parsedGroup, + Name: fmt.Sprintf("%s", parsedGroup), Count: count, Aggregates: aggregatesMap, }, nil } - -func groupScanContextForField(field string) (*scanContext, error) { - switch field { - case stateField: - return &scanContext{ - field: field, - varInit: int16ScanVar, - parser: stateParser, - }, nil - default: - return &scanContext{ - field: field, - varInit: stringScanVar, - parser: 
stringParser, - }, nil - } -} - -func aggregateScanContextForField(field string) (*scanContext, error) { - switch field { - case lastTransitionTimeField: - return &scanContext{ - field: lastTransitionTimeField, - varInit: numericScanVar, - parser: avgLastTransitionTimeParser, - }, nil - case submittedField: - return &scanContext{ - field: submittedField, - varInit: timeScanVar, - parser: maxSubmittedTimeParser, - }, nil - default: - return nil, errors.Errorf("no aggregate found for field %s", field) - } -} - -func stringScanVar() interface{} { - return "" -} - -func int16ScanVar() interface{} { - return int16(0) -} - -func numericScanVar() interface{} { - return pgtype.Numeric{} -} - -func timeScanVar() interface{} { - return time.Time{} -} - -func avgLastTransitionTimeParser(val interface{}) (string, error) { - lastTransitionTimeSeconds, ok := val.(pgtype.Numeric) - if !ok { - return "", errors.Errorf("could not convert %v: %T to int64", val, val) - } - var dst float64 - err := lastTransitionTimeSeconds.AssignTo(&dst) - if err != nil { - return "", err - } - t := time.Unix(int64(math.Round(dst)), 0) - return t.Format(time.RFC3339), nil -} - -func maxSubmittedTimeParser(val interface{}) (string, error) { - maxSubmittedTime, ok := val.(time.Time) - if !ok { - return "", errors.Errorf("could not convert %v: %T to time", val, val) - } - return maxSubmittedTime.Format(time.RFC3339), nil -} - -func stateParser(val interface{}) (string, error) { - stateInt, ok := val.(int16) - if !ok { - return "", errors.Errorf("could not convert %v: %T to int for state", val, val) - } - state, ok := lookout.JobStateMap[int(stateInt)] - if !ok { - return "", errors.Errorf("state not found: %d", stateInt) - } - return string(state), nil -} - -func stringParser(val interface{}) (string, error) { - str, ok := val.(string) - if !ok { - return "", errors.Errorf("could not convert %v: %T to string", val, val) - } - return str, nil -} diff --git a/internal/lookoutv2/repository/groupjobs_test.go b/internal/lookoutv2/repository/groupjobs_test.go index 29fb24a507c..2ca98fd8a26 100644 --- a/internal/lookoutv2/repository/groupjobs_test.go +++ b/internal/lookoutv2/repository/groupjobs_test.go @@ -59,17 +59,17 @@ func TestGroupByQueue(t *testing.T) { { Name: "queue-1", Count: 10, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, { Name: "queue-2", Count: 5, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, { Name: "queue-3", Count: 3, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, }) return nil @@ -117,17 +117,17 @@ func TestGroupByJobSet(t *testing.T) { { Name: "job-set-1", Count: 10, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, { Name: "job-set-2", Count: 5, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, { Name: "job-set-3", Count: 3, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, }) return nil @@ -183,22 +183,22 @@ func TestGroupByState(t *testing.T) { { Name: string(lookout.JobQueued), Count: 10, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, { Name: string(lookout.JobPending), Count: 5, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, { Name: string(lookout.JobRunning), Count: 3, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, { Name: string(lookout.JobFailed), Count: 2, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, }) 
return nil @@ -370,22 +370,22 @@ func TestGroupByWithFilters(t *testing.T) { { Name: string(lookout.JobQueued), Count: 10, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, { Name: string(lookout.JobPending), Count: 5, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, { Name: string(lookout.JobRunning), Count: 3, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, { Name: string(lookout.JobFailed), Count: 2, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, }) return nil @@ -468,21 +468,21 @@ func TestGroupJobsWithMaxSubmittedTime(t *testing.T) { { Name: "job-set-1", Count: 15, - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "submitted": baseTime.Format(time.RFC3339), }, }, { Name: "job-set-2", Count: 12, - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "submitted": baseTime.Add(-4 * time.Minute).Format(time.RFC3339), }, }, { Name: "job-set-3", Count: 18, - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "submitted": baseTime.Add(-7 * time.Minute).Format(time.RFC3339), }, }, @@ -567,21 +567,21 @@ func TestGroupJobsWithAvgLastTransitionTime(t *testing.T) { { Name: "queue-3", Count: 18, - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "lastTransitionTime": baseTime.Add(-8 * time.Minute).Format(time.RFC3339), }, }, { Name: "queue-2", Count: 12, - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "lastTransitionTime": baseTime.Add(-5 * time.Minute).Format(time.RFC3339), }, }, { Name: "queue-1", Count: 15, - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "lastTransitionTime": baseTime.Add(-1 * time.Minute).Format(time.RFC3339), }, }, @@ -591,6 +591,237 @@ func TestGroupJobsWithAvgLastTransitionTime(t *testing.T) { assert.NoError(t, err) } +func TestGroupJobsWithAllStateCounts(t *testing.T) { + err := lookout.WithLookoutDb(func(db *pgxpool.Pool) error { + converter := instructions.NewInstructionConverter(metrics.Get(), userAnnotationPrefix, &compress.NoOpCompressor{}, false) + store := lookoutdb.NewLookoutDb(db, metrics.Get(), 3, 10) + + manyJobs(5, &createJobsOpts{ + queue: "queue-1", + jobSet: "job-set-1", + state: lookout.JobQueued, + }, converter, store) + manyJobs(6, &createJobsOpts{ + queue: "queue-1", + jobSet: "job-set-1", + state: lookout.JobPending, + }, converter, store) + manyJobs(7, &createJobsOpts{ + queue: "queue-1", + jobSet: "job-set-1", + state: lookout.JobRunning, + }, converter, store) + + manyJobs(8, &createJobsOpts{ + queue: "queue-2", + jobSet: "job-set-2", + state: lookout.JobLeased, + }, converter, store) + manyJobs(9, &createJobsOpts{ + queue: "queue-2", + jobSet: "job-set-2", + state: lookout.JobPreempted, + }, converter, store) + manyJobs(10, &createJobsOpts{ + queue: "queue-2", + jobSet: "job-set-2", + state: lookout.JobCancelled, + }, converter, store) + + manyJobs(11, &createJobsOpts{ + queue: "queue-3", + jobSet: "job-set-3", + state: lookout.JobSucceeded, + }, converter, store) + manyJobs(12, &createJobsOpts{ + queue: "queue-3", + jobSet: "job-set-3", + state: lookout.JobFailed, + }, converter, store) + manyJobs(13, &createJobsOpts{ + queue: "queue-3", + jobSet: "job-set-3", + state: lookout.JobQueued, + }, converter, store) + + repo := NewSqlGroupJobsRepository(db) + result, err := repo.GroupBy( + context.TODO(), + []*model.Filter{}, + &model.Order{ + Field: "count", + Direction: "ASC", + }, + &model.GroupedField{ + Field: 
"jobSet", + }, + []string{"state"}, + 0, + 10, + ) + assert.NoError(t, err) + assert.Len(t, result.Groups, 3) + assert.Equal(t, 3, result.Count) + assert.Equal(t, []*model.JobGroup{ + { + Name: "job-set-1", + Count: 18, + Aggregates: map[string]interface{}{ + "state": map[string]int{ + string(lookout.JobQueued): 5, + string(lookout.JobLeased): 0, + string(lookout.JobPending): 6, + string(lookout.JobRunning): 7, + string(lookout.JobSucceeded): 0, + string(lookout.JobFailed): 0, + string(lookout.JobCancelled): 0, + string(lookout.JobPreempted): 0, + }, + }, + }, + { + Name: "job-set-2", + Count: 27, + Aggregates: map[string]interface{}{ + "state": map[string]int{ + string(lookout.JobQueued): 0, + string(lookout.JobLeased): 8, + string(lookout.JobPending): 0, + string(lookout.JobRunning): 0, + string(lookout.JobSucceeded): 0, + string(lookout.JobFailed): 0, + string(lookout.JobCancelled): 10, + string(lookout.JobPreempted): 9, + }, + }, + }, + { + Name: "job-set-3", + Count: 36, + Aggregates: map[string]interface{}{ + "state": map[string]int{ + string(lookout.JobQueued): 13, + string(lookout.JobLeased): 0, + string(lookout.JobPending): 0, + string(lookout.JobRunning): 0, + string(lookout.JobSucceeded): 11, + string(lookout.JobFailed): 12, + string(lookout.JobCancelled): 0, + string(lookout.JobPreempted): 0, + }, + }, + }, + }, result.Groups) + return nil + }) + assert.NoError(t, err) +} + +func TestGroupJobsWithFilteredStateCounts(t *testing.T) { + err := lookout.WithLookoutDb(func(db *pgxpool.Pool) error { + converter := instructions.NewInstructionConverter(metrics.Get(), userAnnotationPrefix, &compress.NoOpCompressor{}, false) + store := lookoutdb.NewLookoutDb(db, metrics.Get(), 3, 10) + + manyJobs(5, &createJobsOpts{ + queue: "queue-1", + jobSet: "job-set-1", + state: lookout.JobQueued, + }, converter, store) + manyJobs(6, &createJobsOpts{ + queue: "queue-1", + jobSet: "job-set-1", + state: lookout.JobPending, + }, converter, store) + manyJobs(7, &createJobsOpts{ + queue: "queue-1", + jobSet: "job-set-1", + state: lookout.JobRunning, + }, converter, store) + + manyJobs(9, &createJobsOpts{ + queue: "queue-2", + jobSet: "job-set-2", + state: lookout.JobPreempted, + }, converter, store) + manyJobs(10, &createJobsOpts{ + queue: "queue-2", + jobSet: "job-set-2", + state: lookout.JobCancelled, + }, converter, store) + + manyJobs(11, &createJobsOpts{ + queue: "queue-3", + jobSet: "job-set-3", + state: lookout.JobSucceeded, + }, converter, store) + manyJobs(12, &createJobsOpts{ + queue: "queue-3", + jobSet: "job-set-3", + state: lookout.JobFailed, + }, converter, store) + manyJobs(13, &createJobsOpts{ + queue: "queue-3", + jobSet: "job-set-3", + state: lookout.JobQueued, + }, converter, store) + + repo := NewSqlGroupJobsRepository(db) + result, err := repo.GroupBy( + context.TODO(), + []*model.Filter{ + { + Field: stateField, + Match: model.MatchAnyOf, + Value: []string{ + string(lookout.JobQueued), + string(lookout.JobPending), + string(lookout.JobRunning), + }, + }, + }, + &model.Order{ + Field: "count", + Direction: "DESC", + }, + &model.GroupedField{ + Field: "jobSet", + }, + []string{"state"}, + 0, + 10, + ) + assert.NoError(t, err) + assert.Len(t, result.Groups, 2) + assert.Equal(t, 2, result.Count) + assert.Equal(t, []*model.JobGroup{ + { + Name: "job-set-1", + Count: 18, + Aggregates: map[string]interface{}{ + "state": map[string]int{ + string(lookout.JobQueued): 5, + string(lookout.JobPending): 6, + string(lookout.JobRunning): 7, + }, + }, + }, + { + Name: "job-set-3", + Count: 13, + 
Aggregates: map[string]interface{}{ + "state": map[string]int{ + string(lookout.JobQueued): 13, + string(lookout.JobPending): 0, + string(lookout.JobRunning): 0, + }, + }, + }, + }, result.Groups) + return nil + }) + assert.NoError(t, err) +} + func TestGroupJobsComplex(t *testing.T) { err := lookout.WithLookoutDb(func(db *pgxpool.Pool) error { converter := instructions.NewInstructionConverter(metrics.Get(), userAnnotationPrefix, &compress.NoOpCompressor{}, true) @@ -709,7 +940,7 @@ func TestGroupJobsComplex(t *testing.T) { { Name: "job-set-2", Count: 2, - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "submitted": baseTime.Add(20 * time.Minute).Format(time.RFC3339), "lastTransitionTime": baseTime.Add(50 * time.Minute).Format(time.RFC3339), }, @@ -717,7 +948,7 @@ func TestGroupJobsComplex(t *testing.T) { { Name: "job-set-1", Count: 15, - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "submitted": baseTime.Add(3 * time.Minute).Format(time.RFC3339), "lastTransitionTime": baseTime.Add(5 * time.Minute).Format(time.RFC3339), }, @@ -778,17 +1009,17 @@ func TestGroupByAnnotation(t *testing.T) { { Name: "test-value-1", Count: 10, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, { Name: "test-value-2", Count: 5, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, { Name: "test-value-3", Count: 3, - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, }, }) return nil @@ -907,7 +1138,7 @@ func TestGroupByAnnotationWithFiltersAndAggregates(t *testing.T) { { Name: "4", Count: 2, - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "submitted": baseTime.Add(20 * time.Minute).Format(time.RFC3339), "lastTransitionTime": baseTime.Add(50 * time.Minute).Format(time.RFC3339), }, @@ -915,7 +1146,7 @@ func TestGroupByAnnotationWithFiltersAndAggregates(t *testing.T) { { Name: "2", Count: 5, - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "submitted": baseTime.Add(1 * time.Minute).Format(time.RFC3339), "lastTransitionTime": baseTime.Add(10 * time.Minute).Format(time.RFC3339), }, @@ -923,7 +1154,7 @@ func TestGroupByAnnotationWithFiltersAndAggregates(t *testing.T) { { Name: "3", Count: 5, - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "submitted": baseTime.Add(3 * time.Minute).Format(time.RFC3339), "lastTransitionTime": baseTime.Add(5 * time.Minute).Format(time.RFC3339), }, @@ -931,7 +1162,7 @@ func TestGroupByAnnotationWithFiltersAndAggregates(t *testing.T) { { Name: "1", Count: 5, - Aggregates: map[string]string{ + Aggregates: map[string]interface{}{ "submitted": baseTime.Format(time.RFC3339), "lastTransitionTime": baseTime.Format(time.RFC3339), }, @@ -960,7 +1191,7 @@ func TestGroupJobsSkip(t *testing.T) { return &model.JobGroup{ Name: fmt.Sprintf("queue-%d", i), Count: int64(i), - Aggregates: map[string]string{}, + Aggregates: map[string]interface{}{}, } } @@ -1160,12 +1391,20 @@ func getCreateJobsFn(state lookout.JobState) createJobsFn { switch state { case lookout.JobQueued: return makeQueued + case lookout.JobLeased: + return makeLeased case lookout.JobPending: return makePending case lookout.JobRunning: return makeRunning + case lookout.JobSucceeded: + return makeSucceeded case lookout.JobFailed: return makeFailed + case lookout.JobCancelled: + return makeCancelled + case lookout.JobPreempted: + return makePreempted default: return makeQueued } @@ -1186,6 +1425,23 @@ func makeQueued(opts *createJobsOpts, converter 
*instructions.InstructionConvert Build() } +func makeLeased(opts *createJobsOpts, converter *instructions.InstructionConverter, store *lookoutdb.LookoutDb) { + tSubmit := baseTime + if opts.submittedTime != nil { + tSubmit = *opts.submittedTime + } + lastTransitionTime := baseTime + if opts.lastTransitionTime != nil { + lastTransitionTime = *opts.lastTransitionTime + } + NewJobSimulator(converter, store). + Submit(opts.queue, opts.jobSet, owner, tSubmit, &JobOptions{ + Annotations: opts.annotations, + }). + Lease(uuid.NewString(), lastTransitionTime). + Build() +} + func makePending(opts *createJobsOpts, converter *instructions.InstructionConverter, store *lookoutdb.LookoutDb) { tSubmit := baseTime if opts.submittedTime != nil { @@ -1222,6 +1478,27 @@ func makeRunning(opts *createJobsOpts, converter *instructions.InstructionConver Build() } +func makeSucceeded(opts *createJobsOpts, converter *instructions.InstructionConverter, store *lookoutdb.LookoutDb) { + tSubmit := baseTime + if opts.submittedTime != nil { + tSubmit = *opts.submittedTime + } + lastTransitionTime := baseTime + if opts.lastTransitionTime != nil { + lastTransitionTime = *opts.lastTransitionTime + } + runId := uuid.NewString() + NewJobSimulator(converter, store). + Submit(opts.queue, opts.jobSet, owner, tSubmit, &JobOptions{ + Annotations: opts.annotations, + }). + Pending(runId, cluster, lastTransitionTime.Add(-2*time.Minute)). + Running(runId, cluster, lastTransitionTime.Add(-1*time.Minute)). + RunSucceeded(runId, lastTransitionTime). + Succeeded(lastTransitionTime). + Build() +} + func makeFailed(opts *createJobsOpts, converter *instructions.InstructionConverter, store *lookoutdb.LookoutDb) { tSubmit := baseTime if opts.submittedTime != nil { @@ -1242,3 +1519,40 @@ func makeFailed(opts *createJobsOpts, converter *instructions.InstructionConvert Failed(node, 1, "error", lastTransitionTime). Build() } + +func makeCancelled(opts *createJobsOpts, converter *instructions.InstructionConverter, store *lookoutdb.LookoutDb) { + tSubmit := baseTime + if opts.submittedTime != nil { + tSubmit = *opts.submittedTime + } + lastTransitionTime := baseTime + if opts.lastTransitionTime != nil { + lastTransitionTime = *opts.lastTransitionTime + } + NewJobSimulator(converter, store). + Submit(opts.queue, opts.jobSet, owner, tSubmit, &JobOptions{ + Annotations: opts.annotations, + }). + Cancelled(lastTransitionTime). + Build() +} + +func makePreempted(opts *createJobsOpts, converter *instructions.InstructionConverter, store *lookoutdb.LookoutDb) { + tSubmit := baseTime + if opts.submittedTime != nil { + tSubmit = *opts.submittedTime + } + lastTransitionTime := baseTime + if opts.lastTransitionTime != nil { + lastTransitionTime = *opts.lastTransitionTime + } + runId := uuid.NewString() + NewJobSimulator(converter, store). + Submit(opts.queue, opts.jobSet, owner, tSubmit, &JobOptions{ + Annotations: opts.annotations, + }). + Pending(runId, cluster, lastTransitionTime.Add(-2*time.Minute)). + Running(runId, cluster, lastTransitionTime.Add(-1*time.Minute)). + Preempted(lastTransitionTime). 
+ Build() +} diff --git a/internal/lookoutv2/repository/common.go b/internal/lookoutv2/repository/querybuilder.go similarity index 95% rename from internal/lookoutv2/repository/common.go rename to internal/lookoutv2/repository/querybuilder.go index 33e1725db02..c0999dbd5dd 100644 --- a/internal/lookoutv2/repository/common.go +++ b/internal/lookoutv2/repository/querybuilder.go @@ -58,14 +58,6 @@ type queryOrder struct { direction string } -// Get aggregation expression for column, e.g. MAX(j.submitted) -type aggregatorFn func(column *queryColumn) string - -type queryAggregator struct { - column *queryColumn - aggregator aggregatorFn -} - func NewQueryBuilder(lookoutTables *LookoutTables) *QueryBuilder { return &QueryBuilder{ lookoutTables: lookoutTables, @@ -368,11 +360,14 @@ func (qb *QueryBuilder) GroupBy( if err != nil { return nil, err } - queryAggregators, err := qb.getQueryAggregators(aggregates, queryTables) + queryAggregators, err := qb.getQueryAggregators(aggregates, normalFilters, queryTables) + if err != nil { + return nil, err + } + selectListSql, err := qb.getAggregatesSql(queryAggregators) if err != nil { return nil, err } - selectListSql := qb.getAggregatesSql(queryAggregators) orderSql, err := qb.groupByOrderSql(order) if err != nil { return nil, err @@ -912,9 +907,9 @@ func (qb *QueryBuilder) highestPrecedenceTableForColumn(col string, queryTables return selectedTable, nil } -func (qb *QueryBuilder) getQueryAggregators(aggregates []string, queryTables map[string]bool) ([]*queryAggregator, error) { - queryAggregators := make([]*queryAggregator, len(aggregates)) - for i, aggregate := range aggregates { +func (qb *QueryBuilder) getQueryAggregators(aggregates []string, filters []*model.Filter, queryTables map[string]bool) ([]QueryAggregator, error) { + var queryAggregators []QueryAggregator + for _, aggregate := range aggregates { col, err := qb.lookoutTables.ColumnFromField(aggregate) if err != nil { return nil, err @@ -927,25 +922,25 @@ func (qb *QueryBuilder) getQueryAggregators(aggregates []string, queryTables map if err != nil { return nil, err } - fn, err := getAggregatorFn(aggregateType) + newQueryAggregators, err := GetAggregatorsForColumn(qc, aggregateType, filters) if err != nil { return nil, err } - queryAggregators[i] = &queryAggregator{ - column: qc, - aggregator: fn, - } + queryAggregators = append(queryAggregators, newQueryAggregators...) 
} return queryAggregators, nil } -func (qb *QueryBuilder) getAggregatesSql(aggregators []*queryAggregator) string { +func (qb *QueryBuilder) getAggregatesSql(aggregators []QueryAggregator) (string, error) { selectList := []string{"COUNT(*) AS count"} for _, agg := range aggregators { - sql := fmt.Sprintf("%s AS %s", agg.aggregator(agg.column), agg.column.name) + sql, err := agg.AggregateSql() + if err != nil { + return "", err + } selectList = append(selectList, sql) } - return strings.Join(selectList, ", ") + return strings.Join(selectList, ", "), nil } func (qb *QueryBuilder) groupByOrderSql(order *model.Order) (string, error) { @@ -962,23 +957,6 @@ func (qb *QueryBuilder) groupByOrderSql(order *model.Order) (string, error) { return fmt.Sprintf("ORDER BY %s %s", col, order.Direction), nil } -func getAggregatorFn(aggregateType AggregateType) (aggregatorFn, error) { - switch aggregateType { - case Max: - return func(col *queryColumn) string { - return fmt.Sprintf("MAX(%s.%s)", col.abbrev, col.name) - }, nil - case Average: - return func(col *queryColumn) string { - return fmt.Sprintf("AVG(%s.%s)", col.abbrev, col.name) - }, nil - case Unknown: - return nil, errors.New("unknown aggregate type") - default: - return nil, errors.Errorf("cannot determine aggregate type: %v", aggregateType) - } -} - func (qb *QueryBuilder) getQueryColumn(col string, queryTables map[string]bool) (*queryColumn, error) { table, err := qb.highestPrecedenceTableForColumn(col, queryTables) if err != nil { diff --git a/internal/lookoutv2/repository/common_test.go b/internal/lookoutv2/repository/querybuilder_test.go similarity index 88% rename from internal/lookoutv2/repository/common_test.go rename to internal/lookoutv2/repository/querybuilder_test.go index 3fe2dd708c5..aa15d3b82c0 100644 --- a/internal/lookoutv2/repository/common_test.go +++ b/internal/lookoutv2/repository/querybuilder_test.go @@ -7,6 +7,7 @@ import ( "github.com/stretchr/testify/assert" + "github.com/armadaproject/armada/internal/common/database/lookout" "github.com/armadaproject/armada/internal/common/util" "github.com/armadaproject/armada/internal/lookoutv2/model" ) @@ -446,6 +447,64 @@ func TestQueryBuilder_GroupByMultipleAggregates(t *testing.T) { assert.Equal(t, []interface{}{"test\\queue", "1234", "abcd", "test\\queue", "5678", "efgh%", "test\\queue", "anon\\\\one%"}, query.Args) } +func TestQueryBuilder_GroupByStateAggregates(t *testing.T) { + stateFilter := &model.Filter{ + Field: "state", + Match: model.MatchAnyOf, + Value: []string{ + string(lookout.JobQueued), + string(lookout.JobLeased), + string(lookout.JobPending), + string(lookout.JobRunning), + }, + } + query, err := NewQueryBuilder(NewTables()).GroupBy( + append(testFilters, stateFilter), + &model.Order{ + Direction: "DESC", + Field: "lastTransitionTime", + }, + &model.GroupedField{ + Field: "jobSet", + }, + []string{ + "lastTransitionTime", + "submitted", + "state", + }, + 20, + 100, + ) + assert.NoError(t, err) + assert.Equal(t, splitByWhitespace(` + SELECT j.jobset, + COUNT(*) AS count, + AVG(j.last_transition_time_seconds) AS last_transition_time_seconds, + MAX(j.submitted) AS submitted, + SUM(CASE WHEN j.state = 1 THEN 1 ELSE 0 END) AS state_QUEUED, + SUM(CASE WHEN j.state = 8 THEN 1 ELSE 0 END) AS state_LEASED, + SUM(CASE WHEN j.state = 2 THEN 1 ELSE 0 END) AS state_PENDING, + SUM(CASE WHEN j.state = 3 THEN 1 ELSE 0 END) AS state_RUNNING + FROM job AS j + INNER JOIN ( + SELECT job_id + FROM user_annotation_lookup + WHERE queue = $1 AND key = $2 AND value = $3 + ) AS ual0 ON 
j.job_id = ual0.job_id + INNER JOIN ( + SELECT job_id + FROM user_annotation_lookup + WHERE queue = $4 AND key = $5 AND value LIKE $6 + ) AS ual1 ON j.job_id = ual1.job_id + WHERE j.queue = $7 AND j.owner LIKE $8 AND j.state IN ($9, $10, $11, $12) + GROUP BY j.jobset + ORDER BY last_transition_time_seconds DESC + LIMIT 100 OFFSET 20 + `), + splitByWhitespace(query.Sql)) + assert.Equal(t, []interface{}{"test\\queue", "1234", "abcd", "test\\queue", "5678", "efgh%", "test\\queue", "anon\\\\one%", 1, 8, 2, 3}, query.Args) +} + func TestQueryBuilder_GroupByAnnotationMultipleAggregates(t *testing.T) { query, err := NewQueryBuilder(NewTables()).GroupBy( testFilters, diff --git a/internal/lookoutv2/repository/tables.go b/internal/lookoutv2/repository/tables.go index 4633620ec31..779f53fc854 100644 --- a/internal/lookoutv2/repository/tables.go +++ b/internal/lookoutv2/repository/tables.go @@ -41,9 +41,10 @@ const ( type AggregateType int const ( - Unknown AggregateType = -1 - Max = 0 - Average = 1 + Unknown AggregateType = -1 + Max = 0 + Average = 1 + StateCounts = 2 ) type LookoutTables struct { @@ -134,6 +135,7 @@ func NewTables() *LookoutTables { groupAggregates: map[string]AggregateType{ submittedCol: Max, lastTransitionTimeCol: Average, + stateCol: StateCounts, }, } } diff --git a/internal/lookoutv2/repository/util.go b/internal/lookoutv2/repository/util.go index 2b7d8820e38..00af6da2b06 100644 --- a/internal/lookoutv2/repository/util.go +++ b/internal/lookoutv2/repository/util.go @@ -166,6 +166,30 @@ func (js *JobSimulator) Submit(queue, jobSet, owner string, timestamp time.Time, return js } +func (js *JobSimulator) Lease(runId string, timestamp time.Time) *JobSimulator { + ts := timestampOrNow(timestamp) + leasedEvent := &armadaevents.EventSequence_Event{ + Created: &ts, + Event: &armadaevents.EventSequence_Event_JobRunLeased{ + JobRunLeased: &armadaevents.JobRunLeased{ + RunId: armadaevents.ProtoUuidFromUuid(uuid.MustParse(runId)), + JobId: js.jobId, + }, + }, + } + js.events = append(js.events, leasedEvent) + + js.job.LastActiveRunId = &runId + js.job.LastTransitionTime = ts + js.job.State = string(lookout.JobLeased) + updateRun(js.job, &runPatch{ + runId: runId, + jobRunState: pointer.String(string(lookout.JobRunLeased)), + pending: &ts, + }) + return js +} + func (js *JobSimulator) Pending(runId string, cluster string, timestamp time.Time) *JobSimulator { ts := timestampOrNow(timestamp) assignedEvent := &armadaevents.EventSequence_Event{ @@ -417,6 +441,31 @@ func (js *JobSimulator) Failed(node string, exitCode int32, message string, time return js } +func (js *JobSimulator) Preempted(timestamp time.Time) *JobSimulator { + ts := timestampOrNow(timestamp) + jobIdProto, err := armadaevents.ProtoUuidFromUlidString(util.NewULID()) + if err != nil { + log.WithError(err).Errorf("Could not convert job ID to UUID: %s", util.NewULID()) + } + + preempted := &armadaevents.EventSequence_Event{ + Created: &ts, + Event: &armadaevents.EventSequence_Event_JobRunPreempted{ + JobRunPreempted: &armadaevents.JobRunPreempted{ + PreemptedJobId: js.jobId, + PreemptiveJobId: jobIdProto, + PreemptedRunId: armadaevents.ProtoUuidFromUuid(uuid.MustParse(uuid.NewString())), + PreemptiveRunId: armadaevents.ProtoUuidFromUuid(uuid.MustParse(uuid.NewString())), + }, + }, + } + js.events = append(js.events, preempted) + + js.job.LastTransitionTime = ts + js.job.State = string(lookout.JobPreempted) + return js +} + func (js *JobSimulator) RunTerminated(runId string, cluster string, node string, message string, timestamp 
time.Time) *JobSimulator { ts := timestampOrNow(timestamp) terminated := &armadaevents.EventSequence_Event{ diff --git a/internal/lookoutv2/swagger.yaml b/internal/lookoutv2/swagger.yaml index 1b81fffe86d..6a18a4dc1e9 100644 --- a/internal/lookoutv2/swagger.yaml +++ b/internal/lookoutv2/swagger.yaml @@ -178,7 +178,7 @@ definitions: aggregates: type: object additionalProperties: - type: string + type: object x-nullable: false filter: type: object diff --git a/internal/scheduler/api.go b/internal/scheduler/api.go index d0ded087588..2e869782731 100644 --- a/internal/scheduler/api.go +++ b/internal/scheduler/api.go @@ -24,16 +24,25 @@ import ( "github.com/armadaproject/armada/pkg/executorapi" ) -// ExecutorApi is a gRPC service that exposes functionality required by the armada executors +// ExecutorApi is the gRPC service executors use to synchronise their state with that of the scheduler. type ExecutorApi struct { - producer pulsar.Producer - jobRepository database.JobRepository - executorRepository database.ExecutorRepository - legacyExecutorRepository database.ExecutorRepository - allowedPriorities []int32 // allowed priority classes - maxJobsPerCall uint // maximum number of jobs that will be leased in a single call - maxPulsarMessageSize uint // maximum sizer of pulsar messages produced - nodeIdLabel string + // Used to send Pulsar messages when, e.g., executors report a job has finished. + producer pulsar.Producer + // Interface to the component storing job information, such as which jobs are leased to a particular executor. + jobRepository database.JobRepository + // Interface to the component storing executor information, such as which when we last heard from an executor. + executorRepository database.ExecutorRepository + // Like executorRepository + legacyExecutorRepository database.ExecutorRepository + // Allowed priority class priorities. + allowedPriorities []int32 + // Max number of job leases sent per call to LeaseJobRuns. + maxJobsPerCall uint + // Max size of Pulsar messages produced. + maxPulsarMessageSizeBytes uint + // See scheduling config. + nodeIdLabel string + // See scheduling config. priorityClassNameOverride *string clock clock.Clock } @@ -46,6 +55,7 @@ func NewExecutorApi(producer pulsar.Producer, maxJobsPerCall uint, nodeIdLabel string, priorityClassNameOverride *string, + maxPulsarMessageSizeBytes uint, ) (*ExecutorApi, error) { if len(allowedPriorities) == 0 { return nil, errors.New("allowedPriorities cannot be empty") @@ -60,60 +70,56 @@ func NewExecutorApi(producer pulsar.Producer, legacyExecutorRepository: legacyExecutorRepository, allowedPriorities: allowedPriorities, maxJobsPerCall: maxJobsPerCall, - maxPulsarMessageSize: 1024 * 1024 * 2, + maxPulsarMessageSizeBytes: maxPulsarMessageSizeBytes, nodeIdLabel: nodeIdLabel, priorityClassNameOverride: priorityClassNameOverride, clock: clock.RealClock{}, }, nil } -// LeaseJobRuns performs the following actions: -// - Stores the request in postgres so that the scheduler can use the job + capacity information in the next scheduling round -// - Determines if any of the job runs in the request are no longer active and should be cancelled -// - Determines if any new job runs should be leased to the executor +// LeaseJobRuns reconciles the state of the executor with that of the scheduler. Specifically it: +// 1. Stores job and capacity information received from the executor to make it available to the scheduler. +// 2. 
Notifies the executor if any of its jobs are no longer active, e.g., due to being preempted by the scheduler. +// 3. Transfers any jobs scheduled on this executor cluster that the executor don't already have. func (srv *ExecutorApi) LeaseJobRuns(stream executorapi.ExecutorApi_LeaseJobRunsServer) error { - ctx := stream.Context() - log := ctxlogrus.Extract(ctx) // Receive once to get info necessary to get jobs to lease. req, err := stream.Recv() if err != nil { return errors.WithStack(err) } - log.Infof("Handling lease request for executor %s", req.ExecutorId) + ctx := stream.Context() + log := ctxlogrus.Extract(ctx) + log = log.WithField("executor", req.ExecutorId) - // store the executor state for use by the scheduler - executorState := srv.createExecutorState(ctx, req) - if err = srv.executorRepository.StoreExecutor(stream.Context(), executorState); err != nil { + executor := srv.executorFromLeaseRequest(ctx, req) + if err := srv.executorRepository.StoreExecutor(ctx, executor); err != nil { return err } - - // store the executor state for the legacy executor to use - if err = srv.legacyExecutorRepository.StoreExecutor(stream.Context(), executorState); err != nil { + if err = srv.legacyExecutorRepository.StoreExecutor(ctx, executor); err != nil { return err } - requestRuns, err := extractRunIds(req) + requestRuns, err := runIdsFromLeaseRequest(req) if err != nil { return err } - log.Debugf("Executor is currently aware of %d job runs", len(requestRuns)) - - runsToCancel, err := srv.jobRepository.FindInactiveRuns(stream.Context(), requestRuns) + runsToCancel, err := srv.jobRepository.FindInactiveRuns(ctx, requestRuns) if err != nil { return err } - log.Debugf("Detected %d runs that need cancelling", len(runsToCancel)) - - // Fetch new leases from the db - leases, err := srv.jobRepository.FetchJobRunLeases(stream.Context(), req.ExecutorId, srv.maxJobsPerCall, requestRuns) + newRuns, err := srv.jobRepository.FetchJobRunLeases(ctx, req.ExecutorId, srv.maxJobsPerCall, requestRuns) if err != nil { return err } + log.Infof( + "executor currently has %d job runs; sending %d cancellations and %d new runs", + len(requestRuns), len(runsToCancel), len(newRuns), + ) - // if necessary send a list of runs to cancel + // Send any runs that should be cancelled. if len(runsToCancel) > 0 { - err = stream.Send(&executorapi.LeaseStreamMessage{ + if err := stream.Send(&executorapi.LeaseStreamMessage{ Event: &executorapi.LeaseStreamMessage_CancelRuns{ CancelRuns: &executorapi.CancelRuns{ JobRunIdsToCancel: util.Map(runsToCancel, func(x uuid.UUID) *armadaevents.Uuid { @@ -121,25 +127,22 @@ func (srv *ExecutorApi) LeaseJobRuns(stream executorapi.ExecutorApi_LeaseJobRuns }), }, }, - }) - - if err != nil { + }); err != nil { return errors.WithStack(err) } } - // Now send any leases + // Send any scheduled jobs the executor doesn't already have. 
decompressor := compress.NewZlibDecompressor() - for _, lease := range leases { + for _, lease := range newRuns { submitMsg := &armadaevents.SubmitJob{} - err = decompressAndMarshall(lease.SubmitMessage, decompressor, submitMsg) - if err != nil { + if err := unmarshalFromCompressedBytes(lease.SubmitMessage, decompressor, submitMsg); err != nil { return err } if srv.priorityClassNameOverride != nil { srv.setPriorityClassName(submitMsg, *srv.priorityClassNameOverride) } - srv.addNodeSelector(submitMsg, lease.Node) + srv.addNodeIdSelector(submitMsg, lease.Node) var groups []string if len(lease.Groups) > 0 { @@ -148,7 +151,7 @@ func (srv *ExecutorApi) LeaseJobRuns(stream executorapi.ExecutorApi_LeaseJobRuns return err } } - err = stream.Send(&executorapi.LeaseStreamMessage{ + err := stream.Send(&executorapi.LeaseStreamMessage{ Event: &executorapi.LeaseStreamMessage_Lease{ Lease: &executorapi.JobRunLease{ JobRunId: armadaevents.ProtoUuidFromUuid(lease.RunID), @@ -189,11 +192,10 @@ func (srv *ExecutorApi) setPriorityClassName(job *armadaevents.SubmitJob, priori } } -func (srv *ExecutorApi) addNodeSelector(job *armadaevents.SubmitJob, nodeId string) { +func (srv *ExecutorApi) addNodeIdSelector(job *armadaevents.SubmitJob, nodeId string) { if job == nil || nodeId == "" { return } - if job.MainObject != nil { switch typed := job.MainObject.Object.(type) { case *armadaevents.KubernetesMainObject_PodSpec: @@ -207,9 +209,10 @@ func addNodeSelector(podSpec *armadaevents.PodSpecWithAvoidList, key string, val return } if podSpec.PodSpec.NodeSelector == nil { - podSpec.PodSpec.NodeSelector = make(map[string]string, 1) + podSpec.PodSpec.NodeSelector = map[string]string{key: value} + } else { + podSpec.PodSpec.NodeSelector[key] = value } - podSpec.PodSpec.NodeSelector[key] = value } func setPriorityClassName(podSpec *armadaevents.PodSpecWithAvoidList, priorityClassName string) { @@ -219,19 +222,19 @@ func setPriorityClassName(podSpec *armadaevents.PodSpecWithAvoidList, priorityCl podSpec.PodSpec.PriorityClassName = priorityClassName } -// ReportEvents publishes all events to pulsar. The events are compacted for more efficient publishing +// ReportEvents publishes all events to Pulsar. The events are compacted for more efficient publishing. func (srv *ExecutorApi) ReportEvents(ctx context.Context, list *executorapi.EventList) (*types.Empty, error) { - err := pulsarutils.CompactAndPublishSequences(ctx, list.Events, srv.producer, srv.maxPulsarMessageSize, schedulers.Pulsar) + err := pulsarutils.CompactAndPublishSequences(ctx, list.Events, srv.producer, srv.maxPulsarMessageSizeBytes, schedulers.Pulsar) return &types.Empty{}, err } -// createExecutorState extracts a schedulerobjects.Executor from the requesrt -func (srv *ExecutorApi) createExecutorState(ctx context.Context, req *executorapi.LeaseRequest) *schedulerobjects.Executor { +// executorFromLeaseRequest extracts a schedulerobjects.Executor from the request. 
+func (srv *ExecutorApi) executorFromLeaseRequest(ctx context.Context, req *executorapi.LeaseRequest) *schedulerobjects.Executor { log := ctxlogrus.Extract(ctx) nodes := make([]*schedulerobjects.Node, 0, len(req.Nodes)) + now := srv.clock.Now().UTC() for _, nodeInfo := range req.Nodes { - node, err := api.NewNodeFromNodeInfo(nodeInfo, req.ExecutorId, srv.allowedPriorities, srv.clock.Now().UTC()) - if err != nil { + if node, err := api.NewNodeFromNodeInfo(nodeInfo, req.ExecutorId, srv.allowedPriorities, now); err != nil { logging.WithStacktrace(log, err).Warnf( "skipping node %s from executor %s", nodeInfo.GetName(), req.GetExecutorId(), ) @@ -244,37 +247,35 @@ func (srv *ExecutorApi) createExecutorState(ctx context.Context, req *executorap Pool: req.Pool, Nodes: nodes, MinimumJobSize: schedulerobjects.ResourceList{Resources: req.MinimumJobSize}, - LastUpdateTime: srv.clock.Now().UTC(), - UnassignedJobRuns: util.Map(req.UnassignedJobRunIds, func(x armadaevents.Uuid) string { - return strings.ToLower(armadaevents.UuidFromProtoUuid(&x).String()) + LastUpdateTime: now, + UnassignedJobRuns: util.Map(req.UnassignedJobRunIds, func(jobId armadaevents.Uuid) string { + return strings.ToLower(armadaevents.UuidFromProtoUuid(&jobId).String()) }), } } -// extractRunIds extracts all the job runs contained in the executor request -func extractRunIds(req *executorapi.LeaseRequest) ([]uuid.UUID, error) { - runIds := make([]uuid.UUID, 0) - // add all runids from nodes +// runIdsFromLeaseRequest returns the ids of all runs in a lease request, including any not yet assigned to a node. +func runIdsFromLeaseRequest(req *executorapi.LeaseRequest) ([]uuid.UUID, error) { + runIds := make([]uuid.UUID, 0, 256) for _, node := range req.Nodes { for runIdStr := range node.RunIdsByState { - runId, err := uuid.Parse(runIdStr) - if err != nil { + if runId, err := uuid.Parse(runIdStr); err != nil { return nil, errors.WithStack(err) + } else { + runIds = append(runIds, runId) } - runIds = append(runIds, runId) } } - // add all unassigned runids for _, runId := range req.UnassignedJobRunIds { runIds = append(runIds, armadaevents.UuidFromProtoUuid(&runId)) } return runIds, nil } -func decompressAndMarshall(b []byte, decompressor compress.Decompressor, msg proto.Message) error { - decompressed, err := decompressor.Decompress(b) +func unmarshalFromCompressedBytes(bytes []byte, decompressor compress.Decompressor, msg proto.Message) error { + decompressedBytes, err := decompressor.Decompress(bytes) if err != nil { return err } - return proto.Unmarshal(decompressed, msg) + return proto.Unmarshal(decompressedBytes, msg) } diff --git a/internal/scheduler/api_test.go b/internal/scheduler/api_test.go index e0e30371755..5587c8cfb96 100644 --- a/internal/scheduler/api_test.go +++ b/internal/scheduler/api_test.go @@ -171,7 +171,7 @@ func TestExecutorApi_LeaseJobRuns(t *testing.T) { mockLegacyExecutorRepository := schedulermocks.NewMockExecutorRepository(ctrl) mockStream := schedulermocks.NewMockExecutorApi_LeaseJobRunsServer(ctrl) - runIds, err := extractRunIds(tc.request) + runIds, err := runIdsFromLeaseRequest(tc.request) require.NoError(t, err) // set up mocks @@ -204,6 +204,7 @@ func TestExecutorApi_LeaseJobRuns(t *testing.T) { maxJobsPerCall, "kubernetes.io/hostname", nil, + 4*1024*1024, ) require.NoError(t, err) server.clock = testClock @@ -331,6 +332,7 @@ func TestExecutorApi_Publish(t *testing.T) { 100, "kubernetes.io/hostname", nil, + 4*1024*1024, ) require.NoError(t, err) diff --git a/internal/scheduler/common.go 
b/internal/scheduler/common.go index e569412f866..5c27148b871 100644 --- a/internal/scheduler/common.go +++ b/internal/scheduler/common.go @@ -2,9 +2,7 @@ package scheduler import ( "fmt" - "math" "strconv" - "time" "github.com/pkg/errors" "golang.org/x/exp/maps" @@ -13,7 +11,6 @@ import ( armadamaps "github.com/armadaproject/armada/internal/common/maps" armadaslices "github.com/armadaproject/armada/internal/common/slices" schedulerconfig "github.com/armadaproject/armada/internal/scheduler/configuration" - schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" "github.com/armadaproject/armada/internal/scheduler/interfaces" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" ) @@ -84,11 +81,7 @@ func JobsSummary(jobs []interfaces.LegacySchedulerJob) string { func(jobs []interfaces.LegacySchedulerJob) schedulerobjects.ResourceList { rv := schedulerobjects.NewResourceListWithDefaultSize() for _, job := range jobs { - req := PodRequirementFromLegacySchedulerJob(job, nil) - if req == nil { - continue - } - rv.AddV1ResourceList(req.ResourceRequirements.Requests) + rv.AddV1ResourceList(job.GetResourceRequirements().Requests) } return rv }, @@ -116,94 +109,18 @@ func JobsSummary(jobs []interfaces.LegacySchedulerJob) string { ) } -type AddOrSubtract int - -const ( - Add AddOrSubtract = iota - Subtract -) - -func UpdateUsage[S ~[]E, E interfaces.LegacySchedulerJob]( - usage map[string]schedulerobjects.QuantityByPriorityAndResourceType, - jobs S, - priorityClasses map[string]configuration.PriorityClass, - addOrSubtract AddOrSubtract, -) map[string]schedulerobjects.QuantityByPriorityAndResourceType { - if usage == nil { - usage = make(map[string]schedulerobjects.QuantityByPriorityAndResourceType) - } - for _, job := range jobs { - req := PodRequirementFromLegacySchedulerJob(job, priorityClasses) - if req == nil { - continue - } - requests := schedulerobjects.ResourceListFromV1ResourceList(req.ResourceRequirements.Requests) - queue := job.GetQueue() - m := usage[queue] - if m == nil { - m = make(schedulerobjects.QuantityByPriorityAndResourceType) - } - switch addOrSubtract { - case Add: - m.Add(schedulerobjects.QuantityByPriorityAndResourceType{req.Priority: requests}) - case Subtract: - m.Sub(schedulerobjects.QuantityByPriorityAndResourceType{req.Priority: requests}) - default: - panic(fmt.Sprintf("invalid operation %d", addOrSubtract)) - } - usage[queue] = m - } - return usage -} - -func jobSchedulingContextsFromJobs[T interfaces.LegacySchedulerJob](jobs []T, executorId string, priorityClasses map[string]configuration.PriorityClass) []*schedulercontext.JobSchedulingContext { - if jobs == nil { - return nil - } - if len(jobs) == 0 { - return make([]*schedulercontext.JobSchedulingContext, 0) - } - jctxs := make([]*schedulercontext.JobSchedulingContext, len(jobs)) - timestamp := time.Now() - for i, job := range jobs { - jctxs[i] = &schedulercontext.JobSchedulingContext{ - Created: timestamp, - ExecutorId: executorId, - JobId: job.GetId(), - Job: job, - Req: PodRequirementFromLegacySchedulerJob(job, priorityClasses), - } - } - return jctxs -} - func isEvictedJob(job interfaces.LegacySchedulerJob) bool { return job.GetAnnotations()[schedulerconfig.IsEvictedAnnotation] == "true" } -func targetNodeIdFromLegacySchedulerJob(job interfaces.LegacySchedulerJob) (string, bool) { - req := PodRequirementFromLegacySchedulerJob(job, nil) - if req == nil { - return "", false - } - nodeId, ok := req.NodeSelector[schedulerconfig.NodeIdLabel] +func 
targetNodeIdFromNodeSelector(nodeSelector map[string]string) (string, bool) { + nodeId, ok := nodeSelector[schedulerconfig.NodeIdLabel] return nodeId, ok } // GangIdAndCardinalityFromLegacySchedulerJob returns a tuple (gangId, gangCardinality, isGangJob, error). -func GangIdAndCardinalityFromLegacySchedulerJob(job interfaces.LegacySchedulerJob, priorityClasses map[string]configuration.PriorityClass) (string, int, bool, error) { - reqs := job.GetRequirements(priorityClasses) - if reqs == nil { - return "", 0, false, nil - } - if len(reqs.ObjectRequirements) != 1 { - return "", 0, false, errors.Errorf("expected exactly one object requirement in %v", reqs) - } - podReqs := reqs.ObjectRequirements[0].GetPodRequirements() - if podReqs == nil { - return "", 0, false, nil - } - return GangIdAndCardinalityFromAnnotations(podReqs.Annotations) +func GangIdAndCardinalityFromLegacySchedulerJob(job interfaces.LegacySchedulerJob) (string, int, bool, error) { + return GangIdAndCardinalityFromAnnotations(job.GetAnnotations()) } // GangIdAndCardinalityFromAnnotations returns a tuple (gangId, gangCardinality, isGangJob, error). @@ -228,59 +145,3 @@ func GangIdAndCardinalityFromAnnotations(annotations map[string]string) (string, } return gangId, gangCardinality, true, nil } - -// ResourceListAsWeightedMillis returns the linear combination of the milli values in rl with given weights. -// This function overflows for values that exceed MaxInt64. E.g., 1Pi is fine but not 10Pi. -func ResourceListAsWeightedMillis(weights map[string]float64, rl schedulerobjects.ResourceList) int64 { - var rv int64 - for t, f := range weights { - q := rl.Get(t) - rv += int64(math.Round(float64(q.MilliValue()) * f)) - } - return rv -} - -func PodRequirementsFromLegacySchedulerJobs[S ~[]E, E interfaces.LegacySchedulerJob](jobs S, priorityClasses map[string]configuration.PriorityClass) []*schedulerobjects.PodRequirements { - rv := make([]*schedulerobjects.PodRequirements, len(jobs)) - for i, job := range jobs { - rv[i] = PodRequirementFromLegacySchedulerJob(job, priorityClasses) - } - return rv -} - -func PodRequirementFromLegacySchedulerJob[E interfaces.LegacySchedulerJob](job E, priorityClasses map[string]configuration.PriorityClass) *schedulerobjects.PodRequirements { - annotations := make(map[string]string, len(configuration.ArmadaManagedAnnotations)+len(schedulerconfig.ArmadaSchedulerManagedAnnotations)) - for _, key := range configuration.ArmadaManagedAnnotations { - if value, ok := job.GetAnnotations()[key]; ok { - annotations[key] = value - } - } - for _, key := range schedulerconfig.ArmadaSchedulerManagedAnnotations { - if value, ok := job.GetAnnotations()[key]; ok { - annotations[key] = value - } - } - annotations[schedulerconfig.JobIdAnnotation] = job.GetId() - annotations[schedulerconfig.QueueAnnotation] = job.GetQueue() - info := job.GetRequirements(priorityClasses) - req := PodRequirementFromJobSchedulingInfo(info) - req.Annotations = annotations - return req -} - -func PodRequirementsFromJobSchedulingInfos(infos []*schedulerobjects.JobSchedulingInfo) []*schedulerobjects.PodRequirements { - rv := make([]*schedulerobjects.PodRequirements, 0, len(infos)) - for _, info := range infos { - rv = append(rv, PodRequirementFromJobSchedulingInfo(info)) - } - return rv -} - -func PodRequirementFromJobSchedulingInfo(info *schedulerobjects.JobSchedulingInfo) *schedulerobjects.PodRequirements { - for _, oreq := range info.ObjectRequirements { - if preq := oreq.GetPodRequirements(); preq != nil { - return preq - } - } - return nil -} 
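The ResourceListAsWeightedMillis helper removed above now has an equivalent method on schedulerobjects.ResourceList (the test changes below call rl.AsWeightedMillis directly). A self-contained sketch of that computation, with a plain map standing in for schedulerobjects.ResourceList (an assumption made purely for illustration, not part of the patch):

package main

import (
	"fmt"
	"math"

	"k8s.io/apimachinery/pkg/api/resource"
)

// asWeightedMillis returns the linear combination of the milli values of rl with the given
// weights, mirroring the removed ResourceListAsWeightedMillis. Like the original, it overflows
// for quantities whose weighted milli value exceeds MaxInt64 (1Pi is fine, 10Pi is not).
func asWeightedMillis(weights map[string]float64, rl map[string]resource.Quantity) int64 {
	var rv int64
	for t, w := range weights {
		q := rl[t] // zero Quantity if the resource is absent
		rv += int64(math.Round(float64(q.MilliValue()) * w))
	}
	return rv
}

func main() {
	weights := map[string]float64{"cpu": 1.0}
	rl := map[string]resource.Quantity{
		"cpu":    resource.MustParse("2"),
		"memory": resource.MustParse("4Gi"),
	}
	// 2 CPU = 2000 millis at weight 1.0; memory carries no weight and is ignored.
	fmt.Println(asWeightedMillis(weights, rl)) // 2000
}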
diff --git a/internal/scheduler/common_test.go b/internal/scheduler/common_test.go index e1a87d287c1..73aae7be637 100644 --- a/internal/scheduler/common_test.go +++ b/internal/scheduler/common_test.go @@ -10,12 +10,11 @@ import ( "github.com/armadaproject/armada/internal/armada/configuration" "github.com/armadaproject/armada/internal/common/util" - schedulerconfig "github.com/armadaproject/armada/internal/scheduler/configuration" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" "github.com/armadaproject/armada/pkg/api" ) -func TestPodRequirementFromLegacySchedulerJob(t *testing.T) { +func TestGetPodRequirements(t *testing.T) { resourceLimit := v1.ResourceList{ "cpu": resource.MustParse("1"), "memory": resource.MustParse("128Mi"), @@ -64,13 +63,12 @@ func TestPodRequirementFromLegacySchedulerJob(t *testing.T) { PreemptionPolicy: string(v1.PreemptLowerPriority), ResourceRequirements: requirements, Annotations: map[string]string{ + "something": "test", configuration.GangIdAnnotation: "gang-id", configuration.GangCardinalityAnnotation: "1", - schedulerconfig.JobIdAnnotation: j.Id, - schedulerconfig.QueueAnnotation: j.Queue, }, } - actual := PodRequirementFromLegacySchedulerJob(j, map[string]configuration.PriorityClass{"armada-default": {Priority: int32(1)}}) + actual := j.GetPodRequirements(map[string]configuration.PriorityClass{"armada-default": {Priority: int32(1)}}) assert.Equal(t, expected, actual) } @@ -134,7 +132,7 @@ func TestResourceListAsWeightedMillis(t *testing.T) { } for name, tc := range tests { t.Run(name, func(t *testing.T) { - assert.Equal(t, tc.expected, ResourceListAsWeightedMillis(tc.weights, tc.rl)) + assert.Equal(t, tc.expected, tc.rl.AsWeightedMillis(tc.weights)) }) } } @@ -151,6 +149,6 @@ func BenchmarkResourceListAsWeightedMillis(b *testing.B) { } b.ResetTimer() for n := 0; n < b.N; n++ { - ResourceListAsWeightedMillis(weights, rl) + rl.AsWeightedMillis(weights) } } diff --git a/internal/scheduler/configuration/configuration.go b/internal/scheduler/configuration/configuration.go index 2a6227501d5..d0faf931e4c 100644 --- a/internal/scheduler/configuration/configuration.go +++ b/internal/scheduler/configuration/configuration.go @@ -10,23 +10,14 @@ import ( ) const ( - // IsEvictedAnnotation, indicates a pod was evicted in this round and is currently running. - // Used by the scheduler to differentiate between pods from running and queued jobs. + // IsEvictedAnnotation is set on evicted jobs; the scheduler uses it to differentiate between + // already-running and queued jobs. IsEvictedAnnotation = "armadaproject.io/isEvicted" - // JobIdAnnotation if set on a pod, indicates which job this pod is part of. - JobIdAnnotation = "armadaproject.io/jobId" - // QueueAnnotation if set on a pod, indicates which queue this pod is part of. - QueueAnnotation = "armadaproject.io/queue" - // IdNodeLabel is automatically added to nodes in the NodeDb. + // NodeIdLabel maps to a unique id associated with each node. + // This label is automatically added to nodes within the NodeDb. 
NodeIdLabel = "armadaproject.io/nodeId" ) -var ArmadaSchedulerManagedAnnotations = []string{ - IsEvictedAnnotation, - JobIdAnnotation, - QueueAnnotation, -} - type Configuration struct { // Database configuration Postgres configuration.PostgresConfig @@ -42,6 +33,7 @@ type Configuration struct { Scheduling configuration.SchedulingConfig Auth authconfig.AuthConfig Grpc grpcconfig.GrpcConfig + Http HttpConfig // Maximum number of strings that should be cached at any one time InternedStringsCacheSize uint32 `validate:"required"` // How often the scheduling cycle should run @@ -77,3 +69,7 @@ type LeaderConfig struct { // RetryPeriod is the duration the LeaderElector clients should waite between tries of actions. RetryPeriod time.Duration } + +type HttpConfig struct { + Port int `validate:"required"` +} diff --git a/internal/scheduler/constraints/constraints.go b/internal/scheduler/constraints/constraints.go index 2325737962d..477b121b2e4 100644 --- a/internal/scheduler/constraints/constraints.go +++ b/internal/scheduler/constraints/constraints.go @@ -53,11 +53,8 @@ type SchedulingConstraints struct { type PriorityClassSchedulingConstraints struct { PriorityClassName string PriorityClassPriority int32 - // Prevents jobs of this priority class from being scheduled if doing so would exceed - // cumulative resource usage at priority priorityClassPriority for the queue the job originates from. - // - // Cumulative resource usage at priority x includes resources allocated to jobs of priorityClassPriority x or lower. - MaximumCumulativeResourcesPerQueue schedulerobjects.ResourceList + // Limits total resources allocated to jobs of this priority class per queue. + MaximumResourcesPerQueue schedulerobjects.ResourceList } func SchedulingConstraintsFromSchedulingConfig( @@ -74,9 +71,9 @@ func SchedulingConstraintsFromSchedulingConfig( maximumResourceFractionPerQueue = m } priorityClassSchedulingConstraintsByPriorityClassName[name] = PriorityClassSchedulingConstraints{ - PriorityClassName: name, - PriorityClassPriority: priorityClass.Priority, - MaximumCumulativeResourcesPerQueue: absoluteFromRelativeLimits(totalResources, maximumResourceFractionPerQueue), + PriorityClassName: name, + PriorityClassPriority: priorityClass.Priority, + MaximumResourcesPerQueue: absoluteFromRelativeLimits(totalResources, maximumResourceFractionPerQueue), } } maximumResourceFractionToSchedule := config.MaximumResourceFractionToSchedule @@ -114,7 +111,7 @@ func (constraints *SchedulingConstraints) CheckRoundConstraints(sctx *schedulerc } // MaximumResourcesToSchedule check. - if exceedsResourceLimits(sctx.ScheduledResources, constraints.MaximumResourcesToSchedule) { + if !sctx.ScheduledResources.IsStrictlyLessOrEqual(constraints.MaximumResourcesToSchedule) { return false, UnschedulableReasonMaximumResourcesScheduled, nil } return true, "", nil @@ -132,26 +129,13 @@ func (constraints *SchedulingConstraints) CheckPerQueueAndPriorityClassConstrain // PriorityClassSchedulingConstraintsByPriorityClassName check. if priorityClassConstraint, ok := constraints.PriorityClassSchedulingConstraintsByPriorityClassName[priorityClassName]; ok { - allocatedByPriorityAndResourceType := schedulerobjects.NewAllocatedByPriorityAndResourceType([]int32{priorityClassConstraint.PriorityClassPriority}) - for p, rl := range qctx.AllocatedByPriority { - allocatedByPriorityAndResourceType.MarkAllocated(p, rl) - } - if exceedsResourceLimits( - // TODO: Avoid allocation. 
- schedulerobjects.QuantityByPriorityAndResourceType(allocatedByPriorityAndResourceType).AggregateByResource(), - priorityClassConstraint.MaximumCumulativeResourcesPerQueue, - ) { + if !qctx.AllocatedByPriorityClass[priorityClassName].IsStrictlyLessOrEqual(priorityClassConstraint.MaximumResourcesPerQueue) { return false, UnschedulableReasonMaximumResourcesPerQueueExceeded, nil } } return true, "", nil } -// exceedsResourceLimits returns true if used/total > limits for some resource. -func exceedsResourceLimits(used, limits schedulerobjects.ResourceList) bool { - return !used.IsStrictlyLessOrEqual(limits) -} - // ScaleQuantity scales q in-place by a factor f. // This functions overflows for quantities the milli value of which can't be expressed as an int64. // E.g., 1Pi is ok, but not 10Pi. diff --git a/internal/scheduler/context/context.go b/internal/scheduler/context/context.go index 40902b4eb00..3fd61bfd1c9 100644 --- a/internal/scheduler/context/context.go +++ b/internal/scheduler/context/context.go @@ -10,7 +10,7 @@ import ( "github.com/pkg/errors" "golang.org/x/exp/maps" "golang.org/x/exp/slices" - v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" "github.com/armadaproject/armada/internal/armada/configuration" "github.com/armadaproject/armada/internal/common/armadaerrors" @@ -35,18 +35,26 @@ type SchedulingContext struct { PriorityClasses map[string]configuration.PriorityClass // Default priority class. DefaultPriorityClass string - // Weights used when computing total resource usage. + // Determines how fairness is computed. + FairnessModel configuration.FairnessModel + // Resources considered when computing DominantResourceFairness. + DominantResourceFairnessResourcesToConsider []string + // Weights used when computing AssetFairness. ResourceScarcity map[string]float64 + // Sum of queue weights across all queues. + WeightSum float64 // Per-queue scheduling contexts. QueueSchedulingContexts map[string]*QueueSchedulingContext // Total resources across all clusters available at the start of the scheduling cycle. TotalResources schedulerobjects.ResourceList + // = TotalResources.AsWeightedMillis(ResourceScarcity). + TotalResourcesAsWeightedMillis int64 // Resources assigned across all queues during this scheduling cycle. - ScheduledResources schedulerobjects.ResourceList - ScheduledResourcesByPriority schedulerobjects.QuantityByPriorityAndResourceType + ScheduledResources schedulerobjects.ResourceList + ScheduledResourcesByPriorityClass schedulerobjects.QuantityByTAndResourceType[string] // Resources evicted across all queues during this scheduling cycle. - EvictedResources schedulerobjects.ResourceList - EvictedResourcesByPriority schedulerobjects.QuantityByPriorityAndResourceType + EvictedResources schedulerobjects.ResourceList + EvictedResourcesByPriorityClass schedulerobjects.QuantityByTAndResourceType[string] // Total number of successfully scheduled jobs. NumScheduledJobs int // Total number of successfully scheduled gangs. 
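The new FairnessModel field selects between the two queue cost functions added later in this diff: asset fairness (the weighted milli-value of the queue's allocation, weights given by ResourceScarcity) and dominant resource fairness (the largest allocated fraction of total capacity over the configured resources), each divided by the queue weight. A self-contained sketch of both, with plain maps standing in for schedulerobjects.ResourceList (an assumption for illustration, not part of the patch):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// assetFairnessCost mirrors assetFairnessCostWithAllocation: the weighted milli-value of the
// queue's allocation (weights given by resource scarcity), divided by the queue weight.
func assetFairnessCost(allocated map[string]resource.Quantity, scarcity map[string]float64, weight float64) float64 {
	var millis float64
	for t, f := range scarcity {
		q := allocated[t]
		millis += float64(q.MilliValue()) * f
	}
	return millis / weight
}

// dominantResourceFairnessCost mirrors dominantResourceFairnessCostWithAllocation: the largest
// fraction of total capacity allocated to the queue, taken over the configured resources and
// divided by the queue weight; resources with zero capacity are ignored.
func dominantResourceFairnessCost(allocated, capacity map[string]resource.Quantity, resources []string, weight float64) float64 {
	var cost float64
	for _, t := range resources {
		c := capacity[t]
		if c.IsZero() {
			continue
		}
		q := allocated[t]
		if tcost := float64(q.MilliValue()) / float64(c.MilliValue()); tcost > cost {
			cost = tcost
		}
	}
	return cost / weight
}

func main() {
	capacity := map[string]resource.Quantity{"cpu": resource.MustParse("100"), "memory": resource.MustParse("1Ti")}
	allocated := map[string]resource.Quantity{"cpu": resource.MustParse("10"), "memory": resource.MustParse("256Gi")}
	fmt.Println(assetFairnessCost(allocated, map[string]float64{"cpu": 1.0}, 1.0))                 // 10000
	fmt.Println(dominantResourceFairnessCost(allocated, capacity, []string{"cpu", "memory"}, 1.0)) // 0.25 (memory dominates)
}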
@@ -73,22 +81,29 @@ func NewSchedulingContext( totalResources schedulerobjects.ResourceList, ) *SchedulingContext { return &SchedulingContext{ - Started: time.Now(), - ExecutorId: executorId, - Pool: pool, - PriorityClasses: priorityClasses, - DefaultPriorityClass: defaultPriorityClass, - ResourceScarcity: resourceScarcity, - QueueSchedulingContexts: make(map[string]*QueueSchedulingContext), - TotalResources: totalResources.DeepCopy(), - ScheduledResources: schedulerobjects.NewResourceListWithDefaultSize(), - ScheduledResourcesByPriority: make(schedulerobjects.QuantityByPriorityAndResourceType), - EvictedResourcesByPriority: make(schedulerobjects.QuantityByPriorityAndResourceType), - SchedulingKeyGenerator: schedulerobjects.NewSchedulingKeyGenerator(), - UnfeasibleSchedulingKeys: make(map[schedulerobjects.SchedulingKey]*JobSchedulingContext), + Started: time.Now(), + ExecutorId: executorId, + Pool: pool, + PriorityClasses: priorityClasses, + DefaultPriorityClass: defaultPriorityClass, + FairnessModel: configuration.AssetFairness, + ResourceScarcity: resourceScarcity, + QueueSchedulingContexts: make(map[string]*QueueSchedulingContext), + TotalResources: totalResources.DeepCopy(), + TotalResourcesAsWeightedMillis: totalResources.AsWeightedMillis(resourceScarcity), + ScheduledResources: schedulerobjects.NewResourceListWithDefaultSize(), + ScheduledResourcesByPriorityClass: make(schedulerobjects.QuantityByTAndResourceType[string]), + EvictedResourcesByPriorityClass: make(schedulerobjects.QuantityByTAndResourceType[string]), + SchedulingKeyGenerator: schedulerobjects.NewSchedulingKeyGenerator(), + UnfeasibleSchedulingKeys: make(map[schedulerobjects.SchedulingKey]*JobSchedulingContext), } } +func (sctx *SchedulingContext) EnableDominantResourceFairness(dominantResourceFairnessResourcesToConsider []string) { + sctx.FairnessModel = configuration.DominantResourceFairness + sctx.DominantResourceFairnessResourcesToConsider = dominantResourceFairnessResourcesToConsider +} + func (sctx *SchedulingContext) SchedulingKeyFromLegacySchedulerJob(job interfaces.LegacySchedulerJob) schedulerobjects.SchedulingKey { var priority int32 if priorityClass, ok := sctx.PriorityClasses[job.GetPriorityClassName()]; ok { @@ -107,7 +122,7 @@ func (sctx *SchedulingContext) ClearUnfeasibleSchedulingKeys() { sctx.UnfeasibleSchedulingKeys = make(map[schedulerobjects.SchedulingKey]*JobSchedulingContext) } -func (sctx *SchedulingContext) AddQueueSchedulingContext(queue string, priorityFactor float64, initialAllocatedByPriority schedulerobjects.QuantityByPriorityAndResourceType) error { +func (sctx *SchedulingContext) AddQueueSchedulingContext(queue string, weight float64, initialAllocatedByPriorityClass schedulerobjects.QuantityByTAndResourceType[string]) error { if _, ok := sctx.QueueSchedulingContexts[queue]; ok { return errors.WithStack(&armadaerrors.ErrInvalidArgument{ Name: "queue", @@ -115,21 +130,26 @@ func (sctx *SchedulingContext) AddQueueSchedulingContext(queue string, priorityF Message: fmt.Sprintf("there already exists a context for queue %s", queue), }) } - if initialAllocatedByPriority == nil { - initialAllocatedByPriority = make(schedulerobjects.QuantityByPriorityAndResourceType) + if initialAllocatedByPriorityClass == nil { + initialAllocatedByPriorityClass = make(schedulerobjects.QuantityByTAndResourceType[string]) } else { - initialAllocatedByPriority = initialAllocatedByPriority.DeepCopy() + initialAllocatedByPriorityClass = initialAllocatedByPriorityClass.DeepCopy() + } + allocated := 
schedulerobjects.NewResourceListWithDefaultSize() + for _, rl := range initialAllocatedByPriorityClass { + allocated.Add(rl) } + sctx.WeightSum += weight qctx := &QueueSchedulingContext{ SchedulingContext: sctx, Created: time.Now(), ExecutorId: sctx.ExecutorId, Queue: queue, - PriorityFactor: priorityFactor, - Allocated: initialAllocatedByPriority.AggregateByResource(), - AllocatedByPriority: initialAllocatedByPriority, - ScheduledResourcesByPriority: make(schedulerobjects.QuantityByPriorityAndResourceType), - EvictedResourcesByPriority: make(schedulerobjects.QuantityByPriorityAndResourceType), + Weight: weight, + Allocated: allocated, + AllocatedByPriorityClass: initialAllocatedByPriorityClass, + ScheduledResourcesByPriorityClass: make(schedulerobjects.QuantityByTAndResourceType[string]), + EvictedResourcesByPriorityClass: make(schedulerobjects.QuantityByTAndResourceType[string]), SuccessfulJobSchedulingContexts: make(map[string]*JobSchedulingContext), UnsuccessfulJobSchedulingContexts: make(map[string]*JobSchedulingContext), EvictedJobsById: make(map[string]bool), @@ -142,6 +162,15 @@ func (sctx *SchedulingContext) String() string { return sctx.ReportString(0) } +// TotalCost returns the sum of the costs across all queues. +func (sctx *SchedulingContext) TotalCost() float64 { + var rv float64 + for _, qctx := range sctx.QueueSchedulingContexts { + rv += qctx.TotalCostForQueue() + } + return rv +} + func (sctx *SchedulingContext) ReportString(verbosity int32) string { var sb strings.Builder w := tabwriter.NewWriter(&sb, 1, 1, 1, ' ', 0) @@ -219,12 +248,12 @@ func (sctx *SchedulingContext) AddJobSchedulingContext(jctx *JobSchedulingContex } if jctx.IsSuccessful() { if evictedInThisRound { - sctx.EvictedResources.SubV1ResourceList(jctx.Req.ResourceRequirements.Requests) - sctx.EvictedResourcesByPriority.SubV1ResourceList(jctx.Req.Priority, jctx.Req.ResourceRequirements.Requests) + sctx.EvictedResources.SubV1ResourceList(jctx.PodRequirements.ResourceRequirements.Requests) + sctx.EvictedResourcesByPriorityClass.SubV1ResourceList(jctx.Job.GetPriorityClassName(), jctx.PodRequirements.ResourceRequirements.Requests) sctx.NumEvictedJobs-- } else { - sctx.ScheduledResources.AddV1ResourceList(jctx.Req.ResourceRequirements.Requests) - sctx.ScheduledResourcesByPriority.AddV1ResourceList(jctx.Req.Priority, jctx.Req.ResourceRequirements.Requests) + sctx.ScheduledResources.AddV1ResourceList(jctx.PodRequirements.ResourceRequirements.Requests) + sctx.ScheduledResourcesByPriorityClass.AddV1ResourceList(jctx.Job.GetPriorityClassName(), jctx.PodRequirements.ResourceRequirements.Requests) sctx.NumScheduledJobs++ } } @@ -255,14 +284,14 @@ func (sctx *SchedulingContext) EvictJob(job interfaces.LegacySchedulerJob) (bool if err != nil { return false, err } - priority, rl := priorityAndRequestsFromLegacySchedulerJob(job, sctx.PriorityClasses) + rl := job.GetResourceRequirements().Requests if scheduledInThisRound { sctx.ScheduledResources.SubV1ResourceList(rl) - sctx.ScheduledResourcesByPriority.SubV1ResourceList(priority, rl) + sctx.ScheduledResourcesByPriorityClass.SubV1ResourceList(job.GetPriorityClassName(), rl) sctx.NumScheduledJobs-- } else { sctx.EvictedResources.AddV1ResourceList(rl) - sctx.EvictedResourcesByPriority.AddV1ResourceList(priority, rl) + sctx.EvictedResourcesByPriorityClass.AddV1ResourceList(job.GetPriorityClassName(), rl) sctx.NumEvictedJobs++ } return scheduledInThisRound, nil @@ -286,14 +315,14 @@ func (sctx *SchedulingContext) SuccessfulJobSchedulingContexts() []*JobSchedulin } // 
AllocatedByQueueAndPriority returns map from queue name and priority to resources allocated. -func (sctx *SchedulingContext) AllocatedByQueueAndPriority() map[string]schedulerobjects.QuantityByPriorityAndResourceType { +func (sctx *SchedulingContext) AllocatedByQueueAndPriority() map[string]schedulerobjects.QuantityByTAndResourceType[string] { rv := make( - map[string]schedulerobjects.QuantityByPriorityAndResourceType, + map[string]schedulerobjects.QuantityByTAndResourceType[string], len(sctx.QueueSchedulingContexts), ) for queue, qctx := range sctx.QueueSchedulingContexts { - if len(qctx.AllocatedByPriority) > 0 { - rv[queue] = qctx.AllocatedByPriority.DeepCopy() + if !qctx.AllocatedByPriorityClass.IsZero() { + rv[queue] = qctx.AllocatedByPriorityClass.DeepCopy() } } return rv @@ -310,18 +339,18 @@ type QueueSchedulingContext struct { ExecutorId string // Queue name. Queue string - // These factors influence the fraction of resources assigned to each queue. - PriorityFactor float64 + // Determines the fair share of this queue relative to other queues. + Weight float64 // Total resources assigned to the queue across all clusters by priority class priority. // Includes jobs scheduled during this invocation of the scheduler. Allocated schedulerobjects.ResourceList - // Total resources assigned to the queue across all clusters by priority class priority. + // Total resources assigned to the queue across all clusters by priority class. // Includes jobs scheduled during this invocation of the scheduler. - AllocatedByPriority schedulerobjects.QuantityByPriorityAndResourceType + AllocatedByPriorityClass schedulerobjects.QuantityByTAndResourceType[string] // Resources assigned to this queue during this scheduling cycle. - ScheduledResourcesByPriority schedulerobjects.QuantityByPriorityAndResourceType + ScheduledResourcesByPriorityClass schedulerobjects.QuantityByTAndResourceType[string] // Resources evicted from this queue during this scheduling cycle. - EvictedResourcesByPriority schedulerobjects.QuantityByPriorityAndResourceType + EvictedResourcesByPriorityClass schedulerobjects.QuantityByTAndResourceType[string] // Job scheduling contexts associated with successful scheduling attempts. SuccessfulJobSchedulingContexts map[string]*JobSchedulingContext // Job scheduling contexts associated with unsuccessful scheduling attempts. 
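Per-queue accounting above is now keyed by priority class name (QuantityByTAndResourceType[string]) rather than by numeric priority. A minimal sketch of that bookkeeping, with a nested map standing in for the real type and an add method mirroring how AddV1ResourceList is used elsewhere in this diff (the type and method here are illustrative assumptions, not part of the patch):

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// allocatedByPriorityClass maps priority class name -> resource -> allocated quantity,
// standing in for schedulerobjects.QuantityByTAndResourceType[string].
type allocatedByPriorityClass map[string]v1.ResourceList

// add accumulates a job's requests under its priority class, analogous to calling
// AllocatedByPriorityClass.AddV1ResourceList(job.GetPriorityClassName(), requests).
func (a allocatedByPriorityClass) add(priorityClass string, requests v1.ResourceList) {
	rl, ok := a[priorityClass]
	if !ok {
		rl = make(v1.ResourceList)
		a[priorityClass] = rl
	}
	for t, q := range requests {
		acc := rl[t]
		acc.Add(q)
		rl[t] = acc
	}
}

func main() {
	a := make(allocatedByPriorityClass)
	requests := v1.ResourceList{"cpu": resource.MustParse("1"), "memory": resource.MustParse("4Gi")}
	a.add("armada-default", requests) // schedule one 1-CPU job
	a.add("armada-default", requests) // and another
	cpu := a["armada-default"]["cpu"]
	fmt.Println(cpu.String()) // 2
}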
@@ -350,13 +379,13 @@ func (qctx *QueueSchedulingContext) ReportString(verbosity int32) string { fmt.Fprintf(w, "Time:\t%s\n", qctx.Created) fmt.Fprintf(w, "Queue:\t%s\n", qctx.Queue) } - fmt.Fprintf(w, "Scheduled resources:\t%s\n", qctx.ScheduledResourcesByPriority.AggregateByResource().CompactString()) - fmt.Fprintf(w, "Scheduled resources (by priority):\t%s\n", qctx.ScheduledResourcesByPriority.String()) - fmt.Fprintf(w, "Preempted resources:\t%s\n", qctx.EvictedResourcesByPriority.AggregateByResource().CompactString()) - fmt.Fprintf(w, "Preempted resources (by priority):\t%s\n", qctx.EvictedResourcesByPriority.String()) + fmt.Fprintf(w, "Scheduled resources:\t%s\n", qctx.ScheduledResourcesByPriorityClass.AggregateByResource().CompactString()) + fmt.Fprintf(w, "Scheduled resources (by priority):\t%s\n", qctx.ScheduledResourcesByPriorityClass.String()) + fmt.Fprintf(w, "Preempted resources:\t%s\n", qctx.EvictedResourcesByPriorityClass.AggregateByResource().CompactString()) + fmt.Fprintf(w, "Preempted resources (by priority):\t%s\n", qctx.EvictedResourcesByPriorityClass.String()) if verbosity >= 0 { - fmt.Fprintf(w, "Total allocated resources after scheduling:\t%s\n", qctx.AllocatedByPriority.AggregateByResource().CompactString()) - fmt.Fprintf(w, "Total allocated resources after scheduling (by priority):\t%s\n", qctx.AllocatedByPriority.String()) + fmt.Fprintf(w, "Total allocated resources after scheduling:\t%s\n", qctx.Allocated.CompactString()) + fmt.Fprintf(w, "Total allocated resources after scheduling by priority class:\t%s\n", qctx.AllocatedByPriorityClass) fmt.Fprintf(w, "Number of jobs scheduled:\t%d\n", len(qctx.SuccessfulJobSchedulingContexts)) fmt.Fprintf(w, "Number of jobs preempted:\t%d\n", len(qctx.EvictedJobsById)) fmt.Fprintf(w, "Number of jobs that could not be scheduled:\t%d\n", len(qctx.UnsuccessfulJobSchedulingContexts)) @@ -431,23 +460,23 @@ func (qctx *QueueSchedulingContext) AddJobSchedulingContext(jctx *JobSchedulingC } _, evictedInThisRound := qctx.EvictedJobsById[jctx.JobId] if jctx.IsSuccessful() { - if jctx.Req == nil { + if jctx.PodRequirements == nil { return false, errors.Errorf("failed adding job %s to queue: job requirements are missing", jctx.JobId) } // Always update ResourcesByPriority. // Since ResourcesByPriority is used to order queues by fraction of fair share. - qctx.Allocated.AddV1ResourceList(jctx.Req.ResourceRequirements.Requests) - qctx.AllocatedByPriority.AddV1ResourceList(jctx.Req.Priority, jctx.Req.ResourceRequirements.Requests) + qctx.Allocated.AddV1ResourceList(jctx.PodRequirements.ResourceRequirements.Requests) + qctx.AllocatedByPriorityClass.AddV1ResourceList(jctx.Job.GetPriorityClassName(), jctx.PodRequirements.ResourceRequirements.Requests) // Only if the job is not evicted, update ScheduledResourcesByPriority. // Since ScheduledResourcesByPriority is used to control per-round scheduling constraints. 
if evictedInThisRound { delete(qctx.EvictedJobsById, jctx.JobId) - qctx.EvictedResourcesByPriority.SubV1ResourceList(jctx.Req.Priority, jctx.Req.ResourceRequirements.Requests) + qctx.EvictedResourcesByPriorityClass.SubV1ResourceList(jctx.Job.GetPriorityClassName(), jctx.PodRequirements.ResourceRequirements.Requests) } else { qctx.SuccessfulJobSchedulingContexts[jctx.JobId] = jctx - qctx.ScheduledResourcesByPriority.AddV1ResourceList(jctx.Req.Priority, jctx.Req.ResourceRequirements.Requests) + qctx.ScheduledResourcesByPriorityClass.AddV1ResourceList(jctx.Job.GetPriorityClassName(), jctx.PodRequirements.ResourceRequirements.Requests) } } else { qctx.UnsuccessfulJobSchedulingContexts[jctx.JobId] = jctx @@ -457,38 +486,26 @@ func (qctx *QueueSchedulingContext) AddJobSchedulingContext(jctx *JobSchedulingC func (qctx *QueueSchedulingContext) EvictJob(job interfaces.LegacySchedulerJob) (bool, error) { jobId := job.GetId() - priority, rl := priorityAndRequestsFromLegacySchedulerJob(job, qctx.SchedulingContext.PriorityClasses) if _, ok := qctx.UnsuccessfulJobSchedulingContexts[jobId]; ok { return false, errors.Errorf("failed evicting job %s from queue: job already marked unsuccessful", jobId) } if _, ok := qctx.EvictedJobsById[jobId]; ok { return false, errors.Errorf("failed evicting job %s from queue: job already marked evicted", jobId) } + rl := job.GetResourceRequirements().Requests _, scheduledInThisRound := qctx.SuccessfulJobSchedulingContexts[jobId] if scheduledInThisRound { - qctx.ScheduledResourcesByPriority.SubV1ResourceList(priority, rl) + qctx.ScheduledResourcesByPriorityClass.SubV1ResourceList(job.GetPriorityClassName(), rl) delete(qctx.SuccessfulJobSchedulingContexts, jobId) } else { - qctx.EvictedResourcesByPriority.AddV1ResourceList(priority, rl) + qctx.EvictedResourcesByPriorityClass.AddV1ResourceList(job.GetPriorityClassName(), rl) qctx.EvictedJobsById[jobId] = true } qctx.Allocated.SubV1ResourceList(rl) - qctx.AllocatedByPriority.SubV1ResourceList(priority, rl) + qctx.AllocatedByPriorityClass.SubV1ResourceList(job.GetPriorityClassName(), rl) return scheduledInThisRound, nil } -func priorityAndRequestsFromLegacySchedulerJob(job interfaces.LegacySchedulerJob, priorityClasses map[string]configuration.PriorityClass) (int32, v1.ResourceList) { - req := job.GetRequirements(priorityClasses) - for _, r := range req.ObjectRequirements { - podReqs := r.GetPodRequirements() - if podReqs == nil { - continue - } - return podReqs.Priority, podReqs.ResourceRequirements.Requests - } - return 0, nil -} - // ClearJobSpecs zeroes out job specs to reduce memory usage. func (qctx *QueueSchedulingContext) ClearJobSpecs() { for _, jctx := range qctx.SuccessfulJobSchedulingContexts { @@ -499,6 +516,50 @@ func (qctx *QueueSchedulingContext) ClearJobSpecs() { } } +// TotalCostForQueue returns the total cost of this queue. +func (qctx *QueueSchedulingContext) TotalCostForQueue() float64 { + return qctx.TotalCostForQueueWithAllocation(qctx.Allocated) +} + +// TotalCostForQueueWithAllocation returns the total cost of this queue if its total allocation is given by allocated. 
+func (qctx *QueueSchedulingContext) TotalCostForQueueWithAllocation(allocated schedulerobjects.ResourceList) float64 { + switch qctx.SchedulingContext.FairnessModel { + case configuration.AssetFairness: + return qctx.assetFairnessCostWithAllocation(allocated) + case configuration.DominantResourceFairness: + return qctx.dominantResourceFairnessCostWithAllocation(allocated) + default: + panic(fmt.Sprintf("unknown fairness type: %s", qctx.SchedulingContext.FairnessModel)) + } +} + +func (qctx *QueueSchedulingContext) assetFairnessCostWithAllocation(allocated schedulerobjects.ResourceList) float64 { + if len(qctx.SchedulingContext.ResourceScarcity) == 0 { + panic("ResourceScarcity is not set") + } + return float64(allocated.AsWeightedMillis(qctx.SchedulingContext.ResourceScarcity)) / qctx.Weight +} + +func (qctx *QueueSchedulingContext) dominantResourceFairnessCostWithAllocation(allocated schedulerobjects.ResourceList) float64 { + if len(qctx.SchedulingContext.DominantResourceFairnessResourcesToConsider) == 0 { + panic("DominantResourceFairnessResourcesToConsider is not set") + } + var cost float64 + for _, t := range qctx.SchedulingContext.DominantResourceFairnessResourcesToConsider { + capacity := qctx.SchedulingContext.TotalResources.Get(t) + if capacity.Equal(resource.Quantity{}) { + // Ignore any resources with zero capacity. + continue + } + q := allocated.Get(t) + tcost := float64(q.MilliValue()) / float64(capacity.MilliValue()) + if tcost > cost { + cost = tcost + } + } + return cost / qctx.Weight +} + type GangSchedulingContext struct { Created time.Time Queue string @@ -506,6 +567,7 @@ type GangSchedulingContext struct { JobSchedulingContexts []*JobSchedulingContext TotalResourceRequests schedulerobjects.ResourceList AllJobsEvicted bool + NodeUniformityLabel string } func NewGangSchedulingContext(jctxs []*JobSchedulingContext) *GangSchedulingContext { @@ -513,15 +575,19 @@ func NewGangSchedulingContext(jctxs []*JobSchedulingContext) *GangSchedulingCont // (which we enforce at job submission). 
queue := "" priorityClassName := "" + nodeUniformityLabel := "" if len(jctxs) > 0 { queue = jctxs[0].Job.GetQueue() priorityClassName = jctxs[0].Job.GetPriorityClassName() + if jctxs[0].PodRequirements != nil { + nodeUniformityLabel = jctxs[0].PodRequirements.Annotations[configuration.GangNodeUniformityLabelAnnotation] + } } allJobsEvicted := true totalResourceRequests := schedulerobjects.NewResourceList(4) for _, jctx := range jctxs { allJobsEvicted = allJobsEvicted && isEvictedJob(jctx.Job) - totalResourceRequests.AddV1ResourceList(jctx.Req.ResourceRequirements.Requests) + totalResourceRequests.AddV1ResourceList(jctx.PodRequirements.ResourceRequirements.Requests) } return &GangSchedulingContext{ Created: time.Now(), @@ -530,17 +596,10 @@ func NewGangSchedulingContext(jctxs []*JobSchedulingContext) *GangSchedulingCont JobSchedulingContexts: jctxs, TotalResourceRequests: totalResourceRequests, AllJobsEvicted: allJobsEvicted, + NodeUniformityLabel: nodeUniformityLabel, } } -func (gctx GangSchedulingContext) PodRequirements() []*schedulerobjects.PodRequirements { - rv := make([]*schedulerobjects.PodRequirements, len(gctx.JobSchedulingContexts)) - for i, jctx := range gctx.JobSchedulingContexts { - rv[i] = jctx.Req - } - return rv -} - func isEvictedJob(job interfaces.LegacySchedulerJob) bool { return job.GetAnnotations()[schedulerconfig.IsEvictedAnnotation] == "true" } @@ -550,17 +609,13 @@ func isEvictedJob(job interfaces.LegacySchedulerJob) bool { type JobSchedulingContext struct { // Time at which this context was created. Created time.Time - // Executor this job was attempted to be assigned to. - ExecutorId string - // Total number of nodes in the cluster when trying to schedule. - NumNodes int // Id of the job this pod corresponds to. JobId string // Job spec. Job interfaces.LegacySchedulerJob // Scheduling requirements of this job. // We currently require that each job contains exactly one pod spec. - Req *schedulerobjects.PodRequirements + PodRequirements *schedulerobjects.PodRequirements // Reason for why the job could not be scheduled. // Empty if the job was scheduled successfully. UnschedulableReason string @@ -573,7 +628,6 @@ func (jctx *JobSchedulingContext) String() string { w := tabwriter.NewWriter(&sb, 1, 1, 1, ' ', 0) fmt.Fprintf(w, "Time:\t%s\n", jctx.Created) fmt.Fprintf(w, "Job ID:\t%s\n", jctx.JobId) - fmt.Fprintf(w, "Number of nodes in cluster:\t%d\n", jctx.NumNodes) if jctx.UnschedulableReason != "" { fmt.Fprintf(w, "UnschedulableReason:\t%s\n", jctx.UnschedulableReason) } else { @@ -590,16 +644,32 @@ func (jctx *JobSchedulingContext) IsSuccessful() bool { return jctx.UnschedulableReason == "" } +func JobSchedulingContextsFromJobs[J interfaces.LegacySchedulerJob](priorityClasses map[string]configuration.PriorityClass, jobs []J) []*JobSchedulingContext { + jctxs := make([]*JobSchedulingContext, len(jobs)) + timestamp := time.Now() + for i, job := range jobs { + jctxs[i] = &JobSchedulingContext{ + Created: timestamp, + JobId: job.GetId(), + Job: job, + PodRequirements: job.GetPodRequirements(priorityClasses), + } + } + return jctxs +} + // PodSchedulingContext is returned by SelectAndBindNodeToPod and // contains detailed information on the scheduling decision made for this pod. type PodSchedulingContext struct { // Time at which this context was created. Created time.Time - // Node the pod was assigned to. - // If nil, the pod could not be assigned to any node. - Node *schedulerobjects.Node + // ID of the node that the pod was assigned to, or empty. 
+ NodeId string // Score indicates how well the pod fits on the selected node. Score int + // Priority class priority at which this pod was scheduled. + // Only set if NodeId is. + ScheduledAtPriority int32 // Node types on which this pod could be scheduled. MatchingNodeTypes []*schedulerobjects.NodeType // Total number of nodes in the cluster when trying to schedule. @@ -611,11 +681,12 @@ type PodSchedulingContext struct { func (pctx *PodSchedulingContext) String() string { var sb strings.Builder w := tabwriter.NewWriter(&sb, 1, 1, 1, ' ', 0) - if pctx.Node != nil { - fmt.Fprintf(w, "Node:\t%s\n", pctx.Node.Id) + if pctx.NodeId != "" { + fmt.Fprintf(w, "Node:\t%s\n", pctx.NodeId) } else { fmt.Fprint(w, "Node:\tnone\n") } + fmt.Fprintf(w, "Number of nodes in cluster:\t%d\n", pctx.NumNodes) if len(pctx.NumExcludedNodesByReason) == 0 { fmt.Fprint(w, "Excluded nodes:\tnone\n") } else { diff --git a/internal/scheduler/context/context_test.go b/internal/scheduler/context/context_test.go index 150eb593b05..05e17c24068 100644 --- a/internal/scheduler/context/context_test.go +++ b/internal/scheduler/context/context_test.go @@ -42,13 +42,13 @@ func TestSchedulingContextAccounting(t *testing.T) { schedulerobjects.ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, ) priorityFactorByQueue := map[string]float64{"A": 1, "B": 1} - allocatedByQueueAndPriority := map[string]schedulerobjects.QuantityByPriorityAndResourceType{ + allocatedByQueueAndPriorityClass := map[string]schedulerobjects.QuantityByTAndResourceType[string]{ "A": { - 0: schedulerobjects.ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, + "foo": schedulerobjects.ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, }, } for _, queue := range []string{"A", "B"} { - err := sctx.AddQueueSchedulingContext(queue, priorityFactorByQueue[queue], allocatedByQueueAndPriority[queue]) + err := sctx.AddQueueSchedulingContext(queue, priorityFactorByQueue[queue], allocatedByQueueAndPriorityClass[queue]) require.NoError(t, err) } @@ -82,12 +82,10 @@ func testNSmallCpuJobSchedulingContext(queue, priorityClassName string, n int) [ } func testSmallCpuJobSchedulingContext(queue, priorityClassName string) *JobSchedulingContext { - job := testfixtures.Test1CpuJob(queue, priorityClassName) + job := testfixtures.Test1Cpu4GiJob(queue, priorityClassName) return &JobSchedulingContext{ - ExecutorId: "executor", - NumNodes: 1, - JobId: job.GetId(), - Job: job, - Req: job.GetRequirements(nil).ObjectRequirements[0].GetPodRequirements(), + JobId: job.GetId(), + Job: job, + PodRequirements: job.GetPodRequirements(testfixtures.TestPriorityClasses), } } diff --git a/internal/scheduler/gang_scheduler.go b/internal/scheduler/gang_scheduler.go index 1ebbcb189a6..f1a39e31ead 100644 --- a/internal/scheduler/gang_scheduler.go +++ b/internal/scheduler/gang_scheduler.go @@ -4,7 +4,7 @@ import ( "context" "fmt" - "github.com/pkg/errors" + "github.com/hashicorp/go-memdb" "github.com/armadaproject/armada/internal/common/util" schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints" @@ -94,7 +94,7 @@ func (sch *GangScheduler) Schedule(ctx context.Context, gctx *schedulercontext.G // Check that the job is large enough for this executor. // This check needs to be here, since it relates to a specific job. // Only perform limit checks for new jobs to avoid preempting jobs if, e.g., MinimumJobSize changes. 
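For gangs with a node uniformity constraint, the trySchedule changes that follow attempt each indexed value of the uniformity label and keep the value whose attempt succeeds at the lowest mean scheduled-at priority (MinPriority being treated as best possible). A simplified, self-contained sketch of that selection step; gangAttempt and bestUniformityValue are hypothetical stand-ins, and the real code runs each attempt inside a node-db transaction and re-applies the winning value before committing:

package main

import "fmt"

// gangAttempt stands in for one gang scheduling attempt restricted to nodes carrying a single
// value of the uniformity label: ok reports whether every job in the gang found a node, and
// meanPriority is the mean priority class priority the gang's pods were scheduled at.
type gangAttempt struct {
	ok           bool
	meanPriority float64
}

// bestUniformityValue mirrors the selection loop: among successful attempts, keep the label
// value with the lowest mean scheduled-at priority (lower is treated as better).
func bestUniformityValue(attempts map[string]gangAttempt) (string, bool) {
	best, found := "", false
	var minMean float64
	for value, attempt := range attempts {
		if !attempt.ok {
			continue
		}
		if !found || attempt.meanPriority < minMean {
			best, minMean, found = value, attempt.meanPriority, true
		}
	}
	return best, found
}

func main() {
	attempts := map[string]gangAttempt{
		"rack-1": {ok: true, meanPriority: 10}, // fits, but only at a higher priority
		"rack-2": {ok: true, meanPriority: 0},  // fits at the lowest priority
		"rack-3": {ok: false},                  // gang does not fit at all
	}
	value, ok := bestUniformityValue(attempts)
	fmt.Println(value, ok) // rack-2 true
}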
- if ok, unschedulableReason = requestIsLargeEnough(gctx.TotalResourceRequests, sch.constraints.MinimumJobSize); !ok { + if ok, unschedulableReason = requestsAreLargeEnough(gctx.TotalResourceRequests, sch.constraints.MinimumJobSize); !ok { return } if ok, unschedulableReason, err = sch.constraints.CheckPerQueueAndPriorityClassConstraints( @@ -105,40 +105,127 @@ func (sch *GangScheduler) Schedule(ctx context.Context, gctx *schedulercontext.G return } } - if ok, unschedulableReason, err = sch.trySchedule(ctx, gctx); err != nil || ok { + return sch.trySchedule(ctx, gctx) +} + +func (sch *GangScheduler) trySchedule(ctx context.Context, gctx *schedulercontext.GangSchedulingContext) (ok bool, unschedulableReason string, err error) { + // If no node uniformity constraint, try scheduling across all nodes. + if gctx.NodeUniformityLabel == "" { + return sch.tryScheduleGang(ctx, gctx) + } + + // Otherwise try scheduling such that all nodes onto which a gang job lands have the same value for gctx.NodeUniformityLabel. + // We do this by making a separate scheduling attempt for each unique value of gctx.NodeUniformityLabel. + nodeUniformityLabelValues, ok := sch.nodeDb.IndexedNodeLabelValues(gctx.NodeUniformityLabel) + if !ok { + ok = false + unschedulableReason = fmt.Sprintf("uniformity label %s is not indexed", gctx.NodeUniformityLabel) + return + } + if len(nodeUniformityLabelValues) == 0 { + ok = false + unschedulableReason = fmt.Sprintf("no nodes with uniformity label %s", gctx.NodeUniformityLabel) return } - return -} -func (sch *GangScheduler) trySchedule(ctx context.Context, gctx *schedulercontext.GangSchedulingContext) (bool, string, error) { - pctxs, ok, err := sch.nodeDb.ScheduleMany(gctx.PodRequirements()) - if err != nil { - return false, "", err + // Try all possible values of nodeUniformityLabel one at a time to find the best fit. + bestValue := "" + var minMeanScheduledAtPriority float64 + var i int + for value := range nodeUniformityLabelValues { + i++ + if value == "" { + continue + } + addNodeSelectorToGctx(gctx, gctx.NodeUniformityLabel, value) + txn := sch.nodeDb.Txn(true) + if ok, unschedulableReason, err = sch.tryScheduleGangWithTxn(ctx, txn, gctx); err != nil { + txn.Abort() + return + } else if ok { + meanScheduledAtPriority, ok := meanScheduledAtPriorityFromGctx(gctx) + if !ok { + txn.Abort() + continue + } + if meanScheduledAtPriority == float64(nodedb.MinPriority) { + // Best possible; no need to keep looking. + txn.Commit() + return true, "", nil + } + if bestValue == "" || meanScheduledAtPriority <= minMeanScheduledAtPriority { + if i == len(nodeUniformityLabelValues) { + // Minimal meanScheduledAtPriority and no more options; commit and return. + txn.Commit() + return true, "", nil + } + // Record the best value seen so far. 
+ bestValue = value + minMeanScheduledAtPriority = meanScheduledAtPriority + } + } + txn.Abort() } - if len(pctxs) > len(gctx.JobSchedulingContexts) { - return false, "", errors.Errorf( - "received %d pod scheduling context(s), but gang has cardinality %d", - len(pctxs), len(gctx.JobSchedulingContexts), - ) + if bestValue == "" { + ok = false + unschedulableReason = "at least one job in the gang does not fit on any node" + return } - for i, pctx := range pctxs { - gctx.JobSchedulingContexts[i].PodSchedulingContext = pctx - gctx.JobSchedulingContexts[i].NumNodes = pctx.NumNodes + addNodeSelectorToGctx(gctx, gctx.NodeUniformityLabel, bestValue) + return sch.tryScheduleGang(ctx, gctx) +} + +func (sch *GangScheduler) tryScheduleGang(ctx context.Context, gctx *schedulercontext.GangSchedulingContext) (ok bool, unschedulableReason string, err error) { + txn := sch.nodeDb.Txn(true) + defer txn.Abort() + ok, unschedulableReason, err = sch.tryScheduleGangWithTxn(ctx, txn, gctx) + if ok && err == nil { + txn.Commit() } - if !ok { - unschedulableReason := "" + return +} + +func (sch *GangScheduler) tryScheduleGangWithTxn(ctx context.Context, txn *memdb.Txn, gctx *schedulercontext.GangSchedulingContext) (ok bool, unschedulableReason string, err error) { + if ok, err = sch.nodeDb.ScheduleManyWithTxn(txn, gctx.JobSchedulingContexts); err != nil { + return + } else if !ok { + for _, jctx := range gctx.JobSchedulingContexts { + if jctx.PodSchedulingContext != nil { + // Clear any node bindings on failure to schedule. + jctx.PodSchedulingContext.NodeId = "" + } + } if len(gctx.JobSchedulingContexts) > 1 { unschedulableReason = "at least one job in the gang does not fit on any node" } else { unschedulableReason = "job does not fit on any node" } - return false, unschedulableReason, nil + return + } + return +} + +func addNodeSelectorToGctx(gctx *schedulercontext.GangSchedulingContext, nodeSelectorKey, nodeSelectorValue string) { + for _, jctx := range gctx.JobSchedulingContexts { + if jctx.PodRequirements.NodeSelector == nil { + jctx.PodRequirements.NodeSelector = make(map[string]string) + } + jctx.PodRequirements.NodeSelector[nodeSelectorKey] = nodeSelectorValue + } +} + +func meanScheduledAtPriorityFromGctx(gctx *schedulercontext.GangSchedulingContext) (float64, bool) { + var sum int32 + for _, jctx := range gctx.JobSchedulingContexts { + if jctx.PodSchedulingContext == nil { + return 0, false + } + sum += jctx.PodSchedulingContext.ScheduledAtPriority } - return true, "", nil + return float64(sum) / float64(len(gctx.JobSchedulingContexts)), true } -func requestIsLargeEnough(totalResourceRequests, minRequest schedulerobjects.ResourceList) (bool, string) { +func requestsAreLargeEnough(totalResourceRequests, minRequest schedulerobjects.ResourceList) (bool, string) { if len(minRequest.Resources) == 0 { return true, "" } diff --git a/internal/scheduler/gang_scheduler_test.go b/internal/scheduler/gang_scheduler_test.go index d94d0436dd2..ea7153b077a 100644 --- a/internal/scheduler/gang_scheduler_test.go +++ b/internal/scheduler/gang_scheduler_test.go @@ -37,7 +37,7 @@ func TestGangScheduler(t *testing.T) { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Gangs: [][]*jobdb.Job{ - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), }, ExpectedScheduledIndices: testfixtures.IntRange(0, 0), }, @@ -45,7 +45,7 @@ func TestGangScheduler(t *testing.T) { SchedulingConfig: 
testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Gangs: [][]*jobdb.Job{ - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 33), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 33), }, ExpectedScheduledIndices: nil, }, @@ -53,8 +53,8 @@ func TestGangScheduler(t *testing.T) { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Gangs: [][]*jobdb.Job{ - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), }, ExpectedScheduledIndices: testfixtures.IntRange(0, 0), }, @@ -62,7 +62,7 @@ func TestGangScheduler(t *testing.T) { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(2, testfixtures.TestPriorities), Gangs: [][]*jobdb.Job{ - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 64), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 64), }, ExpectedScheduledIndices: testfixtures.IntRange(0, 0), }, @@ -73,9 +73,9 @@ func TestGangScheduler(t *testing.T) { ), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Gangs: [][]*jobdb.Job{ - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 8), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 16), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 8), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 8), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 16), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 8), }, ExpectedScheduledIndices: []int{0, 1}, }, @@ -89,11 +89,11 @@ func TestGangScheduler(t *testing.T) { ), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Gangs: [][]*jobdb.Job{ - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), }, ExpectedScheduledIndices: []int{0, 1, 2}, }, @@ -107,36 +107,36 @@ func TestGangScheduler(t *testing.T) { ), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Gangs: [][]*jobdb.Job{ - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), }, ExpectedScheduledIndices: []int{0, 1, 2, 3}, }, "MaximumResourceFractionPerQueue": { SchedulingConfig: 
testfixtures.WithPerPriorityLimitsConfig( - map[int32]map[string]float64{ - 0: {"cpu": 1.0}, - 1: {"cpu": 15.0 / 32.0}, - 2: {"cpu": 10.0 / 32.0}, - 3: {"cpu": 3.0 / 32.0}, + map[string]map[string]float64{ + testfixtures.PriorityClass0: {"cpu": 1.0 / 32.0}, + testfixtures.PriorityClass1: {"cpu": 2.0 / 32.0}, + testfixtures.PriorityClass2: {"cpu": 3.0 / 32.0}, + testfixtures.PriorityClass3: {"cpu": 4.0 / 32.0}, }, testfixtures.TestSchedulingConfig(), ), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Gangs: [][]*jobdb.Job{ - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass3, 4), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass3, 3), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass2, 8), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass2, 7), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 6), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 5), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 18), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 17), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 2), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 2), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 3), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass2, 3), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass2, 4), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass3, 4), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass3, 5), }, - ExpectedScheduledIndices: []int{1, 3, 5, 7}, + ExpectedScheduledIndices: []int{0, 2, 4, 6}, }, "resolution has no impact on jobs of size a multiple of the resolution": { SchedulingConfig: testfixtures.WithIndexedResourcesConfig( @@ -148,12 +148,12 @@ func TestGangScheduler(t *testing.T) { ), Nodes: testfixtures.N32CpuNodes(3, testfixtures.TestPriorities), Gangs: [][]*jobdb.Job{ - testfixtures.N16CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N16CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N16CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N16CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N16CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N16CpuJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 1), }, ExpectedScheduledIndices: testfixtures.IntRange(0, 5), }, @@ -167,10 +167,10 @@ func TestGangScheduler(t *testing.T) { ), Nodes: testfixtures.N32CpuNodes(3, testfixtures.TestPriorities), Gangs: [][]*jobdb.Job{ - testfixtures.N16CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N16CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N16CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N16CpuJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 1), }, ExpectedScheduledIndices: 
testfixtures.IntRange(0, 2), }, @@ -210,19 +210,113 @@ func TestGangScheduler(t *testing.T) { }, ExpectedScheduledIndices: testfixtures.IntRange(0, 0), }, + "NodeUniformityLabel set but not indexed": { + SchedulingConfig: testfixtures.TestSchedulingConfig(), + Nodes: testfixtures.WithLabelsNodes( + map[string]string{"foo": "foov"}, + testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + ), + Gangs: [][]*jobdb.Job{ + testfixtures.WithNodeUniformityLabelAnnotationJobs( + "foo", + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 1), + ), + }, + ExpectedScheduledIndices: nil, + }, + "NodeUniformityLabel not set": { + SchedulingConfig: testfixtures.WithIndexedNodeLabelsConfig( + []string{"foo", "bar"}, + testfixtures.TestSchedulingConfig(), + ), + Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + Gangs: [][]*jobdb.Job{ + testfixtures.WithNodeUniformityLabelAnnotationJobs( + "foo", + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 1), + ), + }, + ExpectedScheduledIndices: nil, + }, + "NodeUniformityLabel insufficient capacity": { + SchedulingConfig: testfixtures.WithIndexedNodeLabelsConfig( + []string{"foo", "bar"}, + testfixtures.TestSchedulingConfig(), + ), + Nodes: armadaslices.Concatenate( + testfixtures.WithLabelsNodes( + map[string]string{"foo": "foov1"}, + testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + ), + testfixtures.WithLabelsNodes( + map[string]string{"foo": "foov2"}, + testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + ), + ), + Gangs: [][]*jobdb.Job{ + testfixtures.WithNodeUniformityLabelAnnotationJobs( + "foo", + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 3), + ), + }, + ExpectedScheduledIndices: nil, + }, + "NodeUniformityLabel": { + SchedulingConfig: testfixtures.WithIndexedNodeLabelsConfig( + []string{"foo", "bar"}, + testfixtures.TestSchedulingConfig(), + ), + Nodes: armadaslices.Concatenate( + testfixtures.WithLabelsNodes( + map[string]string{"foo": "foov1"}, + testfixtures.WithUsedResourcesNodes( + 0, + schedulerobjects.ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, + testfixtures.N32CpuNodes(2, testfixtures.TestPriorities), + ), + ), + testfixtures.WithLabelsNodes( + map[string]string{"foo": "foov2"}, + testfixtures.N32CpuNodes(2, testfixtures.TestPriorities), + ), + testfixtures.WithLabelsNodes( + map[string]string{"foo": "foov3"}, + testfixtures.WithUsedResourcesNodes( + 0, + schedulerobjects.ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, + testfixtures.N32CpuNodes(2, testfixtures.TestPriorities), + ), + ), + ), + Gangs: [][]*jobdb.Job{ + testfixtures.WithNodeUniformityLabelAnnotationJobs( + "foo", + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass0, 4), + ), + }, + ExpectedScheduledIndices: []int{0}, + }, } for name, tc := range tests { t.Run(name, func(t *testing.T) { + nodesById := make(map[string]*schedulerobjects.Node, len(tc.Nodes)) + for _, node := range tc.Nodes { + nodesById[node.Id] = node + } nodeDb, err := nodedb.NewNodeDb( testfixtures.TestPriorityClasses, testfixtures.TestMaxExtraNodesToConsider, tc.SchedulingConfig.IndexedResources, testfixtures.TestIndexedTaints, - testfixtures.TestIndexedNodeLabels, + tc.SchedulingConfig.IndexedNodeLabels, ) require.NoError(t, err) - err = nodeDb.UpsertMany(tc.Nodes) - require.NoError(t, err) + txn := nodeDb.Txn(true) + for _, node := range tc.Nodes { + err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, nil, node) + require.NoError(t, 
err) + } + txn.Commit() if tc.TotalResources.Resources == nil { // Default to NodeDb total. tc.TotalResources = nodeDb.TotalResources() @@ -256,13 +350,30 @@ func TestGangScheduler(t *testing.T) { var actualScheduledIndices []int for i, gang := range tc.Gangs { - jctxs := jobSchedulingContextsFromJobs(gang, "", testfixtures.TestPriorityClasses) + jctxs := schedulercontext.JobSchedulingContextsFromJobs(testfixtures.TestPriorityClasses, gang) gctx := schedulercontext.NewGangSchedulingContext(jctxs) ok, reason, err := sch.Schedule(context.Background(), gctx) require.NoError(t, err) if ok { require.Empty(t, reason) actualScheduledIndices = append(actualScheduledIndices, i) + + // If there's a node uniformity constraint, check that it's met. + if gctx.NodeUniformityLabel != "" { + nodeUniformityLabelValues := make(map[string]bool) + for _, jctx := range jctxs { + require.NotNil(t, jctx.PodSchedulingContext) + node := nodesById[jctx.PodSchedulingContext.NodeId] + require.NotNil(t, node) + value, ok := node.Labels[gctx.NodeUniformityLabel] + require.True(t, ok, "gang job scheduled onto node with missing nodeUniformityLabel") + nodeUniformityLabelValues[value] = true + } + require.Equal( + t, 1, len(nodeUniformityLabelValues), + "node uniformity constraint not met: %s", nodeUniformityLabelValues, + ) + } } else { require.NotEmpty(t, reason) } diff --git a/internal/scheduler/interfaces/interfaces.go b/internal/scheduler/interfaces/interfaces.go index 7786fb995a0..d77432d46ab 100644 --- a/internal/scheduler/interfaces/interfaces.go +++ b/internal/scheduler/interfaces/interfaces.go @@ -1,6 +1,8 @@ package interfaces import ( + "time" + v1 "k8s.io/api/core/v1" "github.com/armadaproject/armada/internal/armada/configuration" @@ -12,24 +14,13 @@ type LegacySchedulerJob interface { GetId() string GetQueue() string GetJobSet() string + GetPerQueuePriority() uint32 + GetSubmitTime() time.Time GetAnnotations() map[string]string - GetRequirements(map[string]configuration.PriorityClass) *schedulerobjects.JobSchedulingInfo + GetPodRequirements(priorityClasses map[string]configuration.PriorityClass) *schedulerobjects.PodRequirements GetPriorityClassName() string GetNodeSelector() map[string]string GetAffinity() *v1.Affinity GetTolerations() []v1.Toleration GetResourceRequirements() v1.ResourceRequirements } - -func PodRequirementFromLegacySchedulerJob(job LegacySchedulerJob, priorityClasses map[string]configuration.PriorityClass) *schedulerobjects.PodRequirements { - schedulingInfo := job.GetRequirements(priorityClasses) - if schedulingInfo == nil { - return nil - } - for _, objectReq := range schedulingInfo.ObjectRequirements { - if req := objectReq.GetPodRequirements(); req != nil { - return req - } - } - return nil -} diff --git a/internal/scheduler/jobdb/job.go b/internal/scheduler/jobdb/job.go index aae2e08be0f..b9ae76d27b4 100644 --- a/internal/scheduler/jobdb/job.go +++ b/internal/scheduler/jobdb/job.go @@ -13,25 +13,25 @@ import ( // Job is the scheduler-internal representation of a job. type Job struct { - // String representation of the job id + // String representation of the job id. id string // Name of the queue this job belongs to. queue string - // Jobset the job belongs to - // We store this as it's needed for sending job event messages + // Jobset the job belongs to. + // We store this as it's needed for sending job event messages. jobset string // Per-queue priority of this job. priority uint32 // Requested per queue priority of this job. 
- // This is used when syncing the postgres database with the scheduler-internal database + // This is used when syncing the postgres database with the scheduler-internal database. requestedPriority uint32 // Logical timestamp indicating the order in which jobs are submitted. // Jobs with identical Queue and Priority are sorted by this. created int64 // True if the job is currently queued. - // If this is set then the job will not be considered for scheduling + // If this is set then the job will not be considered for scheduling. queued bool - // The current version of the queued state + // The current version of the queued state. queuedVersion int32 // Scheduling requirements of this job. jobSchedulingInfo *schedulerobjects.JobSchedulingInfo @@ -71,6 +71,20 @@ func NewJob( cancelled bool, created int64, ) *Job { + // Initialise the annotation and nodeSelector maps if nil. + // Since those need to be mutated in-place. + if schedulingInfo != nil { + for _, req := range schedulingInfo.ObjectRequirements { + if podReq := req.GetPodRequirements(); podReq != nil { + if podReq.Annotations == nil { + podReq.Annotations = make(map[string]string) + } + if podReq.NodeSelector == nil { + podReq.NodeSelector = make(map[string]string) + } + } + } + } return &Job{ id: jobId, jobset: jobset, @@ -126,6 +140,19 @@ func (job *Job) Priority() uint32 { return job.priority } +// GetPerQueuePriority exists for compatibility with the LegacyJob interface. +func (job *Job) GetPerQueuePriority() uint32 { + return job.priority +} + +// GetSubmitTime exists for compatibility with the LegacyJob interface. +func (job *Job) GetSubmitTime() time.Time { + if job.jobSchedulingInfo == nil { + return time.Time{} + } + return job.jobSchedulingInfo.SubmitTime +} + // RequestedPriority returns the requested priority of the job. func (job *Job) RequestedPriority() uint32 { return job.requestedPriority @@ -153,18 +180,12 @@ func (job *Job) JobSchedulingInfo() *schedulerobjects.JobSchedulingInfo { // GetAnnotations returns the annotations on the job. // This is needed for compatibility with interfaces.LegacySchedulerJob func (job *Job) GetAnnotations() map[string]string { - if req := job.getPodRequirements(); req != nil { + if req := job.PodRequirements(); req != nil { return req.Annotations } return nil } -// GetRequirements returns the scheduling requirements associated with the job. 
-// Needed for compatibility with interfaces.LegacySchedulerJob -func (job *Job) GetRequirements(_ map[string]configuration.PriorityClass) *schedulerobjects.JobSchedulingInfo { - return job.JobSchedulingInfo() -} - // Needed for compatibility with interfaces.LegacySchedulerJob func (job *Job) GetPriorityClassName() string { return job.JobSchedulingInfo().PriorityClassName @@ -172,7 +193,7 @@ func (job *Job) GetPriorityClassName() string { // Needed for compatibility with interfaces.LegacySchedulerJob func (job *Job) GetNodeSelector() map[string]string { - if req := job.getPodRequirements(); req != nil { + if req := job.PodRequirements(); req != nil { return req.NodeSelector } return nil @@ -180,7 +201,7 @@ func (job *Job) GetNodeSelector() map[string]string { // Needed for compatibility with interfaces.LegacySchedulerJob func (job *Job) GetAffinity() *v1.Affinity { - if req := job.getPodRequirements(); req != nil { + if req := job.PodRequirements(); req != nil { return req.Affinity } return nil @@ -188,7 +209,7 @@ func (job *Job) GetAffinity() *v1.Affinity { // Needed for compatibility with interfaces.LegacySchedulerJob func (job *Job) GetTolerations() []v1.Toleration { - if req := job.getPodRequirements(); req != nil { + if req := job.PodRequirements(); req != nil { return req.Tolerations } return nil @@ -196,21 +217,19 @@ func (job *Job) GetTolerations() []v1.Toleration { // Needed for compatibility with interfaces.LegacySchedulerJob func (job *Job) GetResourceRequirements() v1.ResourceRequirements { - if req := job.getPodRequirements(); req != nil { + if req := job.PodRequirements(); req != nil { return req.ResourceRequirements } return v1.ResourceRequirements{} } -func (job *Job) getPodRequirements() *schedulerobjects.PodRequirements { - requirements := job.jobSchedulingInfo.GetObjectRequirements() - if len(requirements) == 0 { - return nil - } - if podReqs := requirements[0].GetPodRequirements(); podReqs != nil { - return podReqs - } - return nil +func (job *Job) PodRequirements() *schedulerobjects.PodRequirements { + return job.jobSchedulingInfo.GetPodRequirements() +} + +// GetPodRequirements is needed for compatibility with interfaces.LegacySchedulerJob. +func (job *Job) GetPodRequirements(_ map[string]configuration.PriorityClass) *schedulerobjects.PodRequirements { + return job.PodRequirements() } // Queued returns true if the job should be considered by the scheduler for assignment or false otherwise. diff --git a/internal/scheduler/jobdb/job_test.go b/internal/scheduler/jobdb/job_test.go index 349cb0d998a..f5121c107ca 100644 --- a/internal/scheduler/jobdb/job_test.go +++ b/internal/scheduler/jobdb/job_test.go @@ -54,7 +54,6 @@ func TestJob_TestGetter(t *testing.T) { assert.Equal(t, baseJob.queue, baseJob.Queue()) assert.Equal(t, baseJob.queue, baseJob.GetQueue()) assert.Equal(t, baseJob.created, baseJob.Created()) - assert.Equal(t, schedulingInfo, baseJob.GetRequirements(nil)) assert.Equal(t, schedulingInfo, baseJob.JobSchedulingInfo()) assert.Equal(t, baseJob.GetAnnotations(), map[string]string{ "foo": "bar", diff --git a/internal/scheduler/jobiteration.go b/internal/scheduler/jobiteration.go index d9b30f434cf..202bc38bd45 100644 --- a/internal/scheduler/jobiteration.go +++ b/internal/scheduler/jobiteration.go @@ -88,23 +88,23 @@ func (repo *InMemoryJobRepository) Enqueue(job interfaces.LegacySchedulerJob) { // finally by submit time, with earlier submit times first. 
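To make the ordering described in the comment above concrete, here is a minimal, self-contained sketch of the same three-way comparison that the rewritten comparator below applies (priority-class priority is only consulted when sortByPriorityClass is set). The jobInfo struct and the sample values are hypothetical stand-ins for the LegacySchedulerJob getters.

package main

import (
    "fmt"
    "sort"
    "time"
)

// jobInfo stands in for the three values the real comparator reads via
// priorityClasses[job.GetPriorityClassName()].Priority, job.GetPerQueuePriority()
// and job.GetSubmitTime().
type jobInfo struct {
    name                  string
    priorityClassPriority int32
    perQueuePriority      uint32
    submitTime            time.Time
}

func main() {
    t0 := time.Now()
    jobs := []jobInfo{
        {"c", 0, 1, t0},
        {"b", 10, 5, t0.Add(time.Second)},
        {"a", 10, 5, t0},
    }
    sort.Slice(jobs, func(i, j int) bool {
        a, b := jobs[i], jobs[j]
        if a.priorityClassPriority != b.priorityClassPriority {
            // Higher priority-class priority schedules first.
            return a.priorityClassPriority > b.priorityClassPriority
        }
        if a.perQueuePriority != b.perQueuePriority {
            // Lower per-queue priority value schedules first.
            return a.perQueuePriority < b.perQueuePriority
        }
        // Earlier submit time schedules first.
        return a.submitTime.Before(b.submitTime)
    })
    for _, j := range jobs {
        fmt.Println(j.name) // prints a, b, c
    }
}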
func (repo *InMemoryJobRepository) sortQueue(queue string) { slices.SortFunc(repo.jobsByQueue[queue], func(a, b interfaces.LegacySchedulerJob) bool { - infoa := a.GetRequirements(repo.priorityClasses) - infob := b.GetRequirements(repo.priorityClasses) if repo.sortByPriorityClass { - pca := repo.priorityClasses[infoa.PriorityClassName] - pcb := repo.priorityClasses[infob.PriorityClassName] + pca := repo.priorityClasses[a.GetPriorityClassName()] + pcb := repo.priorityClasses[b.GetPriorityClassName()] if pca.Priority > pcb.Priority { return true } else if pca.Priority < pcb.Priority { return false } } - if infoa.GetPriority() < infob.GetPriority() { + pa := a.GetPerQueuePriority() + pb := b.GetPerQueuePriority() + if pa < pb { return true - } else if infoa.GetPriority() > infob.GetPriority() { + } else if pa > pb { return false } - return infoa.GetSubmitTime().Before(infob.GetSubmitTime()) + return a.GetSubmitTime().Before(b.GetSubmitTime()) }) } diff --git a/internal/scheduler/leader.go b/internal/scheduler/leader.go index 91985139743..9ccfae7476e 100644 --- a/internal/scheduler/leader.go +++ b/internal/scheduler/leader.go @@ -103,6 +103,10 @@ func NewKubernetesLeaderController(config schedulerconfig.LeaderConfig, client c return controller } +func (lc *KubernetesLeaderController) RegisterListener(listener LeaseListener) { + lc.listener = listener +} + func (lc *KubernetesLeaderController) GetToken() LeaderToken { return lc.token.Load().(LeaderToken) } diff --git a/internal/scheduler/leader_metrics.go b/internal/scheduler/leader_metrics.go new file mode 100644 index 00000000000..74bb63bb6dc --- /dev/null +++ b/internal/scheduler/leader_metrics.go @@ -0,0 +1,63 @@ +package scheduler + +import ( + "context" + "sync" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/armadaproject/armada/internal/common/metrics" +) + +var leaderStatusDesc = prometheus.NewDesc( + metrics.MetricPrefix+"_scheduler_leader_status", + "Gauge of if the reporting system is leader, 0 indicates hot replica, 1 indicates leader.", + []string{"name"}, nil, +) + +type LeaderStatusMetricsCollector struct { + currentInstanceName string + isCurrentlyLeader bool + lock sync.Mutex +} + +func NewLeaderStatusMetricsCollector(currentInstanceName string) *LeaderStatusMetricsCollector { + return &LeaderStatusMetricsCollector{ + isCurrentlyLeader: false, + currentInstanceName: currentInstanceName, + lock: sync.Mutex{}, + } +} + +func (l *LeaderStatusMetricsCollector) onStartedLeading(context.Context) { + l.lock.Lock() + defer l.lock.Unlock() + + l.isCurrentlyLeader = true +} + +func (l *LeaderStatusMetricsCollector) onStoppedLeading() { + l.lock.Lock() + defer l.lock.Unlock() + + l.isCurrentlyLeader = false +} + +func (l *LeaderStatusMetricsCollector) isLeading() bool { + l.lock.Lock() + defer l.lock.Unlock() + + return l.isCurrentlyLeader +} + +func (l *LeaderStatusMetricsCollector) Describe(desc chan<- *prometheus.Desc) { + desc <- leaderStatusDesc +} + +func (l *LeaderStatusMetricsCollector) Collect(metrics chan<- prometheus.Metric) { + value := float64(0) + if l.isLeading() { + value = 1 + } + metrics <- prometheus.MustNewConstMetric(leaderStatusDesc, prometheus.GaugeValue, value, l.currentInstanceName) +} diff --git a/internal/scheduler/leader_metrics_test.go b/internal/scheduler/leader_metrics_test.go new file mode 100644 index 00000000000..fec5d4e5d08 --- /dev/null +++ b/internal/scheduler/leader_metrics_test.go @@ -0,0 +1,56 @@ +package scheduler + +import ( + "context" + "testing" + + 
"github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" +) + +const testInstanceName = "instance-1" + +var ( + isNotLeaderMetric = prometheus.MustNewConstMetric(leaderStatusDesc, prometheus.GaugeValue, float64(0), testInstanceName) + isLeaderMetric = prometheus.MustNewConstMetric(leaderStatusDesc, prometheus.GaugeValue, float64(1), testInstanceName) +) + +func TestLeaderStatusMetrics_DefaultsToNotLeader(t *testing.T) { + collector := NewLeaderStatusMetricsCollector(testInstanceName) + + actual := getCurrentMetrics(collector) + assert.Len(t, actual, 1) + assert.Equal(t, actual[0], isNotLeaderMetric) +} + +func TestLeaderStatusMetrics_HandlesLeaderChanges(t *testing.T) { + collector := NewLeaderStatusMetricsCollector(testInstanceName) + + actual := getCurrentMetrics(collector) + assert.Len(t, actual, 1) + assert.Equal(t, actual[0], isNotLeaderMetric) + + // start leading + collector.onStartedLeading(context.Background()) + actual = getCurrentMetrics(collector) + assert.Len(t, actual, 1) + assert.Equal(t, actual[0], isLeaderMetric) + + // stop leading + collector.onStoppedLeading() + actual = getCurrentMetrics(collector) + assert.Len(t, actual, 1) + assert.Equal(t, actual[0], isNotLeaderMetric) +} + +func getCurrentMetrics(collector *LeaderStatusMetricsCollector) []prometheus.Metric { + metricChan := make(chan prometheus.Metric, 1000) + collector.Collect(metricChan) + close(metricChan) + + actual := make([]prometheus.Metric, 0) + for m := range metricChan { + actual = append(actual, m) + } + return actual +} diff --git a/internal/scheduler/metrics.go b/internal/scheduler/metrics.go index 16e55f040bd..f982f8c10f1 100644 --- a/internal/scheduler/metrics.go +++ b/internal/scheduler/metrics.go @@ -259,7 +259,7 @@ func (c *MetricsCollector) updateClusterMetrics(ctx context.Context) ([]promethe } phaseCountByQueue[key]++ - podRequirements := PodRequirementFromJobSchedulingInfo(job.JobSchedulingInfo()) + podRequirements := job.PodRequirements() if podRequirements != nil { queueKey := queueMetricKey{ cluster: executor.Id, diff --git a/internal/scheduler/nodedb/nodedb.go b/internal/scheduler/nodedb/nodedb.go index 4ddd839a9d6..1dbbb71b377 100644 --- a/internal/scheduler/nodedb/nodedb.go +++ b/internal/scheduler/nodedb/nodedb.go @@ -17,19 +17,187 @@ import ( "github.com/armadaproject/armada/internal/armada/configuration" "github.com/armadaproject/armada/internal/common/armadaerrors" + armadamaps "github.com/armadaproject/armada/internal/common/maps" "github.com/armadaproject/armada/internal/common/util" schedulerconfig "github.com/armadaproject/armada/internal/scheduler/configuration" schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" + "github.com/armadaproject/armada/internal/scheduler/interfaces" + "github.com/armadaproject/armada/internal/scheduler/jobdb" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" + "github.com/armadaproject/armada/pkg/api" ) -// evictedPriority is the priority class priority resources consumed by evicted jobs are accounted for at. -// This helps avoid scheduling new jobs onto nodes that make it impossible to re-schedule evicted jobs. -const evictedPriority int32 = -1 +const ( + // evictedPriority is the priority class priority resources consumed by evicted jobs are accounted for at. + // This helps avoid scheduling new jobs onto nodes that make it impossible to re-schedule evicted jobs. 
+ evictedPriority int32 = -1 + // MinPriority is the smallest possible priority class priority within the NodeDb. + MinPriority int32 = evictedPriority +) + +var empty struct{} + +type Node struct { + Id string + Name string + + // We need to store taints and labels separately from the node type: the latter only includes + // indexed taints and labels, but we need all of them when checking pod requirements. + Taints []v1.Taint + Labels map[string]string + + TotalResources schedulerobjects.ResourceList + + // This field is set when inserting the Node into a NodeDb. + Keys [][]byte + + NodeTypeId uint64 + + AllocatableByPriority schedulerobjects.AllocatableByPriorityAndResourceType + AllocatedByQueue map[string]schedulerobjects.ResourceList + AllocatedByJobId map[string]schedulerobjects.ResourceList + EvictedJobRunIds map[string]bool +} + +// UnsafeCopy returns a pointer to a new value of type Node; it is unsafe because it only makes +// shallow copies of fields that are not mutated by methods of NodeDb. +func (node *Node) UnsafeCopy() *Node { + return &Node{ + Id: node.Id, + Name: node.Name, + + Taints: node.Taints, + Labels: node.Labels, + + TotalResources: node.TotalResources, + + Keys: nil, + + NodeTypeId: node.NodeTypeId, + + AllocatableByPriority: armadamaps.DeepCopy(node.AllocatableByPriority), + AllocatedByQueue: armadamaps.DeepCopy(node.AllocatedByQueue), + AllocatedByJobId: armadamaps.DeepCopy(node.AllocatedByJobId), + EvictedJobRunIds: maps.Clone(node.EvictedJobRunIds), + } +} + +func (nodeDb *NodeDb) create(node *schedulerobjects.Node) (*Node, error) { + taints := node.GetTaints() + if node.Unschedulable { + taints = append(slices.Clone(taints), UnschedulableTaint()) + } + + labels := maps.Clone(node.GetLabels()) + if labels == nil { + labels = make(map[string]string) + } + labels[schedulerconfig.NodeIdLabel] = node.Id + + totalResources := node.TotalResources + + nodeType := schedulerobjects.NewNodeType( + taints, + labels, + nodeDb.indexedTaints, + nodeDb.indexedNodeLabels, + ) + + allocatableByPriority := schedulerobjects.AllocatableByPriorityAndResourceType(node.AllocatableByPriorityAndResource).DeepCopy() + minimumPriority := int32(math.MaxInt32) + for p := range allocatableByPriority { + if p < minimumPriority { + minimumPriority = p + } + } + if minimumPriority < 0 { + return nil, errors.Errorf("found negative priority %d on node %s; negative priorities are reserved for internal use", minimumPriority, node.Id) + } + allocatableByPriority[evictedPriority] = allocatableByPriority[minimumPriority].DeepCopy() + + allocatedByQueue := node.AllocatedByQueue + if allocatedByQueue == nil { + allocatedByQueue = make(map[string]schedulerobjects.ResourceList) + } + + allocatedByJobId := node.AllocatedByJobId + if allocatedByJobId == nil { + allocatedByJobId = make(map[string]schedulerobjects.ResourceList) + } + + evictedJobRunIds := node.EvictedJobRunIds + if evictedJobRunIds == nil { + evictedJobRunIds = make(map[string]bool) + } + + nodeDb.mu.Lock() + for key := range nodeDb.indexedNodeLabels { + if value, ok := labels[key]; ok { + nodeDb.indexedNodeLabelValues[key][value] = empty + } + } + nodeDb.numNodes++ + nodeDb.numNodesByNodeType[nodeType.Id]++ + nodeDb.totalResources.Add(totalResources) + nodeDb.nodeTypes[nodeType.Id] = nodeType + nodeDb.mu.Unlock() + + entry := &Node{ + Id: node.Id, + Name: node.Name, + + Taints: taints, + Labels: labels, + + TotalResources: totalResources, + + Keys: nil, + + NodeTypeId: nodeType.Id, + + AllocatableByPriority: allocatableByPriority, + 
AllocatedByQueue: allocatedByQueue, + AllocatedByJobId: allocatedByJobId, + EvictedJobRunIds: evictedJobRunIds, + } + return entry, nil +} + +func (nodeDb *NodeDb) CreateAndInsertWithApiJobsWithTxn(txn *memdb.Txn, jobs []*api.Job, node *schedulerobjects.Node) error { + entry, err := nodeDb.create(node) + if err != nil { + return err + } + for _, job := range jobs { + if err := bindJobToNodeInPlace(nodeDb.priorityClasses, job, entry); err != nil { + return err + } + } + if err := nodeDb.UpsertWithTxn(txn, entry); err != nil { + return err + } + return nil +} + +func (nodeDb *NodeDb) CreateAndInsertWithJobDbJobsWithTxn(txn *memdb.Txn, jobs []*jobdb.Job, node *schedulerobjects.Node) error { + entry, err := nodeDb.create(node) + if err != nil { + return err + } + for _, job := range jobs { + if err := bindJobToNodeInPlace(nodeDb.priorityClasses, job, entry); err != nil { + return err + } + } + if err := nodeDb.UpsertWithTxn(txn, entry); err != nil { + return err + } + return nil +} // NodeDb is the scheduler-internal system used to efficiently find nodes on which a pod could be scheduled. type NodeDb struct { - // In-memory database storing *schedulerobjects.Node. + // In-memory database storing *Node. db *memdb.MemDB // Once a node has been found on which a pod can be scheduled, // the NodeDb will consider up to the next maxExtraNodesToConsider nodes. @@ -76,6 +244,12 @@ type NodeDb struct { // // If not set, no labels are indexed. indexedNodeLabels map[string]interface{} + + // Mutex for the remaining fields of this struct, which are mutated after initialization. + mu sync.Mutex + + // Map from indexed label names to the set of values that label takes across all nodes in the NodeDb. + indexedNodeLabelValues map[string]map[string]struct{} // Total number of nodes in the db. numNodes int // Number of nodes in the db by node type. @@ -88,8 +262,6 @@ type NodeDb struct { // Map from podRequirementsNotMetReason Sum64() to the string representation of that reason. // Used to avoid allocs. podRequirementsNotMetReasonStringCache map[uint64]string - // Mutex to control access to totalResources and NodeTypes. - mu sync.Mutex } func NewNodeDb( @@ -134,6 +306,10 @@ func NewNodeDb( Message: "there must be at least one indexed resource", }) } + indexedNodeLabelValues := make(map[string]map[string]struct{}, len(indexedNodeLabels)) + for _, key := range indexedNodeLabels { + indexedNodeLabelValues[key] = make(map[string]struct{}) + } mapFromSlice := func(vs []string) map[string]interface{} { rv := make(map[string]interface{}) for _, v := range vs { @@ -152,13 +328,14 @@ func NewNodeDb( indexedResources, func(v configuration.IndexedResource) int64 { return v.Resolution.MilliValue() }, ), - indexNameByPriority: indexNameByPriority, - indexedTaints: mapFromSlice(indexedTaints), - indexedNodeLabels: mapFromSlice(indexedNodeLabels), - nodeTypes: make(map[uint64]*schedulerobjects.NodeType), - numNodesByNodeType: make(map[uint64]int), - totalResources: schedulerobjects.ResourceList{Resources: make(map[string]resource.Quantity)}, - db: db, + indexNameByPriority: indexNameByPriority, + indexedTaints: mapFromSlice(indexedTaints), + indexedNodeLabels: mapFromSlice(indexedNodeLabels), + indexedNodeLabelValues: indexedNodeLabelValues, + nodeTypes: make(map[uint64]*schedulerobjects.NodeType), + numNodesByNodeType: make(map[uint64]int), + totalResources: schedulerobjects.ResourceList{Resources: make(map[string]resource.Quantity)}, + db: db, // Set the initial capacity (somewhat arbitrarily) to 128 reasons. 
podRequirementsNotMetReasonStringCache: make(map[uint64]string, 128), }, nil @@ -183,6 +360,12 @@ func (nodeDb *NodeDb) String() string { return sb.String() } +// IndexedNodeLabelValues returns the set of possible values for a given indexed label across all nodes in the NodeDb. +func (nodeDb *NodeDb) IndexedNodeLabelValues(label string) (map[string]struct{}, bool) { + values, ok := nodeDb.indexedNodeLabelValues[label] + return values, ok +} + func (nodeDb *NodeDb) NumNodes() int { nodeDb.mu.Lock() defer nodeDb.mu.Unlock() @@ -200,13 +383,13 @@ func (nodeDb *NodeDb) Txn(write bool) *memdb.Txn { } // GetNode returns a node in the db with given id. -func (nodeDb *NodeDb) GetNode(id string) (*schedulerobjects.Node, error) { +func (nodeDb *NodeDb) GetNode(id string) (*Node, error) { return nodeDb.GetNodeWithTxn(nodeDb.Txn(false), id) } // GetNodeWithTxn returns a node in the db with given id, // within the provided transactions. -func (nodeDb *NodeDb) GetNodeWithTxn(txn *memdb.Txn, id string) (*schedulerobjects.Node, error) { +func (nodeDb *NodeDb) GetNodeWithTxn(txn *memdb.Txn, id string) (*Node, error) { it, err := txn.Get("nodes", "id", id) if err != nil { return nil, errors.WithStack(err) @@ -215,20 +398,16 @@ func (nodeDb *NodeDb) GetNodeWithTxn(txn *memdb.Txn, id string) (*schedulerobjec if obj == nil { return nil, nil } - if node, ok := obj.(*schedulerobjects.Node); !ok { - panic(fmt.Sprintf("expected *Node, but got %T", obj)) - } else { - return node, nil - } + return obj.(*Node), nil } // NodeJobDiff compares two snapshots of the NodeDb memdb and returns // - a map from job ids of all preempted jobs to the node they used to be on // - a map from job ids of all scheduled jobs to the node they were scheduled on // that happened between the two snapshots. -func NodeJobDiff(txnA, txnB *memdb.Txn) (map[string]*schedulerobjects.Node, map[string]*schedulerobjects.Node, error) { - preempted := make(map[string]*schedulerobjects.Node) - scheduled := make(map[string]*schedulerobjects.Node) +func NodeJobDiff(txnA, txnB *memdb.Txn) (map[string]*Node, map[string]*Node, error) { + preempted := make(map[string]*Node) + scheduled := make(map[string]*Node) nodePairIterator, err := NewNodePairIterator(txnA, txnB) if err != nil { return nil, nil, err @@ -263,90 +442,52 @@ func NodeJobDiff(txnA, txnB *memdb.Txn) (map[string]*schedulerobjects.Node, map[ return preempted, scheduled, nil } -// ScheduleMany assigns a set of pods to nodes. -// The assignment is atomic, i.e., either all pods are successfully assigned to nodes or none are. -// The returned bool indicates whether assignment succeeded or not. +// ScheduleMany assigns a set of jobs to nodes. The assignment is atomic, i.e., either all jobs are +// successfully assigned to nodes or none are. The returned bool indicates whether assignment +// succeeded (true) or not (false). +// +// This method sets the PodSchedulingContext field on each JobSchedulingContext that it attempts to +// schedule; if it returns early (e.g., because it finds an unschedulable JobSchedulingContext), +// then this field will not be set on the remaining items. // TODO: Pass through contexts to support timeouts. 
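As an illustrative aside, a rough sketch of how a caller is expected to use the reworked API described above: JobSchedulingContexts are built up front and the assigned node ids are read back off them afterwards, rather than receiving a slice of PodSchedulingContexts. The scheduleGang helper and the example package are hypothetical; the other identifiers are those introduced elsewhere in this patch.

package example

import (
    "fmt"

    "github.com/armadaproject/armada/internal/armada/configuration"
    schedulercontext "github.com/armadaproject/armada/internal/scheduler/context"
    "github.com/armadaproject/armada/internal/scheduler/jobdb"
    "github.com/armadaproject/armada/internal/scheduler/nodedb"
)

// scheduleGang is a hypothetical helper showing the intended call pattern.
func scheduleGang(
    nodeDb *nodedb.NodeDb,
    priorityClasses map[string]configuration.PriorityClass,
    gang []*jobdb.Job,
) (bool, error) {
    jctxs := schedulercontext.JobSchedulingContextsFromJobs(priorityClasses, gang)
    ok, err := nodeDb.ScheduleMany(jctxs)
    if err != nil || !ok {
        // Nothing was committed: either an error occurred or at least one job
        // in the gang could not be scheduled.
        return ok, err
    }
    for _, jctx := range jctxs {
        // On success, ScheduleMany has set PodSchedulingContext on every jctx,
        // including the id of the node the job was bound to.
        fmt.Printf("job %s -> node %s\n", jctx.Job.GetId(), jctx.PodSchedulingContext.NodeId)
    }
    return true, nil
}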
-func (nodeDb *NodeDb) ScheduleMany(reqs []*schedulerobjects.PodRequirements) ([]*schedulercontext.PodSchedulingContext, bool, error) { +func (nodeDb *NodeDb) ScheduleMany(jctxs []*schedulercontext.JobSchedulingContext) (bool, error) { txn := nodeDb.db.Txn(true) defer txn.Abort() - pctxs, ok, err := nodeDb.ScheduleManyWithTxn(txn, reqs) + ok, err := nodeDb.ScheduleManyWithTxn(txn, jctxs) if ok && err == nil { // All pods can be scheduled; commit the transaction. txn.Commit() - } else { - // On failure, clear the node binding. - for _, pctx := range pctxs { - pctx.Node = nil - } } - return pctxs, ok, err + return ok, err } -func (nodeDb *NodeDb) ScheduleManyWithTxn(txn *memdb.Txn, reqs []*schedulerobjects.PodRequirements) ([]*schedulercontext.PodSchedulingContext, bool, error) { +func (nodeDb *NodeDb) ScheduleManyWithTxn(txn *memdb.Txn, jctxs []*schedulercontext.JobSchedulingContext) (bool, error) { // Attempt to schedule pods one by one in a transaction. - pctxs := make([]*schedulercontext.PodSchedulingContext, 0, len(reqs)) - for _, req := range reqs { - pctx, err := nodeDb.SelectNodeForPodWithTxn(txn, req) + for _, jctx := range jctxs { + node, err := nodeDb.SelectNodeForJobWithTxn(txn, jctx) if err != nil { - return nil, false, err + return false, err } - pctxs = append(pctxs, pctx) - // If we found a node for this pod, bind it and continue to the next pod. - // - // Otherwise, zero out the node binding for all pods and abort the transaction. - if pctx.Node != nil { - if node, err := BindPodToNode(req, pctx.Node); err != nil { - return nil, false, err + if node != nil { + if node, err := bindJobToNode(nodeDb.priorityClasses, jctx.Job, node); err != nil { + return false, err } else { if err := nodeDb.UpsertWithTxn(txn, node); err != nil { - return nil, false, err + return false, err } - pctx.Node = node } } else { - return pctxs, false, nil + return false, nil } } - return pctxs, true, nil + return true, nil } -func (nodeDb *NodeDb) SelectAndBindNodeToPod(req *schedulerobjects.PodRequirements) (*schedulercontext.PodSchedulingContext, error) { - txn := nodeDb.db.Txn(true) - defer txn.Abort() - pctx, err := nodeDb.SelectAndBindNodeToPodWithTxn(txn, req) - if err != nil { - return nil, err - } - txn.Commit() - return pctx, nil -} - -func (nodeDb *NodeDb) SelectAndBindNodeToPodWithTxn(txn *memdb.Txn, req *schedulerobjects.PodRequirements) (*schedulercontext.PodSchedulingContext, error) { - pctx, err := nodeDb.SelectNodeForPodWithTxn(txn, req) - if err != nil { - return nil, err - } - if pctx.Node != nil { - if node, err := BindPodToNode(req, pctx.Node); err != nil { - return nil, err - } else { - if err := nodeDb.UpsertWithTxn(txn, node); err != nil { - return nil, err - } - pctx.Node = node - } - } - return pctx, nil -} - -func (nodeDb *NodeDb) SelectNodeForPod(req *schedulerobjects.PodRequirements) (*schedulercontext.PodSchedulingContext, error) { - return nodeDb.SelectNodeForPodWithTxn(nodeDb.db.Txn(false), req) -} +// SelectNodeForJobWithTxn selects a node on which the job can be scheduled. +func (nodeDb *NodeDb) SelectNodeForJobWithTxn(txn *memdb.Txn, jctx *schedulercontext.JobSchedulingContext) (*Node, error) { + req := jctx.PodRequirements -// SelectNodeForPodWithTxn selects a node on which the pod can be scheduled. -func (nodeDb *NodeDb) SelectNodeForPodWithTxn(txn *memdb.Txn, req *schedulerobjects.PodRequirements) (*schedulercontext.PodSchedulingContext, error) { // Collect all node types that could potentially schedule the pod. 
matchingNodeTypes, numExcludedNodesByReason, err := nodeDb.NodeTypesMatchingPod(req) if err != nil { @@ -360,10 +501,11 @@ func (nodeDb *NodeDb) SelectNodeForPodWithTxn(txn *memdb.Txn, req *schedulerobje NumNodes: nodeDb.numNodes, NumExcludedNodesByReason: maps.Clone(numExcludedNodesByReason), } + jctx.PodSchedulingContext = pctx // For pods that failed to schedule, add an exclusion reason for implicitly excluded nodes. defer func() { - if pctx.Node != nil { + if pctx.NodeId != "" { return } numExplicitlyExcludedNodes := 0 @@ -382,10 +524,10 @@ func (nodeDb *NodeDb) SelectNodeForPodWithTxn(txn *memdb.Txn, req *schedulerobje if it, err := txn.Get("nodes", "id", nodeId); err != nil { return nil, errors.WithStack(err) } else { - if _, err := nodeDb.selectNodeForPodWithIt(pctx, it, req.Priority, req, true); err != nil { + if node, err := nodeDb.selectNodeForPodWithIt(pctx, it, req.Priority, req, true); err != nil { return nil, err } else { - return pctx, nil + return node, nil } } } @@ -401,24 +543,25 @@ func (nodeDb *NodeDb) SelectNodeForPodWithTxn(txn *memdb.Txn, req *schedulerobje // (since we may consider all nodes at each priority). pctx.NumExcludedNodesByReason = maps.Clone(numExcludedNodesByReason) - // To to find a node at this priority. + // Try to find a node at this priority. node, err := nodeDb.selectNodeForPodAtPriority(txn, pctx, priority, req) if err != nil { return nil, err } if node != nil { - if pctx.Node == nil { - return nil, errors.New("pctx.Node not set") + if pctx.NodeId == "" { + return nil, errors.New("pctx.NodeId not set") } - if node.Id != pctx.Node.Id { - return nil, errors.New("pctx.Node.Id does not match that of the returned node") + if node.Id != pctx.NodeId { + return nil, errors.New("pctx.NodeId does not match that of the returned node") } - return pctx, nil - } else if pctx.Node != nil { - return nil, errors.New("pctx.Node is set, but no node was returned") + return node, nil + } + if pctx.NodeId != "" { + return nil, errors.New("pctx.NodeId is set, but no node was returned") } } - return pctx, nil + return nil, nil } func (nodeDb *NodeDb) selectNodeForPodAtPriority( @@ -426,7 +569,7 @@ func (nodeDb *NodeDb) selectNodeForPodAtPriority( pctx *schedulercontext.PodSchedulingContext, priority int32, req *schedulerobjects.PodRequirements, -) (*schedulerobjects.Node, error) { +) (*Node, error) { nodeTypeIds := make([]uint64, len(pctx.MatchingNodeTypes)) for i, nodeType := range pctx.MatchingNodeTypes { nodeTypeIds[i] = nodeType.Id @@ -468,27 +611,37 @@ func (nodeDb *NodeDb) selectNodeForPodWithIt( priority int32, req *schedulerobjects.PodRequirements, onlyCheckDynamicRequirements bool, -) (*schedulerobjects.Node, error) { - var selectedNode *schedulerobjects.Node +) (*Node, error) { + var selectedNode *Node var selectedNodeScore int - var numConsideredNodes uint + var numExtraNodes uint for obj := it.Next(); obj != nil; obj = it.Next() { - node := obj.(*schedulerobjects.Node) + if selectedNode != nil { + numExtraNodes++ + if numExtraNodes > nodeDb.maxExtraNodesToConsider { + break + } + } + + node := obj.(*Node) if node == nil { return nil, nil } + var matches bool var score int var reason schedulerobjects.PodRequirementsNotMetReason var err error if onlyCheckDynamicRequirements { - matches, score, reason, err = node.DynamicPodRequirementsMet(priority, req) + matches, score, reason, err = schedulerobjects.DynamicPodRequirementsMet(node.AllocatableByPriority[priority], req) } else { - matches, score, reason, err = node.PodRequirementsMet(priority, req) + matches, 
score, reason, err = schedulerobjects.PodRequirementsMet(node.Taints, node.Labels, node.TotalResources, node.AllocatableByPriority[priority], req) } if err != nil { return nil, err - } else if matches { + } + + if matches { if selectedNode == nil || score > selectedNodeScore { selectedNode = node selectedNodeScore = score @@ -500,201 +653,173 @@ func (nodeDb *NodeDb) selectNodeForPodWithIt( s := nodeDb.stringFromPodRequirementsNotMetReason(reason) pctx.NumExcludedNodesByReason[s] += 1 } - if selectedNode != nil { - numConsideredNodes++ - if numConsideredNodes == nodeDb.maxExtraNodesToConsider+1 { - break - } - } } - pctx.Node = selectedNode - pctx.Score = selectedNodeScore + + if selectedNode != nil { + pctx.NodeId = selectedNode.Id + pctx.Score = selectedNodeScore + pctx.ScheduledAtPriority = priority + } return selectedNode, nil } -// BindPodToNode returns a copy of node with req bound to it. -func BindPodToNode(req *schedulerobjects.PodRequirements, node *schedulerobjects.Node) (*schedulerobjects.Node, error) { - jobId, err := JobIdFromPodRequirements(req) - if err != nil { - return nil, err - } - queue, err := QueueFromPodRequirements(req) - if err != nil { +// bindJobToNode returns a copy of node with job bound to it. +func bindJobToNode(priorityClasses map[string]configuration.PriorityClass, job interfaces.LegacySchedulerJob, node *Node) (*Node, error) { + node = node.UnsafeCopy() + if err := bindJobToNodeInPlace(priorityClasses, job, node); err != nil { return nil, err } - _, isEvicted := node.EvictedJobRunIds[jobId] + return node, nil +} - node = node.DeepCopy() +// bindJobToNodeInPlace is like bindJobToNode, but doesn't make a copy of node. +func bindJobToNodeInPlace(priorityClasses map[string]configuration.PriorityClass, job interfaces.LegacySchedulerJob, node *Node) error { + jobId := job.GetId() + requests := job.GetResourceRequirements().Requests + + _, isEvicted := node.EvictedJobRunIds[jobId] + delete(node.EvictedJobRunIds, jobId) if !isEvicted { if node.AllocatedByJobId == nil { node.AllocatedByJobId = make(map[string]schedulerobjects.ResourceList) } if allocatedToJob, ok := node.AllocatedByJobId[jobId]; ok { - return nil, errors.Errorf("job %s already has resources allocated on node %s", jobId, node.Id) + return errors.Errorf("job %s already has resources allocated on node %s", jobId, node.Id) } else { - allocatedToJob.AddV1ResourceList(req.ResourceRequirements.Requests) + allocatedToJob.AddV1ResourceList(requests) node.AllocatedByJobId[jobId] = allocatedToJob } + if node.AllocatedByQueue == nil { node.AllocatedByQueue = make(map[string]schedulerobjects.ResourceList) } + queue := job.GetQueue() allocatedToQueue := node.AllocatedByQueue[queue] - allocatedToQueue.AddV1ResourceList(req.ResourceRequirements.Requests) + allocatedToQueue.AddV1ResourceList(requests) node.AllocatedByQueue[queue] = allocatedToQueue } - delete(node.EvictedJobRunIds, jobId) + allocatable := node.AllocatableByPriority + priority := priorityClasses[job.GetPriorityClassName()].Priority + allocatable.MarkAllocatedV1ResourceList(priority, requests) if isEvicted { - schedulerobjects.AllocatableByPriorityAndResourceType( - node.AllocatableByPriorityAndResource, - ).MarkAllocatableV1ResourceList(evictedPriority, req.ResourceRequirements.Requests) + allocatable.MarkAllocatableV1ResourceList(evictedPriority, requests) } - schedulerobjects.AllocatableByPriorityAndResourceType( - node.AllocatableByPriorityAndResource, - ).MarkAllocatedV1ResourceList(req.Priority, req.ResourceRequirements.Requests) - return node, 
nil -} -// EvictPodFromNode returns a copy of node with req evicted from it. Specifically: -// - The job is marked as evicted on the node. -// - AllocatedByJobId and AllocatedByQueue are not updated. -// - Resources requested by the evicted pod are marked as allocated at priority evictedPriority. -func EvictPodFromNode(req *schedulerobjects.PodRequirements, node *schedulerobjects.Node) (*schedulerobjects.Node, error) { - jobId, err := JobIdFromPodRequirements(req) - if err != nil { - return nil, err - } - queue, err := QueueFromPodRequirements(req) - if err != nil { - return nil, err - } - node = node.DeepCopy() + return nil +} - // Ensure we track allocated resources at evictedPriority. - if _, ok := node.AllocatableByPriorityAndResource[evictedPriority]; !ok { - pMin := int32(math.MaxInt32) - ok := false - for p := range node.AllocatableByPriorityAndResource { - if p < pMin { - pMin = p - ok = true - } - } - if ok { - node.AllocatableByPriorityAndResource[evictedPriority] = node.AllocatableByPriorityAndResource[pMin].DeepCopy() +// EvictJobsFromNode returns a copy of node with all elements of jobs for which jobFilter returns +// true evicted from it, together with a slice containing exactly those jobs. +// +// Specifically: +// +// - The jobs that jobFilter returns true for are marked as evicted on the node. +// - Within AllocatableByPriorityAndResource, the resources allocated to these jobs are moved from +// the jobs' priorities to evictedPriority; they are not subtracted from AllocatedByJobId and +// AllocatedByQueue. +func EvictJobsFromNode( + priorityClasses map[string]configuration.PriorityClass, + jobFilter func(interfaces.LegacySchedulerJob) bool, + jobs []interfaces.LegacySchedulerJob, + node *Node, +) ([]interfaces.LegacySchedulerJob, *Node, error) { + evicted := make([]interfaces.LegacySchedulerJob, 0) + node = node.UnsafeCopy() + for _, job := range jobs { + if jobFilter != nil && !jobFilter(job) { + continue + } + evicted = append(evicted, job) + if err := evictJobFromNodeInPlace(priorityClasses, job, node); err != nil { + return nil, nil, err } } + return evicted, node, nil +} +// evictJobFromNodeInPlace is the in-place operation backing EvictJobsFromNode. +func evictJobFromNodeInPlace(priorityClasses map[string]configuration.PriorityClass, job interfaces.LegacySchedulerJob, node *Node) error { + jobId := job.GetId() if _, ok := node.AllocatedByJobId[jobId]; !ok { - return nil, errors.Errorf("job %s has no resources allocated on node %s", jobId, node.Id) + return errors.Errorf("job %s has no resources allocated on node %s", jobId, node.Id) } + + queue := job.GetQueue() if _, ok := node.AllocatedByQueue[queue]; !ok { - return nil, errors.Errorf("queue %s has no resources allocated on node %s", queue, node.Id) + return errors.Errorf("queue %s has no resources allocated on node %s", queue, node.Id) } + if node.EvictedJobRunIds == nil { node.EvictedJobRunIds = make(map[string]bool) } if _, ok := node.EvictedJobRunIds[jobId]; ok { - // TODO: We're using run ids instead of job ids for now. 
- return nil, errors.Errorf("job %s is already evicted from node %s", jobId, node.Id) - } else { - node.EvictedJobRunIds[jobId] = true + return errors.Errorf("job %s is already evicted from node %s", jobId, node.Id) } + node.EvictedJobRunIds[jobId] = true - schedulerobjects.AllocatableByPriorityAndResourceType( - node.AllocatableByPriorityAndResource, - ).MarkAllocatableV1ResourceList(req.Priority, req.ResourceRequirements.Requests) - schedulerobjects.AllocatableByPriorityAndResourceType( - node.AllocatableByPriorityAndResource, - ).MarkAllocatedV1ResourceList(evictedPriority, req.ResourceRequirements.Requests) - return node, nil + allocatable := node.AllocatableByPriority + priority := priorityClasses[job.GetPriorityClassName()].Priority + requests := job.GetResourceRequirements().Requests + allocatable.MarkAllocatableV1ResourceList(priority, requests) + allocatable.MarkAllocatedV1ResourceList(evictedPriority, requests) + + return nil } -// UnbindPodsFromNode returns a node with all reqs unbound from it. -func UnbindPodsFromNode(reqs []*schedulerobjects.PodRequirements, node *schedulerobjects.Node) (*schedulerobjects.Node, error) { - node = node.DeepCopy() - for _, req := range reqs { - if err := unbindPodFromNodeInPlace(req, node); err != nil { +// UnbindJobsFromNode returns a node with all elements of jobs unbound from it. +func UnbindJobsFromNode(priorityClasses map[string]configuration.PriorityClass, jobs []interfaces.LegacySchedulerJob, node *Node) (*Node, error) { + node = node.UnsafeCopy() + for _, job := range jobs { + if err := unbindJobFromNodeInPlace(priorityClasses, job, node); err != nil { return nil, err } } return node, nil } -// UnbindPodFromNode returns a copy of node with req unbound from it. -func UnbindPodFromNode(req *schedulerobjects.PodRequirements, node *schedulerobjects.Node) (*schedulerobjects.Node, error) { - node = node.DeepCopy() - if err := unbindPodFromNodeInPlace(req, node); err != nil { +// UnbindJobFromNode returns a copy of node with job unbound from it. +func UnbindJobFromNode(priorityClasses map[string]configuration.PriorityClass, job interfaces.LegacySchedulerJob, node *Node) (*Node, error) { + node = node.UnsafeCopy() + if err := unbindJobFromNodeInPlace(priorityClasses, job, node); err != nil { return nil, err } return node, nil } -// unbindPodFromNodeInPlace is like UnbindPodFromNode, but doesn't make a copy of the node. -func unbindPodFromNodeInPlace(req *schedulerobjects.PodRequirements, node *schedulerobjects.Node) error { - jobId, err := JobIdFromPodRequirements(req) - if err != nil { - return err - } - queue, err := QueueFromPodRequirements(req) - if err != nil { - return err - } +// unbindPodFromNodeInPlace is like UnbindJobFromNode, but doesn't make a copy of node. 
+func unbindJobFromNodeInPlace(priorityClasses map[string]configuration.PriorityClass, job interfaces.LegacySchedulerJob, node *Node) error { + jobId := job.GetId() + requests := job.GetResourceRequirements().Requests + _, isEvicted := node.EvictedJobRunIds[jobId] + delete(node.EvictedJobRunIds, jobId) if _, ok := node.AllocatedByJobId[jobId]; !ok { return errors.Errorf("job %s has no resources allocated on node %s", jobId, node.Id) } else { delete(node.AllocatedByJobId, jobId) } + + queue := job.GetQueue() if allocatedToQueue, ok := node.AllocatedByQueue[queue]; !ok { return errors.Errorf("queue %s has no resources allocated on node %s", queue, node.Id) } else { - allocatedToQueue.SubV1ResourceList(req.ResourceRequirements.Requests) - if allocatedToQueue.Equal(schedulerobjects.ResourceList{}) { + allocatedToQueue.SubV1ResourceList(requests) + if allocatedToQueue.IsZero() { delete(node.AllocatedByQueue, queue) - } else { - node.AllocatedByQueue[queue] = allocatedToQueue } } - delete(node.EvictedJobRunIds, jobId) - priority := req.Priority + allocatable := node.AllocatableByPriority + priority := priorityClasses[job.GetPriorityClassName()].Priority if isEvicted { priority = evictedPriority } - schedulerobjects.AllocatableByPriorityAndResourceType( - node.AllocatableByPriorityAndResource, - ).MarkAllocatableV1ResourceList(priority, req.ResourceRequirements.Requests) - return nil -} + allocatable.MarkAllocatableV1ResourceList(priority, requests) -func JobIdFromPodRequirements(req *schedulerobjects.PodRequirements) (string, error) { - return valueFromPodRequirements(req, schedulerconfig.JobIdAnnotation) -} - -func QueueFromPodRequirements(req *schedulerobjects.PodRequirements) (string, error) { - return valueFromPodRequirements(req, schedulerconfig.QueueAnnotation) -} - -func valueFromPodRequirements(req *schedulerobjects.PodRequirements, key string) (string, error) { - v, ok := req.Annotations[key] - if !ok { - return "", errors.WithStack(&armadaerrors.ErrInvalidArgument{ - Name: "req.Annotations", - Value: req.Annotations, - Message: fmt.Sprintf("%s annotation missing", key), - }) - } - if v == "" { - return "", errors.WithStack(&armadaerrors.ErrInvalidArgument{ - Name: key, - Value: v, - Message: fmt.Sprintf("value of %s is empty", key), - }) - } - return v, nil + return nil } // NodeTypesMatchingPod returns a slice with all node types a pod could be scheduled on. 
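A condensed sketch of the bind/evict/unbind accounting lifecycle these helpers implement, written in the style of the package's own tests (it has to live inside package nodedb, since bindJobToNode is unexported, and assumes the package's existing configuration and interfaces imports). The exampleLifecycle function is hypothetical.

func exampleLifecycle(
    priorityClasses map[string]configuration.PriorityClass,
    job interfaces.LegacySchedulerJob,
    node *Node,
) error {
    // Bind: the job's requests are recorded under AllocatedByJobId/AllocatedByQueue
    // and marked allocated at the job's priority-class priority.
    bound, err := bindJobToNode(priorityClasses, job, node)
    if err != nil {
        return err
    }

    // Evict: the job is recorded in EvictedJobRunIds and its requests are moved to
    // evictedPriority; AllocatedByJobId/AllocatedByQueue are left untouched.
    keepAll := func(interfaces.LegacySchedulerJob) bool { return true }
    _, evicted, err := EvictJobsFromNode(priorityClasses, keepAll, []interfaces.LegacySchedulerJob{job}, bound)
    if err != nil {
        return err
    }

    // Unbind: all accounting for the job is removed again, so the result should
    // match the accounting of the node we started from (as asserted in the package tests).
    _, err = UnbindJobFromNode(priorityClasses, job, evicted)
    return err
}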
@@ -719,7 +844,7 @@ func (nodeDb *NodeDb) NodeTypesMatchingPod(req *schedulerobjects.PodRequirements return selectedNodeTypes, numExcludedNodesByReason, nil } -func (nodeDb *NodeDb) UpsertMany(nodes []*schedulerobjects.Node) error { +func (nodeDb *NodeDb) UpsertMany(nodes []*Node) error { txn := nodeDb.db.Txn(true) defer txn.Abort() if err := nodeDb.UpsertManyWithTxn(txn, nodes); err != nil { @@ -729,7 +854,7 @@ func (nodeDb *NodeDb) UpsertMany(nodes []*schedulerobjects.Node) error { return nil } -func (nodeDb *NodeDb) UpsertManyWithTxn(txn *memdb.Txn, nodes []*schedulerobjects.Node) error { +func (nodeDb *NodeDb) UpsertManyWithTxn(txn *memdb.Txn, nodes []*Node) error { for _, node := range nodes { if err := nodeDb.UpsertWithTxn(txn, node); err != nil { return err @@ -738,7 +863,7 @@ func (nodeDb *NodeDb) UpsertManyWithTxn(txn *memdb.Txn, nodes []*schedulerobject return nil } -func (nodeDb *NodeDb) Upsert(node *schedulerobjects.Node) error { +func (nodeDb *NodeDb) Upsert(node *Node) error { txn := nodeDb.Txn(true) defer txn.Abort() if err := nodeDb.UpsertWithTxn(txn, node); err != nil { @@ -748,91 +873,16 @@ func (nodeDb *NodeDb) Upsert(node *schedulerobjects.Node) error { return nil } -func (nodeDb *NodeDb) UpsertWithTxn(txn *memdb.Txn, node *schedulerobjects.Node) error { - if len(node.AllocatableByPriorityAndResource) == 0 { - return errors.Errorf("can't upsert node with AllocatableByPriorityAndResource: %v", node.AllocatableByPriorityAndResource) - } - - // Mutating the node once inserted is forbidden. - // TODO: We shouldn't need a copy here. - node = node.DeepCopy() - - // Add an evictedPriority record to the node. - // TODO: We should make NodeDb responsible for creating new nodes and add this record at creation instead of upsert. - if len(node.EvictedJobRunIds) != 0 { - q := schedulerobjects.AllocatableByPriorityAndResourceType(node.AllocatableByPriorityAndResource).Get(evictedPriority, "cpu") - if q.Cmp(node.TotalResources.Get("cpu")) == 0 { - return errors.Errorf("inconsistent node accounting: node %s has evicted jobs but no evicted resources", node.Id) - } - } - - // Ensure we track allocated resources at evictedPriority. - if _, ok := node.AllocatableByPriorityAndResource[evictedPriority]; !ok { - pMin := int32(math.MaxInt32) - ok := false - for p := range node.AllocatableByPriorityAndResource { - if p < pMin { - pMin = p - ok = true - } - } - if ok { - node.AllocatableByPriorityAndResource[evictedPriority] = node.AllocatableByPriorityAndResource[pMin].DeepCopy() - } - } - - // Make sure nodes have a label containing the nodeId. - if node.Labels == nil { - node.Labels = map[string]string{schedulerconfig.NodeIdLabel: node.Id} - } else { - node.Labels[schedulerconfig.NodeIdLabel] = node.Id - } - - // Add a special taint to unschedulable nodes before inserting. - // Adding a corresponding toleration to evicted pods ensures they can be re-scheduled. - // To prevent scheduling new pods onto cordoned nodes, only evicted pods should have this toleration. - if node.Unschedulable { - node.Taints = append(node.Taints, UnschedulableTaint()) - } - - // Compute the node type of the node. - nodeType := schedulerobjects.NewNodeType( - node.GetTaints(), - node.GetLabels(), - nodeDb.indexedTaints, - nodeDb.indexedNodeLabels, - ) - node.NodeTypeId = nodeType.Id - node.NodeType = nodeType - - // Compute the keys necessary to efficiently iterate over nodes. 
- node.NodeDbKeys = make([][]byte, len(nodeDb.prioritiesToTryAssigningAt)) +func (nodeDb *NodeDb) UpsertWithTxn(txn *memdb.Txn, node *Node) error { + keys := make([][]byte, len(nodeDb.prioritiesToTryAssigningAt)) for i, p := range nodeDb.prioritiesToTryAssigningAt { - node.NodeDbKeys[i] = nodeDb.nodeDbKeyFromNode(node.NodeDbKeys[i], node, p) + keys[i] = nodeDb.nodeDbKey(keys[i], node.NodeTypeId, node.AllocatableByPriority[p]) } + node.Keys = keys - // Add the node to the db. - isNewNode := false - if existingNode, err := nodeDb.GetNodeWithTxn(txn, node.Id); err != nil { - return err - } else if existingNode == nil { - isNewNode = true - } if err := txn.Insert("nodes", node); err != nil { return errors.WithStack(err) } - - // If this is a new node, update overall statistics. - // Note that these are not rolled back on txn abort. - nodeDb.mu.Lock() - if isNewNode { - nodeDb.numNodes++ - nodeDb.numNodesByNodeType[nodeType.Id]++ - nodeDb.totalResources.Add(node.TotalResources) - } - nodeDb.nodeTypes[nodeType.Id] = nodeType - nodeDb.mu.Unlock() - return nil } @@ -844,10 +894,10 @@ func (nodeDb *NodeDb) ClearAllocated() error { if err != nil { return err } - newNodes := make([]*schedulerobjects.Node, 0) + newNodes := make([]*Node, 0) for node := it.NextNode(); node != nil; node = it.NextNode() { - node = node.DeepCopy() - node.AllocatableByPriorityAndResource = schedulerobjects.NewAllocatableByPriorityAndResourceType( + node = node.UnsafeCopy() + node.AllocatableByPriority = schedulerobjects.NewAllocatableByPriorityAndResourceType( nodeDb.prioritiesToTryAssigningAt, node.TotalResources, ) @@ -895,6 +945,8 @@ func nodeIndexName(keyIndex int) string { // using a cache to avoid allocating new strings when possible. func (nodeDb *NodeDb) stringFromPodRequirementsNotMetReason(reason schedulerobjects.PodRequirementsNotMetReason) string { h := reason.Sum64() + nodeDb.mu.Lock() + defer nodeDb.mu.Unlock() if s, ok := nodeDb.podRequirementsNotMetReasonStringCache[h]; ok { return s } else { @@ -904,15 +956,15 @@ func (nodeDb *NodeDb) stringFromPodRequirementsNotMetReason(reason schedulerobje } } -// nodeDbKeyFromNode returns the index key for a particular node and resource. +// nodeDbKey returns the index key for a particular node. // Allocatable resources are rounded down to the closest multiple of nodeDb.indexedResourceResolutionMillis. // This improves efficiency by reducing the number of distinct values in the index. 
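A small worked example of this rounding, using hypothetical numbers:

package main

import "fmt"

func main() {
    // With a 500 milli-CPU resolution, 7.9 CPU and 7.6 CPU allocatable both round
    // down to 7.5 CPU and therefore produce the same index key component.
    resolutionMilli := int64(500)
    for _, allocatableMilli := range []int64{7900, 7600} {
        rounded := (allocatableMilli / resolutionMilli) * resolutionMilli
        fmt.Println(allocatableMilli, "->", rounded) // 7900 -> 7500, 7600 -> 7500
    }
}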
-func (nodeDb *NodeDb) nodeDbKeyFromNode(out []byte, node *schedulerobjects.Node, priority int32) []byte { +func (nodeDb *NodeDb) nodeDbKey(out []byte, nodeTypeId uint64, allocatable schedulerobjects.ResourceList) []byte { return RoundedNodeIndexKeyFromResourceList( out, - node.NodeTypeId, + nodeTypeId, nodeDb.indexedResources, nodeDb.indexedResourceResolutionMillis, - node.AllocatableByPriorityAndResource[priority], + allocatable, ) } diff --git a/internal/scheduler/nodedb/nodedb_test.go b/internal/scheduler/nodedb/nodedb_test.go index c62e0abfe6a..78e0198e9d8 100644 --- a/internal/scheduler/nodedb/nodedb_test.go +++ b/internal/scheduler/nodedb/nodedb_test.go @@ -12,6 +12,9 @@ import ( armadamaps "github.com/armadaproject/armada/internal/common/maps" schedulerconfig "github.com/armadaproject/armada/internal/scheduler/configuration" + schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" + "github.com/armadaproject/armada/internal/scheduler/interfaces" + "github.com/armadaproject/armada/internal/scheduler/jobdb" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" "github.com/armadaproject/armada/internal/scheduler/testfixtures" ) @@ -23,60 +26,65 @@ func TestNodeDbSchema(t *testing.T) { // Test the accounting of total resources across all nodes. func TestTotalResources(t *testing.T) { - nodeDb, err := createNodeDb([]*schedulerobjects.Node{}) - if !assert.NoError(t, err) { - return - } + nodeDb, err := newNodeDbWithNodes([]*schedulerobjects.Node{}) + require.NoError(t, err) + expected := schedulerobjects.ResourceList{Resources: make(map[string]resource.Quantity)} - assert.True(t, expected.Equal(nodeDb.totalResources)) + assert.True(t, expected.Equal(nodeDb.TotalResources())) // Upserting nodes for the first time should increase the resource count. nodes := testfixtures.N32CpuNodes(2, testfixtures.TestPriorities) for _, node := range nodes { expected.Add(node.TotalResources) } - err = nodeDb.UpsertMany(nodes) - if !assert.NoError(t, err) { - return + txn := nodeDb.Txn(true) + for _, node := range nodes { + err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, nil, node) + require.NoError(t, err) } - assert.True(t, expected.Equal(nodeDb.totalResources)) + txn.Commit() - // Upserting the same nodes again should not affect total resource count. - err = nodeDb.UpsertMany(nodes) - if !assert.NoError(t, err) { - return - } - assert.True(t, expected.Equal(nodeDb.totalResources)) + assert.True(t, expected.Equal(nodeDb.TotalResources())) // Upserting new nodes should increase the resource count. 
nodes = testfixtures.N8GpuNodes(3, testfixtures.TestPriorities) for _, node := range nodes { expected.Add(node.TotalResources) } - err = nodeDb.UpsertMany(nodes) - if !assert.NoError(t, err) { - return + txn = nodeDb.Txn(true) + for _, node := range nodes { + err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, nil, node) + require.NoError(t, err) } - assert.True(t, expected.Equal(nodeDb.totalResources)) + txn.Commit() + + assert.True(t, expected.Equal(nodeDb.TotalResources())) } func TestSelectNodeForPod_NodeIdLabel_Success(t *testing.T) { nodes := testfixtures.N32CpuNodes(2, testfixtures.TestPriorities) nodeId := nodes[1].Id require.NotEmpty(t, nodeId) - db, err := createNodeDb(nodes) + db, err := newNodeDbWithNodes(nodes) require.NoError(t, err) - reqs := testfixtures.WithNodeSelectorPodReqs( + jobs := testfixtures.WithNodeSelectorJobs( map[string]string{schedulerconfig.NodeIdLabel: nodeId}, - testfixtures.N1CpuPodReqs("A", 0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), ) - for _, req := range reqs { - pctx, err := db.SelectNodeForPod(req) + jctxs := schedulercontext.JobSchedulingContextsFromJobs(testfixtures.TestPriorityClasses, jobs) + for _, jctx := range jctxs { + txn := db.Txn(false) + node, err := db.SelectNodeForJobWithTxn(txn, jctx) + txn.Abort() if !assert.NoError(t, err) { continue } - require.NotNil(t, pctx.Node) - assert.Equal(t, nodeId, pctx.Node.Id) + pctx := jctx.PodSchedulingContext + require.NotNil(t, node) + assert.Equal(t, nodeId, node.Id) + + require.NotNil(t, pctx) + assert.Equal(t, nodeId, pctx.NodeId) assert.Equal(t, 0, len(pctx.NumExcludedNodesByReason)) assert.Empty(t, pctx.NumExcludedNodesByReason) } @@ -86,61 +94,74 @@ func TestSelectNodeForPod_NodeIdLabel_Failure(t *testing.T) { nodes := testfixtures.N32CpuNodes(1, testfixtures.TestPriorities) nodeId := nodes[0].Id require.NotEmpty(t, nodeId) - db, err := createNodeDb(nodes) + db, err := newNodeDbWithNodes(nodes) require.NoError(t, err) - reqs := testfixtures.WithNodeSelectorPodReqs( + jobs := testfixtures.WithNodeSelectorJobs( map[string]string{schedulerconfig.NodeIdLabel: "this node does not exist"}, - testfixtures.N1CpuPodReqs("A", 0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), ) - for _, req := range reqs { - pctx, err := db.SelectNodeForPod(req) + jctxs := schedulercontext.JobSchedulingContextsFromJobs(testfixtures.TestPriorityClasses, jobs) + for _, jctx := range jctxs { + txn := db.Txn(false) + node, err := db.SelectNodeForJobWithTxn(txn, jctx) + txn.Abort() if !assert.NoError(t, err) { continue } - assert.Nil(t, pctx.Node) + assert.Nil(t, node) + + pctx := jctx.PodSchedulingContext + require.NotNil(t, pctx) + assert.Equal(t, "", pctx.NodeId) assert.Equal(t, 1, len(pctx.NumExcludedNodesByReason)) } } func TestNodeBindingEvictionUnbinding(t *testing.T) { - node := testfixtures.Test8GpuNode(append(testfixtures.TestPriorities, evictedPriority)) - req := testfixtures.N1GpuPodReqs("A", 0, 1)[0] - request := schedulerobjects.ResourceListFromV1ResourceList(req.ResourceRequirements.Requests) - jobId, err := JobIdFromPodRequirements(req) + node := testfixtures.Test8GpuNode(testfixtures.TestPriorities) + nodeDb, err := newNodeDbWithNodes([]*schedulerobjects.Node{node}) require.NoError(t, err) + entry, err := nodeDb.GetNode(node.Id) + require.NoError(t, err) + + jobFilter := func(job interfaces.LegacySchedulerJob) bool { return true } + job := testfixtures.Test1GpuJob("A", testfixtures.PriorityClass0) + request := 
schedulerobjects.ResourceListFromV1ResourceList(job.GetResourceRequirements().Requests) + jobId := job.GetId() - boundNode, err := BindPodToNode(req, node) + boundNode, err := bindJobToNode(testfixtures.TestPriorityClasses, job, entry) require.NoError(t, err) - unboundNode, err := UnbindPodFromNode(req, boundNode) + unboundNode, err := UnbindJobFromNode(testfixtures.TestPriorityClasses, job, boundNode) require.NoError(t, err) - unboundMultipleNode, err := UnbindPodsFromNode([]*schedulerobjects.PodRequirements{req}, boundNode) + unboundMultipleNode, err := UnbindJobsFromNode(testfixtures.TestPriorityClasses, []interfaces.LegacySchedulerJob{job}, boundNode) require.NoError(t, err) - evictedNode, err := EvictPodFromNode(req, boundNode) + evictedJobs, evictedNode, err := EvictJobsFromNode(testfixtures.TestPriorityClasses, jobFilter, []interfaces.LegacySchedulerJob{job}, boundNode) require.NoError(t, err) + assert.Equal(t, []interfaces.LegacySchedulerJob{job}, evictedJobs) - evictedUnboundNode, err := UnbindPodFromNode(req, evictedNode) + evictedUnboundNode, err := UnbindJobFromNode(testfixtures.TestPriorityClasses, job, evictedNode) require.NoError(t, err) - evictedBoundNode, err := BindPodToNode(req, evictedNode) + evictedBoundNode, err := bindJobToNode(testfixtures.TestPriorityClasses, job, evictedNode) require.NoError(t, err) - _, err = EvictPodFromNode(req, node) + _, _, err = EvictJobsFromNode(testfixtures.TestPriorityClasses, jobFilter, []interfaces.LegacySchedulerJob{job}, entry) require.Error(t, err) - _, err = UnbindPodFromNode(req, node) + _, err = UnbindJobFromNode(testfixtures.TestPriorityClasses, job, entry) require.Error(t, err) - _, err = BindPodToNode(req, boundNode) + _, err = bindJobToNode(testfixtures.TestPriorityClasses, job, boundNode) require.Error(t, err) - _, err = EvictPodFromNode(req, evictedNode) + _, _, err = EvictJobsFromNode(testfixtures.TestPriorityClasses, jobFilter, []interfaces.LegacySchedulerJob{job}, evictedNode) require.Error(t, err) - assertNodeAccountingEqual(t, node, unboundNode) - assertNodeAccountingEqual(t, node, evictedUnboundNode) + assertNodeAccountingEqual(t, entry, unboundNode) + assertNodeAccountingEqual(t, entry, evictedUnboundNode) assertNodeAccountingEqual(t, unboundNode, evictedUnboundNode) assertNodeAccountingEqual(t, boundNode, evictedBoundNode) assertNodeAccountingEqual(t, unboundNode, unboundMultipleNode) @@ -177,29 +198,25 @@ func TestNodeBindingEvictionUnbinding(t *testing.T) { expectedAllocatable := boundNode.TotalResources.DeepCopy() expectedAllocatable.Sub(request) - assert.True(t, expectedAllocatable.Equal(boundNode.AllocatableByPriorityAndResource[req.Priority])) + priority := testfixtures.TestPriorityClasses[job.GetPriorityClassName()].Priority + assert.True(t, expectedAllocatable.Equal(boundNode.AllocatableByPriority[priority])) assert.Empty(t, unboundNode.AllocatedByJobId) assert.Empty(t, unboundNode.AllocatedByQueue) assert.Empty(t, unboundNode.EvictedJobRunIds) } -func assertNodeAccountingEqual(t *testing.T, node1, node2 *schedulerobjects.Node) bool { - rv := true - rv = rv && assert.True( +func assertNodeAccountingEqual(t *testing.T, node1, node2 *Node) { + allocatable1 := schedulerobjects.QuantityByTAndResourceType[int32](node1.AllocatableByPriority) + allocatable2 := schedulerobjects.QuantityByTAndResourceType[int32](node2.AllocatableByPriority) + assert.True( t, - schedulerobjects.QuantityByPriorityAndResourceType( - node1.AllocatableByPriorityAndResource, - ).Equal( - schedulerobjects.QuantityByPriorityAndResourceType( - 
node2.AllocatableByPriorityAndResource, - ), - ), + allocatable1.Equal(allocatable2), "expected %v, but got %v", - node1.AllocatableByPriorityAndResource, - node2.AllocatableByPriorityAndResource, + node1.AllocatableByPriority, + node2.AllocatableByPriority, ) - rv = rv && assert.True( + assert.True( t, armadamaps.DeepEqual( node1.AllocatedByJobId, @@ -209,7 +226,7 @@ func assertNodeAccountingEqual(t *testing.T, node1, node2 *schedulerobjects.Node node1.AllocatedByJobId, node2.AllocatedByJobId, ) - rv = rv && assert.True( + assert.True( t, armadamaps.DeepEqual( node1.AllocatedByQueue, @@ -219,7 +236,7 @@ func assertNodeAccountingEqual(t *testing.T, node1, node2 *schedulerobjects.Node node1.AllocatedByQueue, node2.AllocatedByQueue, ) - rv = rv && assert.True( + assert.True( t, maps.Equal( node1.EvictedJobRunIds, @@ -229,60 +246,118 @@ func assertNodeAccountingEqual(t *testing.T, node1, node2 *schedulerobjects.Node node1.EvictedJobRunIds, node2.EvictedJobRunIds, ) - rv = rv && assert.True( - t, - armadamaps.DeepEqual( - node1.NonArmadaAllocatedResources, - node2.NonArmadaAllocatedResources, - ), - "expected %v, but got %v", - node1.NonArmadaAllocatedResources, - node2.NonArmadaAllocatedResources, - ) - return rv } -func TestSelectAndBindNodeToPod(t *testing.T) { +func TestEviction(t *testing.T) { + tests := map[string]struct { + jobFilter func(interfaces.LegacySchedulerJob) bool + expectedEvictions []int32 + }{ + "jobFilter always returns false": { + jobFilter: func(_ interfaces.LegacySchedulerJob) bool { return false }, + expectedEvictions: []int32{}, + }, + "jobFilter always returns true": { + jobFilter: func(_ interfaces.LegacySchedulerJob) bool { return true }, + expectedEvictions: []int32{0, 1}, + }, + "jobFilter returns true for preemptible jobs": { + jobFilter: func(job interfaces.LegacySchedulerJob) bool { + priorityClassName := job.GetPriorityClassName() + priorityClass := testfixtures.TestPriorityClasses[priorityClassName] + return priorityClass.Preemptible + }, + expectedEvictions: []int32{0}, + }, + "jobFilter nil": { + jobFilter: nil, + expectedEvictions: []int32{0, 1}, + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + nodeDb, err := newNodeDbWithNodes([]*schedulerobjects.Node{}) + require.NoError(t, err) + txn := nodeDb.Txn(true) + jobs := []*jobdb.Job{ + testfixtures.Test1Cpu4GiJob("queue-alice", testfixtures.PriorityClass0), + testfixtures.Test1Cpu4GiJob("queue-alice", testfixtures.PriorityClass3), + } + node := testfixtures.Test32CpuNode(testfixtures.TestPriorities) + err = nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, jobs, node) + txn.Commit() + require.NoError(t, err) + entry, err := nodeDb.GetNode(node.Id) + require.NoError(t, err) + + existingJobs := make([]interfaces.LegacySchedulerJob, len(jobs)) + for i, job := range jobs { + existingJobs[i] = job + } + actualEvictions, _, err := EvictJobsFromNode(testfixtures.TestPriorityClasses, tc.jobFilter, existingJobs, entry) + require.NoError(t, err) + expectedEvictions := make([]interfaces.LegacySchedulerJob, 0, len(tc.expectedEvictions)) + for _, i := range tc.expectedEvictions { + expectedEvictions = append(expectedEvictions, jobs[i]) + } + assert.Equal(t, expectedEvictions, actualEvictions) + }) + } +} + +func TestScheduleIndividually(t *testing.T) { tests := map[string]struct { Nodes []*schedulerobjects.Node - Reqs []*schedulerobjects.PodRequirements + Jobs []*jobdb.Job ExpectSuccess []bool }{ "all jobs fit": { Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Reqs: 
testfixtures.N1CpuPodReqs("A", 0, 32), + Jobs: testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), ExpectSuccess: testfixtures.Repeat(true, 32), }, "not all jobs fit": { Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Reqs: testfixtures.N1CpuPodReqs("A", 0, 33), + Jobs: testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 33), ExpectSuccess: append(testfixtures.Repeat(true, 32), testfixtures.Repeat(false, 1)...), }, "unavailable resource": { Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Reqs: testfixtures.N1GpuPodReqs("A", 0, 1), + Jobs: testfixtures.N1GpuJobs("A", testfixtures.PriorityClass0, 1), ExpectSuccess: testfixtures.Repeat(false, 1), }, "unsupported resource": { Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Reqs: testfixtures.WithRequestsPodReqs( + Jobs: testfixtures.WithRequestsJobs( schedulerobjects.ResourceList{ Resources: map[string]resource.Quantity{ "gibberish": resource.MustParse("1"), }, }, - testfixtures.N1CpuPodReqs("A", 0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), ), ExpectSuccess: testfixtures.Repeat(false, 1), }, "preemption": { - Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Reqs: append(append(testfixtures.N1CpuPodReqs("A", 0, 32), testfixtures.N1CpuPodReqs("A", 1, 32)...), testfixtures.N1CpuPodReqs("A", 0, 32)...), + Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + Jobs: append( + append( + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 32)..., + ), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32)..., + ), ExpectSuccess: append(testfixtures.Repeat(true, 64), testfixtures.Repeat(false, 32)...), }, "taints/tolerations": { - Nodes: testfixtures.NTainted32CpuNodes(1, testfixtures.TestPriorities), - Reqs: append(append(testfixtures.N1CpuPodReqs("A", 0, 1), testfixtures.N1GpuPodReqs("A", 0, 1)...), testfixtures.N32CpuPodReqs("A", 0, 1)...), + Nodes: testfixtures.NTainted32CpuNodes(1, testfixtures.TestPriorities), + Jobs: append( + append( + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1GpuJobs("A", testfixtures.PriorityClass0, 1)..., + ), + testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1)..., + ), ExpectSuccess: []bool{false, false, true}, }, "node selector": { @@ -295,11 +370,11 @@ func TestSelectAndBindNodeToPod(t *testing.T) { testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), )..., ), - Reqs: testfixtures.WithNodeSelectorPodReqs( + Jobs: testfixtures.WithNodeSelectorJobs( map[string]string{ "key": "value", }, - testfixtures.N1CpuPodReqs("A", 0, 33), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 33), ), ExpectSuccess: append(testfixtures.Repeat(true, 32), testfixtures.Repeat(false, 1)...), }, @@ -310,21 +385,21 @@ func TestSelectAndBindNodeToPod(t *testing.T) { }, testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), ), - Reqs: testfixtures.WithNodeSelectorPodReqs( + Jobs: testfixtures.WithNodeSelectorJobs( map[string]string{ "key": "this is the wrong value", }, - testfixtures.N1CpuPodReqs("A", 0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), ), ExpectSuccess: testfixtures.Repeat(false, 1), }, "node selector with missing label": { Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Reqs: testfixtures.WithNodeSelectorPodReqs( + Jobs: testfixtures.WithNodeSelectorJobs( map[string]string{ "this label does not 
exist": "value", }, - testfixtures.N1CpuPodReqs("A", 0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), ), ExpectSuccess: testfixtures.Repeat(false, 1), }, @@ -338,7 +413,7 @@ func TestSelectAndBindNodeToPod(t *testing.T) { testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), )..., ), - Reqs: testfixtures.WithNodeAffinityPodReqs( + Jobs: testfixtures.WithNodeAffinityJobs( []v1.NodeSelectorTerm{ { MatchExpressions: []v1.NodeSelectorRequirement{ @@ -350,30 +425,47 @@ func TestSelectAndBindNodeToPod(t *testing.T) { }, }, }, - testfixtures.N1CpuPodReqs("A", 0, 33), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 33), ), ExpectSuccess: append(testfixtures.Repeat(true, 32), testfixtures.Repeat(false, 1)...), }, } for name, tc := range tests { t.Run(name, func(t *testing.T) { - nodeDb, err := createNodeDb(tc.Nodes) + nodeDb, err := newNodeDbWithNodes(tc.Nodes) require.NoError(t, err) - for i, req := range tc.Reqs { - report, err := nodeDb.SelectAndBindNodeToPod(req) + + jctxs := schedulercontext.JobSchedulingContextsFromJobs(testfixtures.TestPriorityClasses, tc.Jobs) + + for i, jctx := range jctxs { + ok, err := nodeDb.ScheduleMany([]*schedulercontext.JobSchedulingContext{jctx}) require.NoError(t, err) + pctx := jctx.PodSchedulingContext + + if !tc.ExpectSuccess[i] { + assert.False(t, ok) + if pctx != nil { + assert.Equal(t, "", pctx.NodeId) + } + continue + } + + assert.True(t, ok) + require.NotNil(t, pctx) + + nodeId := pctx.NodeId if !tc.ExpectSuccess[i] { - assert.Nil(t, report.Node) + assert.Equal(t, "", nodeId) continue } - assert.NotNil(t, report.Node) + require.NotEqual(t, "", nodeId) - node, err := nodeDb.GetNode(report.Node.Id) - require.NoError(t, err) - jobId, err := JobIdFromPodRequirements(req) + job := jctx.Job + node, err := nodeDb.GetNode(nodeId) require.NoError(t, err) - expected := schedulerobjects.ResourceListFromV1ResourceList(req.ResourceRequirements.Requests) - actual, ok := node.AllocatedByJobId[jobId] + require.NotNil(t, node) + expected := schedulerobjects.ResourceListFromV1ResourceList(job.GetResourceRequirements().Requests) + actual, ok := node.AllocatedByJobId[job.GetId()] require.True(t, ok) assert.True(t, actual.Equal(expected)) } @@ -387,59 +479,56 @@ func TestScheduleMany(t *testing.T) { Nodes []*schedulerobjects.Node // Schedule one group of jobs at a time. // Each group is composed of a slice of pods. - Reqs [][]*schedulerobjects.PodRequirements + Jobs [][]*jobdb.Job // For each group, whether we expect scheduling to succeed. 
ExpectSuccess []bool }{ "simple success": { Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Reqs: [][]*schedulerobjects.PodRequirements{testfixtures.N1CpuPodReqs("A", 0, 32)}, + Jobs: [][]*jobdb.Job{testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32)}, ExpectSuccess: []bool{true}, }, "simple failure": { Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Reqs: [][]*schedulerobjects.PodRequirements{testfixtures.N1CpuPodReqs("A", 0, 33)}, + Jobs: [][]*jobdb.Job{testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 33)}, ExpectSuccess: []bool{false}, }, "correct rollback": { Nodes: testfixtures.N32CpuNodes(2, testfixtures.TestPriorities), - Reqs: [][]*schedulerobjects.PodRequirements{ - testfixtures.N1CpuPodReqs("A", 0, 32), - testfixtures.N1CpuPodReqs("A", 0, 33), - testfixtures.N1CpuPodReqs("A", 0, 32), + Jobs: [][]*jobdb.Job{ + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 33), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), }, ExpectSuccess: []bool{true, false, true}, }, "varying job size": { Nodes: testfixtures.N32CpuNodes(2, testfixtures.TestPriorities), - Reqs: [][]*schedulerobjects.PodRequirements{ - append(testfixtures.N32CpuPodReqs("A", 0, 1), testfixtures.N1CpuPodReqs("A", 0, 32)...), - testfixtures.N1CpuPodReqs("A", 0, 1), + Jobs: [][]*jobdb.Job{ + append( + testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32)..., + ), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), }, ExpectSuccess: []bool{true, false}, }, } for name, tc := range tests { t.Run(name, func(t *testing.T) { - nodeDb, err := createNodeDb(tc.Nodes) - if !assert.NoError(t, err) { - return - } - for i, reqs := range tc.Reqs { - reports, ok, err := nodeDb.ScheduleMany(reqs) - if !assert.NoError(t, err) { - return - } - if tc.ExpectSuccess[i] { - assert.Equal(t, len(reqs), len(reports)) - for _, report := range reports { - if !assert.NotNil(t, report.Node) { - return - } + nodeDb, err := newNodeDbWithNodes(tc.Nodes) + require.NoError(t, err) + for i, jobs := range tc.Jobs { + jctxs := schedulercontext.JobSchedulingContextsFromJobs(testfixtures.TestPriorityClasses, jobs) + ok, err := nodeDb.ScheduleMany(jctxs) + require.NoError(t, err) + assert.Equal(t, tc.ExpectSuccess[i], ok) + for _, jctx := range jctxs { + pctx := jctx.PodSchedulingContext + require.NotNil(t, pctx) + if tc.ExpectSuccess[i] { + assert.NotEqual(t, "", pctx.NodeId) } - assert.True(t, ok) - } else { - assert.False(t, ok) } } }) @@ -447,22 +536,28 @@ func TestScheduleMany(t *testing.T) { } func benchmarkUpsert(nodes []*schedulerobjects.Node, b *testing.B) { - db, err := NewNodeDb( + nodeDb, err := NewNodeDb( testfixtures.TestPriorityClasses, testfixtures.TestMaxExtraNodesToConsider, testfixtures.TestResources, testfixtures.TestIndexedTaints, testfixtures.TestIndexedNodeLabels, ) - if !assert.NoError(b, err) { - return + require.NoError(b, err) + txn := nodeDb.Txn(true) + entries := make([]*Node, len(nodes)) + for i, node := range nodes { + err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, nil, node) + require.NoError(b, err) + entry, err := nodeDb.GetNode(node.Id) + require.NoError(b, err) + entries[i] = entry } + txn.Commit() b.ResetTimer() for n := 0; n < b.N; n++ { - err := db.UpsertMany(nodes) - if !assert.NoError(b, err) { - return - } + err := nodeDb.UpsertMany(entries) + require.NoError(b, err) } } @@ 
-478,7 +573,7 @@ func BenchmarkUpsert100000(b *testing.B) { benchmarkUpsert(testfixtures.N32CpuNodes(100000, testfixtures.TestPriorities), b) } -func benchmarkSelectAndBindNodeToPod(nodes []*schedulerobjects.Node, reqs []*schedulerobjects.PodRequirements, b *testing.B) { +func benchmarkScheduleMany(b *testing.B, nodes []*schedulerobjects.Node, jobs []*jobdb.Job) { nodeDb, err := NewNodeDb( testfixtures.TestPriorityClasses, testfixtures.TestMaxExtraNodesToConsider, @@ -487,120 +582,122 @@ func benchmarkSelectAndBindNodeToPod(nodes []*schedulerobjects.Node, reqs []*sch testfixtures.TestIndexedNodeLabels, ) require.NoError(b, err) - - err = nodeDb.UpsertMany(nodes) - require.NoError(b, err) + txn := nodeDb.Txn(true) + for _, node := range nodes { + err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, nil, node) + require.NoError(b, err) + } + txn.Commit() b.ResetTimer() for n := 0; n < b.N; n++ { + jctxs := schedulercontext.JobSchedulingContextsFromJobs(testfixtures.TestPriorityClasses, jobs) txn := nodeDb.Txn(true) - for _, req := range reqs { - _, err := nodeDb.SelectAndBindNodeToPodWithTxn(txn, req) - require.NoError(b, err) - } + _, err := nodeDb.ScheduleManyWithTxn(txn, jctxs) txn.Abort() + require.NoError(b, err) } } -func BenchmarkSelectAndBindNodeToPod10CpuNodes320SmallJobs(b *testing.B) { - benchmarkSelectAndBindNodeToPod( - testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - testfixtures.N1CpuPodReqs("A", 0, 320), +func BenchmarkScheduleMany10CpuNodes320SmallJobs(b *testing.B) { + benchmarkScheduleMany( b, + testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 320), ) } -func BenchmarkSelectAndBindNodeToPod10CpuNodes640SmallJobs(b *testing.B) { - benchmarkSelectAndBindNodeToPod( - testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - testfixtures.N1CpuPodReqs("A", 0, 640), +func BenchmarkScheduleMany10CpuNodes640SmallJobs(b *testing.B) { + benchmarkScheduleMany( b, + testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 640), ) } -func BenchmarkSelectAndBindNodeToPod100CpuNodes3200SmallJobs(b *testing.B) { - benchmarkSelectAndBindNodeToPod( - testfixtures.N32CpuNodes(100, testfixtures.TestPriorities), - testfixtures.N1CpuPodReqs("A", 0, 3200), +func BenchmarkScheduleMany100CpuNodes3200SmallJobs(b *testing.B) { + benchmarkScheduleMany( b, + testfixtures.N32CpuNodes(100, testfixtures.TestPriorities), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 3200), ) } -func BenchmarkSelectAndBindNodeToPod100CpuNodes6400SmallJobs(b *testing.B) { - benchmarkSelectAndBindNodeToPod( - testfixtures.N32CpuNodes(100, testfixtures.TestPriorities), - testfixtures.N1CpuPodReqs("A", 0, 6400), +func BenchmarkScheduleMany100CpuNodes6400SmallJobs(b *testing.B) { + benchmarkScheduleMany( b, + testfixtures.N32CpuNodes(100, testfixtures.TestPriorities), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 6400), ) } -func BenchmarkSelectAndBindNodeToPod1000CpuNodes32000SmallJobs(b *testing.B) { - benchmarkSelectAndBindNodeToPod( - testfixtures.N32CpuNodes(1000, testfixtures.TestPriorities), - testfixtures.N1CpuPodReqs("A", 0, 32000), +func BenchmarkScheduleMany1000CpuNodes32000SmallJobs(b *testing.B) { + benchmarkScheduleMany( b, + testfixtures.N32CpuNodes(1000, testfixtures.TestPriorities), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32000), ) } -func BenchmarkSelectAndBindNodeToPod1000CpuNodes64000SmallJobs(b *testing.B) { 
- benchmarkSelectAndBindNodeToPod( - testfixtures.N32CpuNodes(1000, testfixtures.TestPriorities), - testfixtures.N1CpuPodReqs("A", 0, 64000), +func BenchmarkScheduleMany1000CpuNodes64000SmallJobs(b *testing.B) { + benchmarkScheduleMany( b, + testfixtures.N32CpuNodes(1000, testfixtures.TestPriorities), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 64000), ) } -func BenchmarkSelectAndBindNodeToPod100CpuNodes1CpuUnused(b *testing.B) { - benchmarkSelectAndBindNodeToPod( +func BenchmarkScheduleMany100CpuNodes1CpuUnused(b *testing.B) { + benchmarkScheduleMany( + b, testfixtures.WithUsedResourcesNodes( 0, schedulerobjects.ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("31")}}, testfixtures.N32CpuNodes(100, testfixtures.TestPriorities), ), - testfixtures.N1CpuPodReqs("A", 0, 100), - b, + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 100), ) } -func BenchmarkSelectAndBindNodeToPod1000CpuNodes1CpuUnused(b *testing.B) { - benchmarkSelectAndBindNodeToPod( +func BenchmarkScheduleMany1000CpuNodes1CpuUnused(b *testing.B) { + benchmarkScheduleMany( + b, testfixtures.WithUsedResourcesNodes( 0, schedulerobjects.ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("31")}}, testfixtures.N32CpuNodes(1000, testfixtures.TestPriorities), ), - testfixtures.N1CpuPodReqs("A", 0, 1000), - b, + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1000), ) } -func BenchmarkSelectAndBindNodeToPod10000CpuNodes1CpuUnused(b *testing.B) { - benchmarkSelectAndBindNodeToPod( +func BenchmarkScheduleMany10000CpuNodes1CpuUnused(b *testing.B) { + benchmarkScheduleMany( + b, testfixtures.WithUsedResourcesNodes( 0, schedulerobjects.ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("31")}}, testfixtures.N32CpuNodes(10000, testfixtures.TestPriorities), ), - testfixtures.N1CpuPodReqs("A", 0, 10000), - b, + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 10000), ) } -func BenchmarkSelectAndBindNodeToPodResourceConstrained(b *testing.B) { +func BenchmarkScheduleManyResourceConstrained(b *testing.B) { nodes := append(append( testfixtures.N32CpuNodes(500, testfixtures.TestPriorities), testfixtures.N8GpuNodes(1, testfixtures.TestPriorities)...), testfixtures.N32CpuNodes(499, testfixtures.TestPriorities)..., ) - benchmarkSelectAndBindNodeToPod( - nodes, - testfixtures.N1GpuPodReqs("A", 0, 1), + benchmarkScheduleMany( b, + nodes, + testfixtures.N1GpuJobs("A", testfixtures.PriorityClass0, 1), ) } -func createNodeDb(nodes []*schedulerobjects.Node) (*NodeDb, error) { - db, err := NewNodeDb( +func newNodeDbWithNodes(nodes []*schedulerobjects.Node) (*NodeDb, error) { + nodeDb, err := NewNodeDb( testfixtures.TestPriorityClasses, testfixtures.TestMaxExtraNodesToConsider, testfixtures.TestResources, @@ -610,10 +707,14 @@ func createNodeDb(nodes []*schedulerobjects.Node) (*NodeDb, error) { if err != nil { return nil, err } - if err := db.UpsertMany(nodes); err != nil { - return nil, err + txn := nodeDb.Txn(true) + for _, node := range nodes { + if err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, nil, node); err != nil { + return nil, err + } } - return db, nil + txn.Commit() + return nodeDb, nil } func BenchmarkNodeDbStringFromPodRequirementsNotMetReason(b *testing.B) { @@ -637,17 +738,3 @@ func randomString(n int) string { } return s } - -func GetTestNodeDb() *NodeDb { - nodeDb, err := NewNodeDb( - testfixtures.TestPriorityClasses, - 0, - testfixtures.TestResources, - testfixtures.TestIndexedTaints, - 
testfixtures.TestIndexedNodeLabels, - ) - if err != nil { - panic(err) - } - return nodeDb -} diff --git a/internal/scheduler/nodedb/nodeiteration.go b/internal/scheduler/nodedb/nodeiteration.go index 3ae075edea4..fb2715c6676 100644 --- a/internal/scheduler/nodedb/nodeiteration.go +++ b/internal/scheduler/nodedb/nodeiteration.go @@ -9,12 +9,10 @@ import ( "github.com/pkg/errors" "golang.org/x/exp/slices" "k8s.io/apimachinery/pkg/api/resource" - - "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" ) type NodeIterator interface { - NextNode() *schedulerobjects.Node + NextNode() *Node } // NodesIterator is an iterator over all nodes in the db. @@ -36,16 +34,12 @@ func (it *NodesIterator) WatchCh() <-chan struct{} { panic("not implemented") } -func (it *NodesIterator) NextNode() *schedulerobjects.Node { +func (it *NodesIterator) NextNode() *Node { obj := it.it.Next() if obj == nil { return nil } - node, ok := obj.(*schedulerobjects.Node) - if !ok { - panic(fmt.Sprintf("expected *Node, but got %T", obj)) - } - return node + return obj.(*Node) } func (it *NodesIterator) Next() interface{} { @@ -55,13 +49,13 @@ func (it *NodesIterator) Next() interface{} { type NodePairIterator struct { itA *NodesIterator itB *NodesIterator - nodeA *schedulerobjects.Node - nodeB *schedulerobjects.Node + nodeA *Node + nodeB *Node } type NodePairIteratorItem struct { - NodeA *schedulerobjects.Node - NodeB *schedulerobjects.Node + NodeA *Node + NodeB *Node } func NewNodePairIterator(txnA, txnB *memdb.Txn) (*NodePairIterator, error) { @@ -144,10 +138,10 @@ func (index *NodeIndex) FromArgs(args ...interface{}) ([]byte, error) { return args[0].([]byte), nil } -// FromObject extracts the index key from a *schedulerobjects.Node. +// FromObject extracts the index key from a *Node. func (index *NodeIndex) FromObject(raw interface{}) (bool, []byte, error) { - node := raw.(*schedulerobjects.Node) - return true, node.NodeDbKeys[index.KeyIndex], nil + node := raw.(*Node) + return true, node.Keys[index.KeyIndex], nil } // NodeTypesIterator is an iterator over all nodes of the given nodeTypes @@ -212,7 +206,7 @@ func (it *NodeTypesIterator) Next() interface{} { return v } -func (it *NodeTypesIterator) NextNode() (*schedulerobjects.Node, error) { +func (it *NodeTypesIterator) NextNode() (*Node, error) { if it.pq.Len() == 0 { return nil, nil } @@ -236,7 +230,7 @@ type nodeTypesIteratorPQ struct { } type nodeTypesIteratorPQItem struct { - node *schedulerobjects.Node + node *Node it *NodeTypeIterator // The index of the item in the heap. Maintained by the heap.Interface methods. 
index int @@ -248,9 +242,9 @@ func (pq *nodeTypesIteratorPQ) Less(i, j int) bool { return pq.less(pq.items[i].node, pq.items[j].node) } -func (it *nodeTypesIteratorPQ) less(a, b *schedulerobjects.Node) bool { - allocatableByPriorityA := a.AllocatableByPriorityAndResource[it.priority] - allocatableByPriorityB := b.AllocatableByPriorityAndResource[it.priority] +func (it *nodeTypesIteratorPQ) less(a, b *Node) bool { + allocatableByPriorityA := a.AllocatableByPriority[it.priority] + allocatableByPriorityB := b.AllocatableByPriority[it.priority] for _, t := range it.indexedResources { qa := allocatableByPriorityA.Get(t) qb := allocatableByPriorityB.Get(t) @@ -376,13 +370,13 @@ func (it *NodeTypeIterator) Next() interface{} { return v } -func (it *NodeTypeIterator) NextNode() (*schedulerobjects.Node, error) { +func (it *NodeTypeIterator) NextNode() (*Node, error) { for { v := it.memdbIterator.Next() if v == nil { return nil, nil } - node := v.(*schedulerobjects.Node) + node := v.(*Node) if node.Id == it.previousNodeId { panic(fmt.Sprintf("iterator received the same node twice consecutively: %s", node.Id)) } @@ -391,9 +385,9 @@ func (it *NodeTypeIterator) NextNode() (*schedulerobjects.Node, error) { // There are no more nodes of this nodeType. return nil, nil } - allocatableByPriority := node.AllocatableByPriorityAndResource[it.priority] + allocatableByPriority := node.AllocatableByPriority[it.priority] if len(allocatableByPriority.Resources) == 0 { - return nil, errors.Errorf("node %s has no resources registered at priority %d: %v", node.Id, it.priority, node.AllocatableByPriorityAndResource) + return nil, errors.Errorf("node %s has no resources registered at priority %d: %v", node.Id, it.priority, node.AllocatableByPriority) } for i, t := range it.indexedResources { nodeQuantity := allocatableByPriority.Get(t) diff --git a/internal/scheduler/nodedb/nodeiteration_test.go b/internal/scheduler/nodedb/nodeiteration_test.go index 7b5edaa270d..215f1719c40 100644 --- a/internal/scheduler/nodedb/nodeiteration_test.go +++ b/internal/scheduler/nodedb/nodeiteration_test.go @@ -4,8 +4,6 @@ import ( "fmt" "testing" - "github.com/hashicorp/go-memdb" - "github.com/pkg/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/exp/maps" @@ -37,7 +35,7 @@ func TestNodesIterator(t *testing.T) { for i, node := range tc.Nodes { indexById[node.Id] = i } - nodeDb, err := createNodeDb(tc.Nodes) + nodeDb, err := newNodeDbWithNodes(tc.Nodes) if !assert.NoError(t, err) { return } @@ -65,28 +63,26 @@ func TestNodesIterator(t *testing.T) { func TestNodePairIterator(t *testing.T) { nodes := testfixtures.TestCluster() - for i, c := range []string{"A", "B", "C"} { - nodes[i].Id = c + for i, nodeId := range []string{"A", "B", "C"} { + nodes[i].Id = nodeId } - - nodeDb, err := createNodeDb(nil) + nodeDb, err := newNodeDbWithNodes(nodes) require.NoError(t, err) - for _, node := range nodes { - node.NodeDbKeys = make([][]byte, len(nodeDb.prioritiesToTryAssigningAt)) - for i, p := range nodeDb.prioritiesToTryAssigningAt { - node.NodeDbKeys[i] = nodeDb.nodeDbKeyFromNode(node.NodeDbKeys[i], node, p) - } + entries := make([]*Node, len(nodes)) + for i, node := range nodes { + entry, err := nodeDb.GetNode(node.Id) + require.NoError(t, err) + entries[i] = entry } txn := nodeDb.Txn(true) - require.NoError(t, txn.Insert("nodes", nodes[0])) - require.NoError(t, txn.Insert("nodes", nodes[1])) + require.NoError(t, txn.Delete("nodes", entries[2])) txn.Commit() txnA := nodeDb.Txn(false) txn = 
nodeDb.Txn(true) - require.NoError(t, txn.Delete("nodes", nodes[0])) - require.NoError(t, txn.Insert("nodes", nodes[2])) + require.NoError(t, txn.Delete("nodes", entries[0])) + require.NoError(t, txn.Insert("nodes", entries[2])) txn.Commit() txnB := nodeDb.Txn(false) @@ -99,16 +95,16 @@ func TestNodePairIterator(t *testing.T) { } expected := []*NodePairIteratorItem{ { - NodeA: nodes[0], + NodeA: entries[0], NodeB: nil, }, { - NodeA: nodes[1], - NodeB: nodes[1], + NodeA: entries[1], + NodeB: entries[1], }, { NodeA: nil, - NodeB: nodes[2], + NodeB: entries[2], }, } assert.Equal(t, expected, actual) @@ -413,29 +409,24 @@ func TestNodeTypeIterator(t *testing.T) { } for name, tc := range tests { t.Run(name, func(t *testing.T) { - nodeDb, err := createNodeDb(nil) + nodeDb, err := newNodeDbWithNodes(nil) require.NoError(t, err) - // Set monotonically increaseing node ids to ensure nodes appear in predictable order. + entries := make([]*Node, len(tc.nodes)) for i, node := range tc.nodes { + // Set monotonically increasing node IDs to ensure nodes appear in predictable order. node.Id = fmt.Sprintf("%d", i) - } - indexByNodeId := make(map[string]int) - for i, node := range tc.nodes { - indexByNodeId[node.Id] = i - } - // Compute the keys necessary to efficiently iterate over nodes - // and populate the database. We do this manually instead of using nodeDb.Upsert to control the nodeTypeId. - for _, node := range tc.nodes { - node.NodeDbKeys = make([][]byte, len(nodeDb.prioritiesToTryAssigningAt)) - for i, p := range nodeDb.prioritiesToTryAssigningAt { - node.NodeDbKeys[i] = nodeDb.nodeDbKeyFromNode(node.NodeDbKeys[i], node, p) - } + entry, err := nodeDb.create(node) + require.NoError(t, err) + + // We can safely override NodeTypeId, because Keys is recomputed upon insertion. + entry.NodeTypeId = node.NodeTypeId + + entries[i] = entry } - require.NoError(t, populateDatabase(nodeDb.db, tc.nodes)) + require.NoError(t, nodeDb.UpsertMany(entries)) - // Create iterator. indexedResourceRequests := make([]resource.Quantity, len(testfixtures.TestResources)) for i, t := range nodeDb.indexedResources { indexedResourceRequests[i] = tc.resourceRequests.Get(t) @@ -447,22 +438,31 @@ func TestNodeTypeIterator(t *testing.T) { } } require.NotEqual(t, -1, keyIndex) - it, err := NewNodeTypeIterator(nodeDb.Txn(false), tc.nodeTypeId, nodeIndexName(keyIndex), tc.priority, testfixtures.TestResourceNames, indexedResourceRequests, testfixtures.TestIndexedResourceResolutionMillis) + it, err := NewNodeTypeIterator( + nodeDb.Txn(false), + tc.nodeTypeId, + nodeIndexName(keyIndex), + tc.priority, + testfixtures.TestResourceNames, + indexedResourceRequests, + testfixtures.TestIndexedResourceResolutionMillis, + ) require.NoError(t, err) - // Compare actual with expected order. - actual := make([]int, 0) + expected := make([]string, len(tc.expected)) + for i, nodeId := range tc.expected { + expected[i] = fmt.Sprintf("%d", nodeId) + } + actual := make([]string, 0) for { node, err := it.NextNode() require.NoError(t, err) if node == nil { break } - i, ok := indexByNodeId[node.Id] - require.True(t, ok) - actual = append(actual, i) + actual = append(actual, node.Id) } - assert.Equal(t, tc.expected, actual) + assert.Equal(t, expected, actual) // Calling next should always return nil from now on. 
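// (Illustrative sketch, not part of this patch: the drain pattern used here.
// NextNode on the new entry-based iterator returns (*Node, error) and yields a
// nil node once the node type is exhausted, after which further calls keep
// returning nil. drainNodeIds is a hypothetical helper name.)
func drainNodeIds(it *NodeTypeIterator) ([]string, error) {
	ids := make([]string, 0)
	for {
		node, err := it.NextNode()
		if err != nil {
			return nil, err
		}
		if node == nil {
			// Exhausted: callers can rely on nil from now on, as checked below.
			return ids, nil
		}
		ids = append(ids, node.Id)
	}
}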
for i := 0; i < 100; i++ { @@ -799,27 +799,23 @@ func TestNodeTypesIterator(t *testing.T) { } for name, tc := range tests { t.Run(name, func(t *testing.T) { - nodeDb, err := createNodeDb(nil) + nodeDb, err := newNodeDbWithNodes(nil) require.NoError(t, err) - // Set monotonically increaseing node ids to ensure nodes appear in predictable order. + entries := make([]*Node, len(tc.nodes)) for i, node := range tc.nodes { + // Set monotonically increasing node IDs to ensure nodes appear in predictable order. node.Id = fmt.Sprintf("%d", i) - } - indexByNodeId := make(map[string]int) - for i, node := range tc.nodes { - indexByNodeId[node.Id] = i - } - // Compute the keys necessary to efficiently iterate over nodes - // and populate the database. We do this manually instead of using nodeDb.Upsert to control the nodeTypeId. - for _, node := range tc.nodes { - node.NodeDbKeys = make([][]byte, len(nodeDb.prioritiesToTryAssigningAt)) - for i, p := range nodeDb.prioritiesToTryAssigningAt { - node.NodeDbKeys[i] = nodeDb.nodeDbKeyFromNode(node.NodeDbKeys[i], node, p) - } + entry, err := nodeDb.create(node) + require.NoError(t, err) + + // We can safely override NodeTypeId, because Keys is recomputed upon insertion. + entry.NodeTypeId = node.NodeTypeId + + entries[i] = entry } - require.NoError(t, populateDatabase(nodeDb.db, tc.nodes)) + require.NoError(t, nodeDb.UpsertMany(entries)) indexedResourceRequests := make([]resource.Quantity, len(testfixtures.TestResources)) for i, t := range testfixtures.TestResourceNames { @@ -836,19 +832,20 @@ func TestNodeTypesIterator(t *testing.T) { ) require.NoError(t, err) - // Compare actual with expected order. - actual := make([]int, 0) + expected := make([]string, len(tc.expected)) + for i, nodeId := range tc.expected { + expected[i] = fmt.Sprintf("%d", nodeId) + } + actual := make([]string, 0) for { node, err := it.NextNode() require.NoError(t, err) if node == nil { break } - i, ok := indexByNodeId[node.Id] - require.True(t, ok) - actual = append(actual, i) + actual = append(actual, node.Id) } - assert.Equal(t, tc.expected, actual) + assert.Equal(t, expected, actual) // Calling next again should still return nil. node, err := it.NextNode() @@ -858,19 +855,6 @@ func TestNodeTypesIterator(t *testing.T) { } } -func populateDatabase(db *memdb.MemDB, items []*schedulerobjects.Node) error { - txn := db.Txn(true) - defer txn.Abort() - for _, item := range items { - err := txn.Insert("nodes", item) - if err != nil { - return errors.WithStack(err) - } - } - txn.Commit() - return nil -} - func BenchmarkNodeTypeIterator(b *testing.B) { // Create nodes with varying amounts of CPU available. 
numNodes := 1000 @@ -889,7 +873,7 @@ func BenchmarkNodeTypeIterator(b *testing.B) { []*schedulerobjects.Node{node}, ) } - nodeDb, err := createNodeDb(nodes) + nodeDb, err := newNodeDbWithNodes(nodes) require.NoError(b, err) // Create iterator for 0 CPU required and an unfeasible memory request, diff --git a/internal/scheduler/pool_assigner.go b/internal/scheduler/pool_assigner.go index 14528c619e4..29d16e4d957 100644 --- a/internal/scheduler/pool_assigner.go +++ b/internal/scheduler/pool_assigner.go @@ -10,6 +10,7 @@ import ( "k8s.io/apimachinery/pkg/util/clock" "github.com/armadaproject/armada/internal/armada/configuration" + schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" "github.com/armadaproject/armada/internal/scheduler/database" "github.com/armadaproject/armada/internal/scheduler/jobdb" "github.com/armadaproject/armada/internal/scheduler/nodedb" @@ -118,27 +119,32 @@ func (p *DefaultPoolAssigner) AssignPool(j *jobdb.Job) (string, error) { return cachedPool.(string), nil } - req := PodRequirementFromJobSchedulingInfo(j.JobSchedulingInfo()) + req := j.PodRequirements() req = p.clearAnnotations(req) // Otherwise iterate through each pool and detect the first one the job is potentially schedulable on for pool, executors := range p.executorsByPool { for _, e := range executors { - minReqsMet, _ := requestIsLargeEnough(schedulerobjects.ResourceListFromV1ResourceList( - req.GetResourceRequirements().Requests, - ), e.minimumJobSize) - if minReqsMet { - nodeDb := e.nodeDb - txn := nodeDb.Txn(true) - report, err := nodeDb.SelectNodeForPodWithTxn(txn, req) - txn.Abort() - if err != nil { - return "", errors.WithMessagef(err, "error selecting node for job %s", j.Id()) - } - if report.Node != nil { - p.poolCache.Add(schedulingKey, pool) - return pool, nil - } + requests := req.GetResourceRequirements().Requests + if ok, _ := requestsAreLargeEnough(schedulerobjects.ResourceListFromV1ResourceList(requests), e.minimumJobSize); !ok { + continue + } + nodeDb := e.nodeDb + txn := nodeDb.Txn(true) + jctx := &schedulercontext.JobSchedulingContext{ + Created: time.Now(), + JobId: j.GetId(), + Job: j, + PodRequirements: j.GetPodRequirements(p.priorityClasses), + } + node, err := nodeDb.SelectNodeForJobWithTxn(txn, jctx) + txn.Abort() + if err != nil { + return "", errors.WithMessagef(err, "error selecting node for job %s", j.Id()) + } + if node != nil { + p.poolCache.Add(schedulingKey, pool) + return pool, nil } } } @@ -157,10 +163,14 @@ func (p *DefaultPoolAssigner) constructNodeDb(nodes []*schedulerobjects.Node) (* if err != nil { return nil, err } - err = nodeDb.UpsertMany(nodes) - if err != nil { - return nil, err + txn := nodeDb.Txn(true) + defer txn.Abort() + for _, node := range nodes { + if err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, nil, node); err != nil { + return nil, err + } } + txn.Commit() err = nodeDb.ClearAllocated() if err != nil { return nil, err diff --git a/internal/scheduler/preempting_queue_scheduler.go b/internal/scheduler/preempting_queue_scheduler.go index 78931edd37c..7b58d355d6a 100644 --- a/internal/scheduler/preempting_queue_scheduler.go +++ b/internal/scheduler/preempting_queue_scheduler.go @@ -16,13 +16,11 @@ import ( "github.com/armadaproject/armada/internal/armada/configuration" armadamaps "github.com/armadaproject/armada/internal/common/maps" armadaslices "github.com/armadaproject/armada/internal/common/slices" - "github.com/armadaproject/armada/internal/common/util" schedulerconfig 
"github.com/armadaproject/armada/internal/scheduler/configuration" schedulerconstraints "github.com/armadaproject/armada/internal/scheduler/constraints" schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" "github.com/armadaproject/armada/internal/scheduler/interfaces" "github.com/armadaproject/armada/internal/scheduler/nodedb" - "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" ) // PreemptingQueueScheduler is a scheduler that makes a unified decisions on which jobs to preempt and schedule. @@ -32,6 +30,7 @@ type PreemptingQueueScheduler struct { constraints schedulerconstraints.SchedulingConstraints nodeEvictionProbability float64 nodeOversubscriptionEvictionProbability float64 + protectedFractionOfFairShare float64 jobRepo JobRepository nodeDb *nodedb.NodeDb // Maps job ids to the id of the node the job is associated with. @@ -53,6 +52,7 @@ func NewPreemptingQueueScheduler( constraints schedulerconstraints.SchedulingConstraints, nodeEvictionProbability float64, nodeOversubscriptionEvictionProbability float64, + protectedFractionOfFairShare float64, jobRepo JobRepository, nodeDb *nodedb.NodeDb, initialNodeIdByJobId map[string]string, @@ -77,6 +77,7 @@ func NewPreemptingQueueScheduler( constraints: constraints, nodeEvictionProbability: nodeEvictionProbability, nodeOversubscriptionEvictionProbability: nodeOversubscriptionEvictionProbability, + protectedFractionOfFairShare: protectedFractionOfFairShare, jobRepo: jobRepo, nodeDb: nodeDb, nodeIdByJobId: maps.Clone(initialNodeIdByJobId), @@ -99,7 +100,7 @@ func (sch *PreemptingQueueScheduler) SkipUnsuccessfulSchedulingKeyCheck() { func (sch *PreemptingQueueScheduler) Schedule(ctx context.Context) (*SchedulerResult, error) { log := ctxlogrus.Extract(ctx) log = log.WithField("service", "PreemptingQueueScheduler") - if ResourceListAsWeightedMillis(sch.schedulingContext.ResourceScarcity, sch.schedulingContext.TotalResources) == 0 { + if sch.schedulingContext.TotalResources.AsWeightedMillis(sch.schedulingContext.ResourceScarcity) == 0 { // This refers to resources available across all clusters, i.e., // it may include resources not currently considered for scheduling. log.Infof( @@ -108,7 +109,7 @@ func (sch *PreemptingQueueScheduler) Schedule(ctx context.Context) (*SchedulerRe ) return &SchedulerResult{}, nil } - if ResourceListAsWeightedMillis(sch.schedulingContext.ResourceScarcity, sch.nodeDb.TotalResources()) == 0 { + if rl := sch.nodeDb.TotalResources(); rl.AsWeightedMillis(sch.schedulingContext.ResourceScarcity) == 0 { // This refers to the resources currently considered for scheduling. log.Infof( "no resources with non-zero weight available for scheduling in NodeDb: resource scarcity %v, total resources %v", @@ -132,16 +133,40 @@ func (sch *PreemptingQueueScheduler) Schedule(ctx context.Context) (*SchedulerRe snapshot := sch.nodeDb.Txn(false) // Evict preemptible jobs. 
+ totalCost := sch.schedulingContext.TotalCost() evictorResult, inMemoryJobRepo, err := sch.evict( ctxlogrus.ToContext( ctx, log.WithField("stage", "evict for resource balancing"), ), - NewStochasticEvictor( + NewNodeEvictor( sch.jobRepo, sch.schedulingContext.PriorityClasses, - sch.schedulingContext.DefaultPriorityClass, sch.nodeEvictionProbability, + func(ctx context.Context, job interfaces.LegacySchedulerJob) bool { + if job.GetAnnotations() == nil { + log := ctxlogrus.Extract(ctx) + log.Errorf("can't evict job %s: annotations not initialised", job.GetId()) + return false + } + if job.GetNodeSelector() == nil { + log := ctxlogrus.Extract(ctx) + log.Errorf("can't evict job %s: nodeSelector not initialised", job.GetId()) + return false + } + if qctx, ok := sch.schedulingContext.QueueSchedulingContexts[job.GetQueue()]; ok { + fairShare := qctx.Weight / sch.schedulingContext.WeightSum + actualShare := qctx.TotalCostForQueue() / totalCost + fractionOfFairShare := actualShare / fairShare + if fractionOfFairShare <= sch.protectedFractionOfFairShare { + return false + } + } + if priorityClass, ok := sch.schedulingContext.PriorityClasses[job.GetPriorityClassName()]; ok { + return priorityClass.Preemptible + } + return false + }, nil, ), ) @@ -250,10 +275,6 @@ func (sch *PreemptingQueueScheduler) Schedule(ctx context.Context) (*SchedulerRe } if sch.enableAssertions { err := sch.assertions( - ctxlogrus.ToContext( - ctx, - log.WithField("stage", "validate consistency"), - ), snapshot, preemptedJobsById, scheduledJobsById, @@ -424,35 +445,32 @@ func (sch *PreemptingQueueScheduler) setEvictedGangCardinality(evictedJobsById m return nil } -func (sch *PreemptingQueueScheduler) evictionAssertions(evictedJobsById map[string]interfaces.LegacySchedulerJob, affectedNodesById map[string]*schedulerobjects.Node) error { +func (sch *PreemptingQueueScheduler) evictionAssertions(evictedJobsById map[string]interfaces.LegacySchedulerJob, affectedNodesById map[string]*nodedb.Node) error { for _, qctx := range sch.schedulingContext.QueueSchedulingContexts { - if !qctx.AllocatedByPriority.IsStrictlyNonNegative() { - return errors.Errorf("negative allocation for queue %s after eviction: %s", qctx.Queue, qctx.AllocatedByPriority) + if !qctx.AllocatedByPriorityClass.IsStrictlyNonNegative() { + return errors.Errorf("negative allocation for queue %s after eviction: %s", qctx.Queue, qctx.AllocatedByPriorityClass) } } evictedJobIdsByGangId := make(map[string]map[string]bool) for _, job := range evictedJobsById { - if gangId, ok := sch.gangIdByJobId[job.GetId()]; ok { + jobId := job.GetId() + if gangId, ok := sch.gangIdByJobId[jobId]; ok { if m := evictedJobIdsByGangId[gangId]; m != nil { - m[job.GetId()] = true + m[jobId] = true } else { - evictedJobIdsByGangId[gangId] = map[string]bool{job.GetId(): true} + evictedJobIdsByGangId[gangId] = map[string]bool{jobId: true} } } if !isEvictedJob(job) { - return errors.Errorf("evicted job %s is not marked as such: job annotations %v", job.GetId(), job.GetAnnotations()) + return errors.Errorf("evicted job %s is not marked as such: job annotations %v", jobId, job.GetAnnotations()) } - if nodeId, ok := targetNodeIdFromLegacySchedulerJob(job); ok { + nodeSelector := job.GetNodeSelector() + if nodeId, ok := targetNodeIdFromNodeSelector(nodeSelector); ok { if _, ok := affectedNodesById[nodeId]; !ok { - return errors.Errorf("node id %s targeted by job %s is not marked as affected", nodeId, job.GetId()) + return errors.Errorf("node id %s targeted by job %s is not marked as affected", nodeId, 
jobId) } } else { - req := PodRequirementFromLegacySchedulerJob(job, nil) - if req != nil { - return errors.Errorf("evicted job %s is missing target node id selector: job nodeSelector %v", job.GetId(), req.NodeSelector) - } else { - return errors.Errorf("evicted job %s is missing target node id selector: req is nil", job.GetId()) - } + return errors.Errorf("evicted job %s is missing target node id selector: job nodeSelector %v", jobId, nodeSelector) } } for gangId, evictedGangJobIds := range evictedJobIdsByGangId { @@ -524,15 +542,7 @@ func (sch *PreemptingQueueScheduler) unbindJobs(jobs []interfaces.LegacySchedule if err != nil { return err } - node, err = nodedb.UnbindPodsFromNode( - util.Map( - jobsOnNode, - func(job interfaces.LegacySchedulerJob) *schedulerobjects.PodRequirements { - return PodRequirementFromLegacySchedulerJob(job, sch.schedulingContext.PriorityClasses) - }, - ), - node, - ) + node, err = nodedb.UnbindJobsFromNode(sch.schedulingContext.PriorityClasses, jobsOnNode, node) if err != nil { return err } @@ -552,7 +562,7 @@ func (sch *PreemptingQueueScheduler) updateGangAccounting(preemptedJobs, schedul } } for _, job := range scheduledJobs { - gangId, _, isGangJob, err := GangIdAndCardinalityFromLegacySchedulerJob(job, sch.schedulingContext.PriorityClasses) + gangId, _, isGangJob, err := GangIdAndCardinalityFromLegacySchedulerJob(job) if err != nil { return err } @@ -575,7 +585,6 @@ func (sch *PreemptingQueueScheduler) updateGangAccounting(preemptedJobs, schedul // Compare the nodedb.NodeJobDiff with expected preempted/scheduled jobs to ensure NodeDb is consistent. // This is only to validate that nothing unexpected happened during scheduling. func (sch *PreemptingQueueScheduler) assertions( - ctx context.Context, snapshot *memdb.Txn, preemptedJobsById, scheduledJobsById map[string]interfaces.LegacySchedulerJob, @@ -644,27 +653,25 @@ func (sch *PreemptingQueueScheduler) assertions( type Evictor struct { jobRepo JobRepository priorityClasses map[string]configuration.PriorityClass - nodeFilter func(context.Context, *schedulerobjects.Node) bool + nodeFilter func(context.Context, *nodedb.Node) bool jobFilter func(context.Context, interfaces.LegacySchedulerJob) bool - postEvictFunc func(context.Context, interfaces.LegacySchedulerJob, *schedulerobjects.Node) + postEvictFunc func(context.Context, interfaces.LegacySchedulerJob, *nodedb.Node) } type EvictorResult struct { // Map from job id to job, containing all evicted jobs. EvictedJobsById map[string]interfaces.LegacySchedulerJob // Map from node id to node, containing all nodes on which at least one job was evicted. - AffectedNodesById map[string]*schedulerobjects.Node + AffectedNodesById map[string]*nodedb.Node // For each evicted job, maps the id of the job to the id of the node it was evicted from. NodeIdByJobId map[string]string } -// NewStochasticEvictor returns a new evictor that for each node evicts -// all preemptible jobs from that node with probability perNodeEvictionProbability. 
-func NewStochasticEvictor( +func NewNodeEvictor( jobRepo JobRepository, priorityClasses map[string]configuration.PriorityClass, - defaultPriorityClass string, perNodeEvictionProbability float64, + jobFilter func(context.Context, interfaces.LegacySchedulerJob) bool, random *rand.Rand, ) *Evictor { if perNodeEvictionProbability <= 0 { @@ -673,44 +680,13 @@ func NewStochasticEvictor( if random == nil { random = rand.New(rand.NewSource(int64(time.Now().Nanosecond()))) } - return NewPreemptibleEvictor( - jobRepo, - priorityClasses, - defaultPriorityClass, - func(_ context.Context, node *schedulerobjects.Node) bool { - return len(node.AllocatedByJobId) > 0 && random.Float64() < perNodeEvictionProbability - }, - ) -} - -// NewPreemptibleEvictor returns a new evictor that evicts all preemptible jobs -// on nodes for which nodeFilter returns true. -func NewPreemptibleEvictor( - jobRepo JobRepository, - priorityClasses map[string]configuration.PriorityClass, - defaultPriorityClass string, - nodeFilter func(context.Context, *schedulerobjects.Node) bool, -) *Evictor { return &Evictor{ jobRepo: jobRepo, priorityClasses: priorityClasses, - nodeFilter: nodeFilter, - jobFilter: func(ctx context.Context, job interfaces.LegacySchedulerJob) bool { - if job.GetAnnotations() == nil { - log := ctxlogrus.Extract(ctx) - log.Warnf("can't evict job %s: annotations not initialised", job.GetId()) - return false - } - priorityClassName := job.GetRequirements(priorityClasses).PriorityClassName - priorityClass, ok := priorityClasses[priorityClassName] - if !ok { - priorityClass = priorityClasses[defaultPriorityClass] - } - if priorityClass.Preemptible { - return true - } - return false + nodeFilter: func(_ context.Context, node *nodedb.Node) bool { + return len(node.AllocatedByJobId) > 0 && random.Float64() < perNodeEvictionProbability }, + jobFilter: jobFilter, postEvictFunc: defaultPostEvictFunc, } } @@ -729,7 +705,7 @@ func NewFilteredEvictor( return &Evictor{ jobRepo: jobRepo, priorityClasses: priorityClasses, - nodeFilter: func(_ context.Context, node *schedulerobjects.Node) bool { + nodeFilter: func(_ context.Context, node *nodedb.Node) bool { shouldEvict := nodeIdsToEvict[node.Id] return shouldEvict }, @@ -764,9 +740,9 @@ func NewOversubscribedEvictor( return &Evictor{ jobRepo: jobRepo, priorityClasses: priorityClasses, - nodeFilter: func(_ context.Context, node *schedulerobjects.Node) bool { + nodeFilter: func(_ context.Context, node *nodedb.Node) bool { overSubscribedPriorities = make(map[int32]bool) - for p, rl := range node.AllocatableByPriorityAndResource { + for p, rl := range node.AllocatableByPriority { if p < 0 { // Negative priorities correspond to already evicted jobs. continue @@ -786,7 +762,7 @@ func NewOversubscribedEvictor( log.Warnf("can't evict job %s: annotations not initialised", job.GetId()) return false } - priorityClassName := job.GetRequirements(priorityClasses).PriorityClassName + priorityClassName := job.GetPriorityClassName() priorityClass, ok := priorityClasses[priorityClassName] if !ok { priorityClass = priorityClasses[defaultPriorityClass] @@ -805,55 +781,53 @@ func NewOversubscribedEvictor( // Any job for which jobFilter returns true is evicted (if the node was not skipped). // If a job was evicted from a node, postEvictFunc is called with the corresponding job and node. 
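// (Illustrative sketch, not part of this patch: the per-node step that Evict now
// delegates to nodedb.EvictJobsFromNode. It takes the priority classes, an
// optional job filter (nil means every candidate job is eligible), the candidate
// jobs and the node entry, and returns the evicted jobs together with the
// updated node. evictFromNode is a hypothetical wrapper name.)
func evictFromNode(
	priorityClasses map[string]configuration.PriorityClass,
	jobFilter func(interfaces.LegacySchedulerJob) bool,
	jobs []interfaces.LegacySchedulerJob,
	node *nodedb.Node,
) ([]interfaces.LegacySchedulerJob, *nodedb.Node, error) {
	return nodedb.EvictJobsFromNode(priorityClasses, jobFilter, jobs, node)
}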
func (evi *Evictor) Evict(ctx context.Context, it nodedb.NodeIterator) (*EvictorResult, error) { + var jobFilter func(job interfaces.LegacySchedulerJob) bool + if evi.jobFilter != nil { + jobFilter = func(job interfaces.LegacySchedulerJob) bool { return evi.jobFilter(ctx, job) } + } evictedJobsById := make(map[string]interfaces.LegacySchedulerJob) - affectedNodesById := make(map[string]*schedulerobjects.Node) + affectedNodesById := make(map[string]*nodedb.Node) nodeIdByJobId := make(map[string]string) for node := it.NextNode(); node != nil; node = it.NextNode() { if evi.nodeFilter != nil && !evi.nodeFilter(ctx, node) { continue } - jobIds := util.Filter( - maps.Keys(node.AllocatedByJobId), - func(jobId string) bool { - _, ok := node.EvictedJobRunIds[jobId] - return !ok - }, - ) + jobIds := make([]string, 0) + for jobId := range node.AllocatedByJobId { + if _, ok := node.EvictedJobRunIds[jobId]; !ok { + jobIds = append(jobIds, jobId) + } + } jobs, err := evi.jobRepo.GetExistingJobsByIds(jobIds) if err != nil { return nil, err } - for _, job := range jobs { - if evi.jobFilter != nil && !evi.jobFilter(ctx, job) { - continue - } - req := PodRequirementFromLegacySchedulerJob(job, evi.priorityClasses) - if req == nil { - continue - } - node, err = nodedb.EvictPodFromNode(req, node) - if err != nil { - return nil, err - } + evictedJobs, node, err := nodedb.EvictJobsFromNode(evi.priorityClasses, jobFilter, jobs, node) + if err != nil { + return nil, err + } + for _, job := range evictedJobs { + evictedJobsById[job.GetId()] = job + nodeIdByJobId[job.GetId()] = node.Id if evi.postEvictFunc != nil { evi.postEvictFunc(ctx, job, node) } - - evictedJobsById[job.GetId()] = job - nodeIdByJobId[job.GetId()] = node.Id } - affectedNodesById[node.Id] = node + if len(evictedJobs) > 0 { + affectedNodesById[node.Id] = node + } } - return &EvictorResult{ + result := &EvictorResult{ EvictedJobsById: evictedJobsById, AffectedNodesById: affectedNodesById, NodeIdByJobId: nodeIdByJobId, - }, nil + } + return result, nil } // TODO: This is only necessary for jobs not scheduled in this cycle. // Since jobs scheduled in this cycle can be re-scheduled onto another node without triggering a preemption. -func defaultPostEvictFunc(ctx context.Context, job interfaces.LegacySchedulerJob, node *schedulerobjects.Node) { +func defaultPostEvictFunc(ctx context.Context, job interfaces.LegacySchedulerJob, node *nodedb.Node) { // Add annotation indicating to the scheduler this this job was evicted. annotations := job.GetAnnotations() if annotations == nil { @@ -864,22 +838,11 @@ func defaultPostEvictFunc(ctx context.Context, job interfaces.LegacySchedulerJob } // Add node selector ensuring this job is only re-scheduled onto the node it was evicted from. - req := PodRequirementFromLegacySchedulerJob(job, nil) - if req.NodeSelector == nil { + nodeSelector := job.GetNodeSelector() + if nodeSelector == nil { log := ctxlogrus.Extract(ctx) log.Errorf("error evicting job %s: nodeSelector not initialised", job.GetId()) } else { - req.NodeSelector[schedulerconfig.NodeIdLabel] = node.Id - } - - // Add a toleration to allow the job to be re-scheduled even if node is unschedulable. - // - // TODO: Because req is created with a new tolerations slice above, this toleration doesn't persist. - // In practice, this isn't an issue now since we don't check static requirements for evicted jobs. 
- if node.Unschedulable { - req.Tolerations = append(req.Tolerations, nodedb.UnschedulableToleration()) + nodeSelector[schedulerconfig.NodeIdLabel] = node.Id } - - // We've changed the scheduling requirements and must clear any cached key. - req.ClearCachedSchedulingKey() } diff --git a/internal/scheduler/preempting_queue_scheduler_test.go b/internal/scheduler/preempting_queue_scheduler_test.go index ff594403e58..59a82e899de 100644 --- a/internal/scheduler/preempting_queue_scheduler_test.go +++ b/internal/scheduler/preempting_queue_scheduler_test.go @@ -27,19 +27,18 @@ import ( ) func TestEvictOversubscribed(t *testing.T) { - nodes := testfixtures.N32CpuNodes(1, testfixtures.TestPriorities) - node := nodes[0] - var err error jobs := append( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 20), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 20)..., + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 20), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 20)..., ) - reqs := PodRequirementsFromLegacySchedulerJobs(jobs, testfixtures.TestPriorityClasses) - for _, req := range reqs { - node, err = nodedb.BindPodToNode(req, node) - require.NoError(t, err) - } - nodes[0] = node + node := testfixtures.Test32CpuNode(testfixtures.TestPriorities) + nodeDb, err := NewNodeDb() + require.NoError(t, err) + txn := nodeDb.Txn(true) + err = nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, jobs, node) + require.NoError(t, err) + entry, err := nodeDb.GetNode(node.Id) + require.NoError(t, err) jobRepo := NewInMemoryJobRepository(testfixtures.TestPriorityClasses) for _, job := range jobs { @@ -52,7 +51,7 @@ func TestEvictOversubscribed(t *testing.T) { 1, nil, ) - it := NewInMemoryNodeIterator(nodes) + it := NewInMemoryNodeIterator([]*nodedb.Node{entry}) result, err := evictor.Evict(context.Background(), it) require.NoError(t, err) @@ -61,7 +60,7 @@ func TestEvictOversubscribed(t *testing.T) { slices.Sort(priorities) for nodeId, node := range result.AffectedNodesById { for _, p := range priorities { - for resourceType, q := range node.AllocatableByPriorityAndResource[p].Resources { + for resourceType, q := range node.AllocatableByPriority[p].Resources { assert.NotEqual(t, -1, q.Cmp(resource.Quantity{}), "resource %s oversubscribed by %s on node %s", resourceType, q.String(), nodeId) } } @@ -70,16 +69,16 @@ func TestEvictOversubscribed(t *testing.T) { type InMemoryNodeIterator struct { i int - nodes []*schedulerobjects.Node + nodes []*nodedb.Node } -func NewInMemoryNodeIterator(nodes []*schedulerobjects.Node) *InMemoryNodeIterator { +func NewInMemoryNodeIterator(nodes []*nodedb.Node) *InMemoryNodeIterator { return &InMemoryNodeIterator{ nodes: slices.Clone(nodes), } } -func (it *InMemoryNodeIterator) NextNode() *schedulerobjects.Node { +func (it *InMemoryNodeIterator) NextNode() *nodedb.Node { if it.i >= len(it.nodes) { return nil } @@ -111,10 +110,9 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds []SchedulingRound // Map from queue to the priority factor associated with that queue. PriorityFactorByQueue map[string]float64 - // Initial resource usage for all queues. - // This value is used across all rounds, + // Initial resource usage for all queues. This value is used across all rounds, // i.e., we don't update it based on preempted/scheduled jobs. 
- InitialAllocationByQueue map[string]schedulerobjects.QuantityByPriorityAndResourceType + InitialAllocationByQueueAndPriorityClass map[string]schedulerobjects.QuantityByTAndResourceType[string] // Total resources across all clusters. // If empty, it is computed as the total resources across the provided nodes. TotalResources schedulerobjects.ResourceList @@ -127,7 +125,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 31), @@ -135,7 +133,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 32), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 32), }, ExpectedScheduledIndices: map[string][]int{ "B": testfixtures.IntRange(0, 15), @@ -148,7 +146,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "C": testfixtures.N1CpuJobs("C", testfixtures.PriorityClass0, 10), + "C": testfixtures.N1Cpu4GiJobs("C", testfixtures.PriorityClass0, 10), }, ExpectedScheduledIndices: map[string][]int{ "C": testfixtures.IntRange(0, 9), @@ -165,9 +163,9 @@ func TestPreemptingQueueScheduler(t *testing.T) { { // The system should be in steady-state; nothing should be scheduled/preempted. JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 1), - "C": testfixtures.N1CpuJobs("C", testfixtures.PriorityClass0, 1), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 1), + "C": testfixtures.N1Cpu4GiJobs("C", testfixtures.PriorityClass0, 1), }, }, }, @@ -183,7 +181,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 31), @@ -191,7 +189,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 32), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 32), }, ExpectedScheduledIndices: map[string][]int{ "B": testfixtures.IntRange(0, 20), @@ -205,8 +203,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { { // The system should be in steady-state; nothing should be scheduled/preempted. 
JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 1), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 1), }, }, }, @@ -221,7 +219,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 31), @@ -229,7 +227,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 32), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 32), }, ExpectedScheduledIndices: map[string][]int{ "B": testfixtures.IntRange(0, 20), @@ -243,8 +241,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { { // The system should be in steady-state; nothing should be scheduled/preempted. JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 1), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 1), }, }, }, @@ -261,7 +259,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 31), @@ -269,7 +267,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 32), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 32), }, ExpectedScheduledIndices: map[string][]int{ "B": testfixtures.IntRange(0, 31), @@ -288,7 +286,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 32), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 32), }, ExpectedScheduledIndices: map[string][]int{ "B": testfixtures.IntRange(0, 31), @@ -296,7 +294,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 31), @@ -315,7 +313,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass2, 33), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass2, 33), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 31), @@ -323,7 +321,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass3, 1), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass3, 1), }, ExpectedScheduledIndices: map[string][]int{ "B": 
testfixtures.IntRange(0, 0), @@ -336,7 +334,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass2, 1), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass2, 1), }, }, {}, // Empty round to make sure nothing changes. @@ -352,7 +350,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1), + "A": testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 0), @@ -361,7 +359,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { { // These should all be scheduled onto the second node with no preemptions necessary. JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N32CpuJobs("A", testfixtures.PriorityClass1, 1), + "A": testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass1, 1), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 0), @@ -378,7 +376,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N32CpuJobs("A", testfixtures.PriorityClass1, 1), + "A": testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass1, 1), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 0), @@ -386,7 +384,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1), + "A": testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 0), @@ -395,7 +393,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { { // This job should preempt the priority-0 jobs. JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N32CpuJobs("A", testfixtures.PriorityClass2, 1), + "A": testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass2, 1), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 0), @@ -417,7 +415,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N32CpuJobs("A", testfixtures.PriorityClass1, 1), + "A": testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass1, 1), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 0), @@ -426,7 +424,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { { // These should all be scheduled onto the second node with no preemptions necessary. JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N32CpuJobs("B", testfixtures.PriorityClass0, 1), + "B": testfixtures.N32Cpu256GiJobs("B", testfixtures.PriorityClass0, 1), }, ExpectedScheduledIndices: map[string][]int{ "B": testfixtures.IntRange(0, 0), @@ -435,7 +433,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { { // These should all be scheduled onto the second node with no preemptions necessary. JobsByQueue: map[string][]*jobdb.Job{ - "C": testfixtures.N32CpuJobs("C", testfixtures.PriorityClass2, 1), + "C": testfixtures.N32Cpu256GiJobs("C", testfixtures.PriorityClass2, 1), }, ExpectedScheduledIndices: map[string][]int{ "C": testfixtures.IntRange(0, 0), @@ -444,7 +442,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { { // These should all be scheduled onto the second node with no preemptions necessary. 
JobsByQueue: map[string][]*jobdb.Job{ - "D": testfixtures.N32CpuJobs("D", testfixtures.PriorityClass3, 1), + "D": testfixtures.N32Cpu256GiJobs("D", testfixtures.PriorityClass3, 1), }, ExpectedScheduledIndices: map[string][]int{ "D": testfixtures.IntRange(0, 0), @@ -470,8 +468,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { { // Fill half of node 1 and half of node 2. JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 16), - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 16), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 16), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 16), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 15), @@ -481,7 +479,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { { // Schedule a gang filling the remaining space on both nodes. JobsByQueue: map[string][]*jobdb.Job{ - "C": testfixtures.WithGangAnnotationsJobs(testfixtures.N1CpuJobs("C", testfixtures.PriorityClass0, 32)), + "C": testfixtures.WithGangAnnotationsJobs(testfixtures.N1Cpu4GiJobs("C", testfixtures.PriorityClass0, 32)), }, ExpectedScheduledIndices: map[string][]int{ "C": testfixtures.IntRange(0, 31), @@ -491,7 +489,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { // Schedule jobs that requires preempting one job in the gang, // and assert that all jobs in the gang are preempted. JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 17), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 17), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 16), @@ -516,7 +514,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { { // Schedule a gang across two nodes. JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.WithGangAnnotationsJobs(testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 2)), + "A": testfixtures.WithGangAnnotationsJobs(testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 2)), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 1), @@ -549,7 +547,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { // to avoid them being urgency-preempted in the next round. JobsByQueue: map[string][]*jobdb.Job{ "A": testfixtures.WithGangAnnotationsJobs( - append(testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 32), testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1)...), + append(testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 32), testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1)...), ), }, ExpectedScheduledIndices: map[string][]int{ @@ -560,7 +558,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { // Schedule a that requires preempting one job in the gang, // and assert that all jobs in the gang are preempted. JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N32CpuJobs("B", testfixtures.PriorityClass1, 1), + "B": testfixtures.N32Cpu256GiJobs("B", testfixtures.PriorityClass1, 1), }, ExpectedScheduledIndices: map[string][]int{ "B": testfixtures.IntRange(0, 0), @@ -587,7 +585,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { { // Schedule a gang spanning nodes 1 and 2. 
JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.WithGangAnnotationsJobs(testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 33)), + "A": testfixtures.WithGangAnnotationsJobs(testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 33)), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 32), @@ -598,7 +596,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { // Make the one job landing on node 3 have priority 0, so it will be urgency-preempted next. JobsByQueue: map[string][]*jobdb.Job{ "A": testfixtures.WithGangAnnotationsJobs( - append(testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 31), testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1)...), + append(testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 31), testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1)...), ), }, ExpectedScheduledIndices: map[string][]int{ @@ -609,7 +607,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { // Schedule a job that requires preempting the one job on node 3. // Assert that the entire second gang is preempted and that the first gang isn't. JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N32CpuJobs("B", testfixtures.PriorityClass1, 1), + "B": testfixtures.N32Cpu256GiJobs("B", testfixtures.PriorityClass1, 1), }, ExpectedScheduledIndices: map[string][]int{ "B": testfixtures.IntRange(0, 0), @@ -632,7 +630,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 10), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 10), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 4), @@ -640,7 +638,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 10), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 10), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 4), @@ -657,7 +655,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 2), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 2), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 1), @@ -665,7 +663,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 10), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 10), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 4), @@ -687,7 +685,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 10), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 10), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 5), @@ -695,7 +693,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 10), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 10), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 5), @@ -712,7 +710,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: 
[]SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1), + "A": testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 0), @@ -720,7 +718,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N32CpuJobs("A", testfixtures.PriorityClass1, 1), + "A": testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass1, 1), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 0), @@ -742,7 +740,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1), + "A": testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 0), @@ -750,7 +748,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N32CpuJobs("B", testfixtures.PriorityClass1, 1), + "B": testfixtures.N32Cpu256GiJobs("B", testfixtures.PriorityClass1, 1), }, ExpectedScheduledIndices: map[string][]int{ "B": testfixtures.IntRange(0, 0), @@ -773,7 +771,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": append(testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N32CpuJobs("A", testfixtures.PriorityClass1, 1)...), + "A": append(testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass1, 1)...), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(1, 1), @@ -791,9 +789,9 @@ func TestPreemptingQueueScheduler(t *testing.T) { { JobsByQueue: map[string][]*jobdb.Job{ "A": append(append( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 10), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 10)...), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass2, 10)..., + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 10), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 10)...), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass2, 10)..., ), }, ExpectedScheduledIndices: map[string][]int{ @@ -802,7 +800,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass3, 24), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass3, 24), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 23), @@ -818,30 +816,35 @@ func TestPreemptingQueueScheduler(t *testing.T) { "A": 1, }, }, - "per-priority class limits": { + "MaximumResourceFractionPerQueue": { SchedulingConfig: testfixtures.WithPerPriorityLimitsConfig( - map[int32]map[string]float64{ - 0: {"cpu": 60.0 / 64.0}, - 1: {"cpu": 20.0 / 64.0}, + map[string]map[string]float64{ + testfixtures.PriorityClass0: {"cpu": 1.0 / 32.0}, + testfixtures.PriorityClass1: {"cpu": 2.0 / 32.0}, + testfixtures.PriorityClass2: {"cpu": 3.0 / 32.0}, + testfixtures.PriorityClass3: {"cpu": 4.0 / 32.0}, }, testfixtures.TestSchedulingConfig(), ), - Nodes: testfixtures.N32CpuNodes(2, testfixtures.TestPriorities), + Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": 
append( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 64), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 64)..., + "A": armadaslices.Concatenate( + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass2, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass3, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), ), }, ExpectedScheduledIndices: map[string][]int{ - "A": append(testfixtures.IntRange(0, 19), testfixtures.IntRange(64, 103)...), + "A": {0, 32, 33, 64, 65, 66, 96, 97, 98, 99}, }, }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), }, }, }, @@ -849,11 +852,13 @@ func TestPreemptingQueueScheduler(t *testing.T) { "A": 1, }, }, - "per-priority class limits multiple rounds": { + "MaximumResourceFractionPerQueue multiple rounds": { SchedulingConfig: testfixtures.WithPerPriorityLimitsConfig( - map[int32]map[string]float64{ - 0: {"cpu": 30.0 / 32.0}, - 1: {"cpu": 10.0 / 32.0}, + map[string]map[string]float64{ + testfixtures.PriorityClass0: {"cpu": 1.0 / 32.0}, + testfixtures.PriorityClass1: {"cpu": 2.0 / 32.0}, + testfixtures.PriorityClass2: {"cpu": 3.0 / 32.0}, + testfixtures.PriorityClass3: {"cpu": 4.0 / 32.0}, }, testfixtures.TestSchedulingConfig(), ), @@ -861,31 +866,57 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": append( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 5), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 10)..., + "A": armadaslices.Concatenate( + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), + ), + }, + ExpectedScheduledIndices: map[string][]int{ + "A": testfixtures.IntRange(0, 0), + }, + }, + { + JobsByQueue: map[string][]*jobdb.Job{ + "A": armadaslices.Concatenate( + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), + ), + }, + ExpectedScheduledIndices: map[string][]int{ + "A": testfixtures.IntRange(0, 1), + }, + }, + { + JobsByQueue: map[string][]*jobdb.Job{ + "A": armadaslices.Concatenate( + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass2, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), ), }, ExpectedScheduledIndices: map[string][]int{ - "A": testfixtures.IntRange(0, 14), + "A": testfixtures.IntRange(0, 2), }, }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": append( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 32), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32)..., + "A": armadaslices.Concatenate( + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass3, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass2, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), ), }, ExpectedScheduledIndices: map[string][]int{ - "A": append(testfixtures.IntRange(0, 4), testfixtures.IntRange(32, 41)...), + "A": testfixtures.IntRange(0, 3), }, }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": append( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 32), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32)..., + "A": 
armadaslices.Concatenate( + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass3, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass2, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), ), }, }, @@ -903,7 +934,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 64), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 64), }, ExpectedScheduledIndices: map[string][]int{ "B": testfixtures.IntRange(0, 15), @@ -911,8 +942,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 64), - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 64), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 64), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 64), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 15), @@ -920,8 +951,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 64), - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 64), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 64), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 64), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 7), @@ -930,8 +961,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 64), - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 64), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 64), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 64), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 7), @@ -950,8 +981,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass1, 32), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass1, 32), }, ExpectedScheduledIndices: map[string][]int{ "B": testfixtures.IntRange(0, 31), @@ -970,8 +1001,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass1, 31), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass1, 31), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 0), @@ -991,8 +1022,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass3, 32), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass3, 32), }, ExpectedScheduledIndices: 
map[string][]int{ "B": testfixtures.IntRange(0, 31), @@ -1011,7 +1042,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 16), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 16), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 15), @@ -1019,8 +1050,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 16), - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass1, 32), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 16), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass1, 32), }, ExpectedScheduledIndices: map[string][]int{ "B": testfixtures.IntRange(0, 15), @@ -1043,8 +1074,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { { JobsByQueue: map[string][]*jobdb.Job{ "A": armadaslices.Concatenate( - testfixtures.N16CpuJobs("A", testfixtures.PriorityClass2, 1), - testfixtures.N16CpuJobs("A", testfixtures.PriorityClass2NonPreemptible, 3), + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass2, 1), + testfixtures.N16Cpu128GiJobs("A", testfixtures.PriorityClass2NonPreemptible, 3), ), }, ExpectedScheduledIndices: map[string][]int{ @@ -1054,8 +1085,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { { JobsByQueue: map[string][]*jobdb.Job{ "B": armadaslices.Concatenate( - testfixtures.N16CpuJobs("B", testfixtures.PriorityClass3, 1), - testfixtures.N16CpuJobs("B", testfixtures.PriorityClass2NonPreemptible, 1), + testfixtures.N16Cpu128GiJobs("B", testfixtures.PriorityClass3, 1), + testfixtures.N16Cpu128GiJobs("B", testfixtures.PriorityClass2NonPreemptible, 1), ), }, ExpectedScheduledIndices: map[string][]int{ @@ -1080,7 +1111,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { Rounds: []SchedulingRound{ { JobsByQueue: map[string][]*jobdb.Job{ - "A": testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 1), + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 1), }, ExpectedScheduledIndices: map[string][]int{ "A": testfixtures.IntRange(0, 0), @@ -1088,13 +1119,152 @@ func TestPreemptingQueueScheduler(t *testing.T) { }, { JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass1, 1), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass1, 1), }, NodeIndicesToCordon: []int{0}, }, { JobsByQueue: map[string][]*jobdb.Job{ - "B": testfixtures.N1CpuJobs("B", testfixtures.PriorityClass1, 1), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass1, 1), + }, + }, + {}, // Empty round to make sure nothing changes. 
+ }, + PriorityFactorByQueue: map[string]float64{ + "A": 1, + "B": 1, + }, + }, + "ProtectedFractionOfFairShare": { + SchedulingConfig: testfixtures.WithProtectedFractionOfFairShareConfig( + 1.0, + testfixtures.TestSchedulingConfig(), + ), + Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + Rounds: []SchedulingRound{ + { + JobsByQueue: map[string][]*jobdb.Job{ + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 10), + }, + ExpectedScheduledIndices: map[string][]int{ + "A": testfixtures.IntRange(0, 9), + }, + }, + { + JobsByQueue: map[string][]*jobdb.Job{ + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass3, 22), + }, + ExpectedScheduledIndices: map[string][]int{ + "B": testfixtures.IntRange(0, 21), + }, + }, + { + JobsByQueue: map[string][]*jobdb.Job{ + "C": testfixtures.N1Cpu4GiJobs("C", testfixtures.PriorityClass0, 1), + }, + }, + {}, // Empty round to make sure nothing changes. + }, + PriorityFactorByQueue: map[string]float64{ + "A": 1, + "B": 1, + "C": 1, + }, + }, + "ProtectedFractionOfFairShare at limit": { + SchedulingConfig: testfixtures.WithProtectedFractionOfFairShareConfig( + 0.5, + testfixtures.TestSchedulingConfig(), + ), + Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + Rounds: []SchedulingRound{ + { + JobsByQueue: map[string][]*jobdb.Job{ + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 8), + }, + ExpectedScheduledIndices: map[string][]int{ + "A": testfixtures.IntRange(0, 7), + }, + }, + { + JobsByQueue: map[string][]*jobdb.Job{ + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass3, 24), + }, + ExpectedScheduledIndices: map[string][]int{ + "B": testfixtures.IntRange(0, 23), + }, + }, + { + JobsByQueue: map[string][]*jobdb.Job{ + "C": testfixtures.N1Cpu4GiJobs("C", testfixtures.PriorityClass0, 1), + }, + }, + {}, // Empty round to make sure nothing changes. + }, + PriorityFactorByQueue: map[string]float64{ + "A": 0.5, + "B": 1, + "C": 1, + }, + }, + "ProtectedFractionOfFairShare above limit": { + SchedulingConfig: testfixtures.WithProtectedFractionOfFairShareConfig( + 0.5, + testfixtures.TestSchedulingConfig(), + ), + Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + Rounds: []SchedulingRound{ + { + JobsByQueue: map[string][]*jobdb.Job{ + "A": testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 9), + }, + ExpectedScheduledIndices: map[string][]int{ + "A": testfixtures.IntRange(0, 8), + }, + }, + { + JobsByQueue: map[string][]*jobdb.Job{ + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass3, 23), + }, + ExpectedScheduledIndices: map[string][]int{ + "B": testfixtures.IntRange(0, 22), + }, + }, + { + JobsByQueue: map[string][]*jobdb.Job{ + "C": testfixtures.N1Cpu4GiJobs("C", testfixtures.PriorityClass0, 1), + }, + ExpectedScheduledIndices: map[string][]int{ + "C": testfixtures.IntRange(0, 0), + }, + ExpectedPreemptedIndices: map[string]map[int][]int{ + "A": { + 0: testfixtures.IntRange(8, 8), + }, + }, + }, + {}, // Empty round to make sure nothing changes. 
+ }, + PriorityFactorByQueue: map[string]float64{ + "A": 1, + "B": 1, + "C": 1, + }, + }, + "DominantResourceFairness": { + SchedulingConfig: testfixtures.WithDominantResourceFairnessConfig( + testfixtures.TestSchedulingConfig(), + ), + Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + Rounds: []SchedulingRound{ + { + JobsByQueue: map[string][]*jobdb.Job{ + "A": testfixtures.N1Cpu16GiJobs("A", testfixtures.PriorityClass0, 32), + "B": testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 32), + }, + ExpectedScheduledIndices: map[string][]int{ + "A": testfixtures.IntRange(0, 9), + "B": testfixtures.IntRange(0, 21), }, }, {}, // Empty round to make sure nothing changes. @@ -1107,8 +1277,14 @@ func TestPreemptingQueueScheduler(t *testing.T) { } for name, tc := range tests { t.Run(name, func(t *testing.T) { - nodeDb, err := CreateNodeDb(tc.Nodes) + nodeDb, err := NewNodeDb() require.NoError(t, err) + txn := nodeDb.Txn(true) + for _, node := range tc.Nodes { + err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, nil, node) + require.NoError(t, err) + } + txn.Commit() // Repo. for storing jobs to be queued. // The Redis job repo. doesn't order by pc, so we disable pc ordering here too. @@ -1118,7 +1294,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { // Accounting across scheduling rounds. roundByJobId := make(map[string]int) indexByJobId := make(map[string]int) - allocatedByQueueAndPriority := armadamaps.DeepCopy(tc.InitialAllocationByQueue) + allocatedByQueueAndPriorityClass := armadamaps.DeepCopy(tc.InitialAllocationByQueueAndPriorityClass) nodeIdByJobId := make(map[string]string) var jobIdsByGangId map[string]map[string]bool var gangIdByJobId map[string]string @@ -1149,11 +1325,10 @@ func TestPreemptingQueueScheduler(t *testing.T) { for roundIndex, reqIndices := range reqIndicesByRoundIndex { for _, reqIndex := range reqIndices { job := tc.Rounds[roundIndex].JobsByQueue[queue][reqIndex] - req := PodRequirementFromLegacySchedulerJob(job, tc.SchedulingConfig.Preemption.PriorityClasses) nodeId := nodeIdByJobId[job.GetId()] node, err := nodeDb.GetNode(nodeId) require.NoError(t, err) - node, err = nodedb.UnbindPodFromNode(req, node) + node, err = nodedb.UnbindJobFromNode(tc.SchedulingConfig.Preemption.PriorityClasses, job, node) require.NoError(t, err) err = nodeDb.Upsert(node) require.NoError(t, err) @@ -1169,8 +1344,8 @@ func TestPreemptingQueueScheduler(t *testing.T) { for _, j := range round.NodeIndicesToCordon { node, err := nodeDb.GetNode(tc.Nodes[j].Id) require.NoError(t, err) - node = node.DeepCopy() - node.Unschedulable = true + node = node.UnsafeCopy() + node.Taints = append(slices.Clone(node.Taints), nodedb.UnschedulableTaint()) err = nodeDb.Upsert(node) require.NoError(t, err) } @@ -1188,8 +1363,12 @@ func TestPreemptingQueueScheduler(t *testing.T) { tc.SchedulingConfig.ResourceScarcity, tc.TotalResources, ) + if tc.SchedulingConfig.FairnessModel == configuration.DominantResourceFairness { + sctx.EnableDominantResourceFairness(tc.SchedulingConfig.DominantResourceFairnessResourcesToConsider) + } for queue, priorityFactor := range tc.PriorityFactorByQueue { - err := sctx.AddQueueSchedulingContext(queue, priorityFactor, allocatedByQueueAndPriority[queue]) + weight := 1 / priorityFactor + err := sctx.AddQueueSchedulingContext(queue, weight, allocatedByQueueAndPriorityClass[queue]) require.NoError(t, err) } constraints := schedulerconstraints.SchedulingConstraintsFromSchedulingConfig( @@ -1203,6 +1382,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { 
constraints, tc.SchedulingConfig.Preemption.NodeEvictionProbability, tc.SchedulingConfig.Preemption.NodeOversubscriptionEvictionProbability, + tc.SchedulingConfig.Preemption.ProtectedFractionOfFairShare, repo, nodeDb, nodeIdByJobId, @@ -1217,35 +1397,30 @@ func TestPreemptingQueueScheduler(t *testing.T) { // Test resource accounting. for _, job := range result.PreemptedJobs { - req := PodRequirementFromLegacySchedulerJob(job, tc.SchedulingConfig.Preemption.PriorityClasses) - requests := schedulerobjects.ResourceListFromV1ResourceList(req.ResourceRequirements.Requests) - quantityByPriorityAndResourceType := schedulerobjects.QuantityByPriorityAndResourceType{ - req.Priority: requests, + m := allocatedByQueueAndPriorityClass[job.GetQueue()] + if m == nil { + m = make(schedulerobjects.QuantityByTAndResourceType[string]) + allocatedByQueueAndPriorityClass[job.GetQueue()] = m } - allocatedByQueueAndPriority[job.GetQueue()].Sub(quantityByPriorityAndResourceType) + m.SubV1ResourceList( + job.GetPriorityClassName(), + job.GetResourceRequirements().Requests, + ) } for _, job := range result.ScheduledJobs { - req := PodRequirementFromLegacySchedulerJob(job, tc.SchedulingConfig.Preemption.PriorityClasses) - requests := schedulerobjects.ResourceListFromV1ResourceList(req.ResourceRequirements.Requests) - quantityByPriorityAndResourceType := schedulerobjects.QuantityByPriorityAndResourceType{ - req.Priority: requests, - } - m := allocatedByQueueAndPriority[job.GetQueue()] + m := allocatedByQueueAndPriorityClass[job.GetQueue()] if m == nil { - m = make(schedulerobjects.QuantityByPriorityAndResourceType) + m = make(schedulerobjects.QuantityByTAndResourceType[string]) + allocatedByQueueAndPriorityClass[job.GetQueue()] = m } - m.Add(quantityByPriorityAndResourceType) - allocatedByQueueAndPriority[job.GetQueue()] = m - } - for queue, allocated := range allocatedByQueueAndPriority { - // Filter out explicit zeros to enable comparing with expected allocation. - allocatedByQueueAndPriority[queue] = armadamaps.Filter( - allocated, - func(_ int32, rl schedulerobjects.ResourceList) bool { - return !rl.IsZero() - }, + m.AddV1ResourceList( + job.GetPriorityClassName(), + job.GetResourceRequirements().Requests, ) } + for queue, qctx := range sctx.QueueSchedulingContexts { + assert.True(t, qctx.AllocatedByPriorityClass.Equal(allocatedByQueueAndPriorityClass[queue])) + } // Test that jobs are mapped to nodes correctly. 
for _, job := range result.PreemptedJobs { @@ -1322,7 +1497,7 @@ func TestPreemptingQueueScheduler(t *testing.T) { require.NoError(t, err) for node := it.NextNode(); node != nil; node = it.NextNode() { for _, p := range priorities { - for resourceType, q := range node.AllocatableByPriorityAndResource[p].Resources { + for resourceType, q := range node.AllocatableByPriority[p].Resources { assert.NotEqual(t, -1, q.Cmp(resource.Quantity{}), "resource %s oversubscribed by %s on node %s", resourceType, q.String(), node.Id) } } @@ -1352,68 +1527,77 @@ func BenchmarkPreemptingQueueScheduler(b *testing.B) { MaxPriorityFactor int }{ "1 node 1 queue 320 jobs": { - SchedulingConfig: testfixtures.WithNodeOversubscriptionEvictionProbabilityConfig( - 0, - testfixtures.WithNodeEvictionProbabilityConfig( - 0.1, - testfixtures.TestSchedulingConfig(), - ), - ), + SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - JobFunc: testfixtures.N1CpuJobs, + JobFunc: testfixtures.N1Cpu4GiJobs, NumQueues: 1, NumJobsPerQueue: 320, MinPriorityFactor: 1, MaxPriorityFactor: 1, }, + "1 node 10 queues 320 jobs": { + SchedulingConfig: testfixtures.TestSchedulingConfig(), + Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), + JobFunc: testfixtures.N1Cpu4GiJobs, + NumQueues: 10, + NumJobsPerQueue: 320, + MinPriorityFactor: 1, + MaxPriorityFactor: 1, + }, "10 nodes 1 queue 3200 jobs": { - SchedulingConfig: testfixtures.WithNodeEvictionProbabilityConfig( - 0.1, - testfixtures.TestSchedulingConfig(), - ), + SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(10, testfixtures.TestPriorities), - JobFunc: testfixtures.N1CpuJobs, + JobFunc: testfixtures.N1Cpu4GiJobs, NumQueues: 1, NumJobsPerQueue: 3200, MinPriorityFactor: 1, MaxPriorityFactor: 1, }, "10 nodes 10 queues 3200 jobs": { - SchedulingConfig: testfixtures.WithNodeEvictionProbabilityConfig( - 0.1, - testfixtures.TestSchedulingConfig(), - ), + SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(10, testfixtures.TestPriorities), - JobFunc: testfixtures.N1CpuJobs, + JobFunc: testfixtures.N1Cpu4GiJobs, NumQueues: 10, NumJobsPerQueue: 3200, MinPriorityFactor: 1, MaxPriorityFactor: 1, }, "100 nodes 1 queue 32000 jobs": { - SchedulingConfig: testfixtures.WithNodeEvictionProbabilityConfig( - 0.1, - testfixtures.TestSchedulingConfig(), - ), + SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(100, testfixtures.TestPriorities), - JobFunc: testfixtures.N1CpuJobs, + JobFunc: testfixtures.N1Cpu4GiJobs, NumQueues: 1, NumJobsPerQueue: 32000, MinPriorityFactor: 1, MaxPriorityFactor: 1, }, + "100 nodes 10 queues 32000 jobs": { + SchedulingConfig: testfixtures.TestSchedulingConfig(), + Nodes: testfixtures.N32CpuNodes(100, testfixtures.TestPriorities), + JobFunc: testfixtures.N1Cpu4GiJobs, + NumQueues: 10, + NumJobsPerQueue: 32000, + MinPriorityFactor: 1, + MaxPriorityFactor: 1, + }, "1000 nodes 1 queue 320000 jobs": { - SchedulingConfig: testfixtures.WithNodeEvictionProbabilityConfig( - 0.1, - testfixtures.TestSchedulingConfig(), - ), + SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(1000, testfixtures.TestPriorities), - JobFunc: testfixtures.N1CpuJobs, + JobFunc: testfixtures.N1Cpu4GiJobs, NumQueues: 1, NumJobsPerQueue: 320000, MinPriorityFactor: 1, MaxPriorityFactor: 1, }, + "1000 nodes 10 queues 320000 jobs": { + SchedulingConfig: 
testfixtures.TestSchedulingConfig(), + Nodes: testfixtures.N32CpuNodes(1000, testfixtures.TestPriorities), + JobFunc: testfixtures.N1Cpu4GiJobs, + NumQueues: 1, + NumJobsPerQueue: 32000, + MinPriorityFactor: 1, + MaxPriorityFactor: 1, + }, } for name, tc := range tests { b.Run(name, func(b *testing.B) { @@ -1425,10 +1609,16 @@ func BenchmarkPreemptingQueueScheduler(b *testing.B) { priorityFactorByQueue[queue] = float64(rand.Intn(tc.MaxPriorityFactor-tc.MinPriorityFactor+1) + tc.MinPriorityFactor) } - nodeDb, err := CreateNodeDb(tc.Nodes) + nodeDb, err := NewNodeDb() require.NoError(b, err) - repo := NewInMemoryJobRepository(testfixtures.TestPriorityClasses) - allocatedByQueueAndPriority := make(map[string]schedulerobjects.QuantityByPriorityAndResourceType) + txn := nodeDb.Txn(true) + for _, node := range tc.Nodes { + err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, nil, node) + require.NoError(b, err) + } + txn.Commit() + + jobRepo := NewInMemoryJobRepository(testfixtures.TestPriorityClasses) jobs := make([]interfaces.LegacySchedulerJob, 0) for _, queueJobs := range jobsByQueue { @@ -1436,7 +1626,7 @@ func BenchmarkPreemptingQueueScheduler(b *testing.B) { jobs = append(jobs, job) } } - repo.EnqueueMany(jobs) + jobRepo.EnqueueMany(jobs) sctx := schedulercontext.NewSchedulingContext( "executor", @@ -1447,7 +1637,8 @@ func BenchmarkPreemptingQueueScheduler(b *testing.B) { nodeDb.TotalResources(), ) for queue, priorityFactor := range priorityFactorByQueue { - err := sctx.AddQueueSchedulingContext(queue, priorityFactor, allocatedByQueueAndPriority[queue]) + weight := 1 / priorityFactor + err := sctx.AddQueueSchedulingContext(queue, weight, make(schedulerobjects.QuantityByTAndResourceType[string])) require.NoError(b, err) } constraints := schedulerconstraints.SchedulingConstraintsFromSchedulingConfig( @@ -1461,7 +1652,8 @@ func BenchmarkPreemptingQueueScheduler(b *testing.B) { constraints, tc.SchedulingConfig.Preemption.NodeEvictionProbability, tc.SchedulingConfig.Preemption.NodeOversubscriptionEvictionProbability, - repo, + tc.SchedulingConfig.Preemption.ProtectedFractionOfFairShare, + jobRepo, nodeDb, nil, nil, @@ -1471,19 +1663,29 @@ func BenchmarkPreemptingQueueScheduler(b *testing.B) { require.NoError(b, err) require.Equal(b, 0, len(result.PreemptedJobs)) - // Create a new job repo without the scheduled jobs. 
- scheduledJobsById := make(map[string]interfaces.LegacySchedulerJob) + scheduledJobs := make(map[string]bool) for _, job := range result.ScheduledJobs { - scheduledJobsById[job.GetId()] = job + scheduledJobs[job.GetId()] = true } - unscheduledJobs := make([]interfaces.LegacySchedulerJob, 0) - for _, job := range jobs { - if _, ok := scheduledJobsById[job.GetId()]; !ok { - unscheduledJobs = append(unscheduledJobs, job) - } + for queue, jobs := range jobRepo.jobsByQueue { + jobRepo.jobsByQueue[queue] = armadaslices.Filter(jobs, func(job interfaces.LegacySchedulerJob) bool { return scheduledJobs[job.GetId()] }) } - repo = NewInMemoryJobRepository(testfixtures.TestPriorityClasses) - repo.EnqueueMany(unscheduledJobs) + + jobsByNodeId := make(map[string][]*jobdb.Job) + for _, job := range ScheduledJobsFromSchedulerResult[*jobdb.Job](result) { + nodeId := result.NodeIdByJobId[job.GetId()] + jobsByNodeId[nodeId] = append(jobsByNodeId[nodeId], job) + } + nodeDb, err = NewNodeDb() + require.NoError(b, err) + txn = nodeDb.Txn(true) + for _, node := range tc.Nodes { + err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, jobsByNodeId[node.Id], node) + require.NoError(b, err) + } + txn.Commit() + + allocatedByQueueAndPriorityClass := sctx.AllocatedByQueueAndPriority() b.ResetTimer() for n := 0; n < b.N; n++ { @@ -1496,7 +1698,8 @@ func BenchmarkPreemptingQueueScheduler(b *testing.B) { nodeDb.TotalResources(), ) for queue, priorityFactor := range priorityFactorByQueue { - err := sctx.AddQueueSchedulingContext(queue, priorityFactor, allocatedByQueueAndPriority[queue]) + weight := 1 / priorityFactor + err := sctx.AddQueueSchedulingContext(queue, weight, allocatedByQueueAndPriorityClass[queue]) require.NoError(b, err) } sch := NewPreemptingQueueScheduler( @@ -1504,7 +1707,8 @@ func BenchmarkPreemptingQueueScheduler(b *testing.B) { constraints, tc.SchedulingConfig.Preemption.NodeEvictionProbability, tc.SchedulingConfig.Preemption.NodeOversubscriptionEvictionProbability, - repo, + tc.SchedulingConfig.Preemption.ProtectedFractionOfFairShare, + jobRepo, nodeDb, nil, nil, diff --git a/internal/scheduler/queue_scheduler.go b/internal/scheduler/queue_scheduler.go index 9408015ec3c..956232b38f4 100644 --- a/internal/scheduler/queue_scheduler.go +++ b/internal/scheduler/queue_scheduler.go @@ -3,7 +3,6 @@ package scheduler import ( "container/heap" "context" - "math" "reflect" "time" @@ -63,7 +62,7 @@ func (sch *QueueScheduler) SkipUnsuccessfulSchedulingKeyCheck() { func (sch *QueueScheduler) Schedule(ctx context.Context) (*SchedulerResult, error) { log := ctxlogrus.Extract(ctx) - if ResourceListAsWeightedMillis(sch.schedulingContext.ResourceScarcity, sch.schedulingContext.TotalResources) == 0 { + if sch.schedulingContext.TotalResources.AsWeightedMillis(sch.schedulingContext.ResourceScarcity) == 0 { // This refers to resources available across all clusters, i.e., // it may include resources not currently considered for scheduling. log.Infof( @@ -72,8 +71,8 @@ func (sch *QueueScheduler) Schedule(ctx context.Context) (*SchedulerResult, erro ) return &SchedulerResult{}, nil } - if ResourceListAsWeightedMillis(sch.schedulingContext.ResourceScarcity, sch.gangScheduler.nodeDb.TotalResources()) == 0 { - // This refers to the resources currently considered for schedling. + if rl := sch.gangScheduler.nodeDb.TotalResources(); rl.AsWeightedMillis(sch.schedulingContext.ResourceScarcity) == 0 { + // This refers to the resources currently considered for scheduling. 
log.Infof( "no resources with non-zero weight available for scheduling in NodeDb: resource scarcity %v, total resources %v", sch.schedulingContext.ResourceScarcity, sch.gangScheduler.nodeDb.TotalResources(), @@ -112,8 +111,9 @@ func (sch *QueueScheduler) Schedule(ctx context.Context) (*SchedulerResult, erro } else if ok { for _, jctx := range gctx.JobSchedulingContexts { scheduledJobs = append(scheduledJobs, jctx.Job) - if jctx.PodSchedulingContext != nil && jctx.PodSchedulingContext.Node != nil { - nodeIdByJobId[jctx.JobId] = jctx.PodSchedulingContext.Node.Id + pctx := jctx.PodSchedulingContext + if pctx != nil && pctx.NodeId != "" { + nodeIdByJobId[jctx.JobId] = pctx.NodeId } } } else if schedulerconstraints.IsTerminalUnschedulableReason(unschedulableReason) { @@ -216,7 +216,6 @@ func (it *QueuedGangIterator) Peek() (*schedulercontext.GangSchedulingContext, e if unsuccessfulJctx, ok := it.schedulingContext.UnfeasibleSchedulingKeys[schedulingKey]; ok { jctx := &schedulercontext.JobSchedulingContext{ Created: time.Now(), - ExecutorId: it.schedulingContext.ExecutorId, JobId: job.GetId(), Job: job, UnschedulableReason: unsuccessfulJctx.UnschedulableReason, @@ -242,20 +241,18 @@ func (it *QueuedGangIterator) Peek() (*schedulercontext.GangSchedulingContext, e if len(gang) == gangCardinality { delete(it.jobsByGangId, gangId) it.next = schedulercontext.NewGangSchedulingContext( - jobSchedulingContextsFromJobs( - gang, - it.schedulingContext.ExecutorId, + schedulercontext.JobSchedulingContextsFromJobs( it.schedulingContext.PriorityClasses, + gang, ), ) return it.next, nil } } else { it.next = schedulercontext.NewGangSchedulingContext( - jobSchedulingContextsFromJobs( - []interfaces.LegacySchedulerJob{job}, - it.schedulingContext.ExecutorId, + schedulercontext.JobSchedulingContextsFromJobs( it.schedulingContext.PriorityClasses, + []interfaces.LegacySchedulerJob{job}, ), ) return it.next, nil @@ -277,12 +274,6 @@ type CandidateGangIterator struct { SchedulingContext *schedulercontext.SchedulingContext // If true, this iterator only yields gangs where all jobs are evicted. onlyYieldEvicted bool - // For each queue, weight is the inverse of the priority factor. - weightByQueue map[string]float64 - // Sum of all weights. - weightSum float64 - // Total weighted resources. - totalResourcesAsWeightedMillis int64 // Reusable buffer to avoid allocations. buffer schedulerobjects.ResourceList // Priority queue containing per-queue iterators. 
@@ -294,28 +285,10 @@ func NewCandidateGangIterator( sctx *schedulercontext.SchedulingContext, iteratorsByQueue map[string]*QueuedGangIterator, ) (*CandidateGangIterator, error) { - weightSum := 0.0 - weightByQueue := make(map[string]float64, len(iteratorsByQueue)) - for queue := range iteratorsByQueue { - qctx := sctx.QueueSchedulingContexts[queue] - if qctx == nil { - return nil, errors.Errorf("no scheduling context for queue %s", queue) - } - weight := 1 / math.Max(qctx.PriorityFactor, 1) - weightByQueue[queue] = weight - weightSum += weight - } - totalResourcesAsWeightedMillis := ResourceListAsWeightedMillis(sctx.ResourceScarcity, sctx.TotalResources) - if totalResourcesAsWeightedMillis < 1 { - totalResourcesAsWeightedMillis = 1 - } it := &CandidateGangIterator{ - SchedulingContext: sctx, - weightByQueue: weightByQueue, - weightSum: weightSum, - totalResourcesAsWeightedMillis: totalResourcesAsWeightedMillis, - buffer: schedulerobjects.NewResourceListWithDefaultSize(), - pq: make(QueueCandidateGangIteratorPQ, 0, len(iteratorsByQueue)), + SchedulingContext: sctx, + buffer: schedulerobjects.NewResourceListWithDefaultSize(), + pq: make(QueueCandidateGangIteratorPQ, 0, len(iteratorsByQueue)), } for queue, queueIt := range iteratorsByQueue { if _, err := it.updateAndPushPQItem(it.newPQItem(queue, queueIt)); err != nil { @@ -372,17 +345,11 @@ func (it *CandidateGangIterator) updatePQItem(item *QueueCandidateGangIteratorIt // fractionOfFairShareWithGctx returns the fraction of its fair share this queue would have if the jobs in gctx were scheduled. func (it *CandidateGangIterator) fractionOfFairShareWithGctx(gctx *schedulercontext.GangSchedulingContext) float64 { + qctx := it.SchedulingContext.QueueSchedulingContexts[gctx.Queue] it.buffer.Zero() - it.buffer.Add(it.SchedulingContext.QueueSchedulingContexts[gctx.Queue].Allocated) + it.buffer.Add(qctx.Allocated) it.buffer.Add(gctx.TotalResourceRequests) - queueWeight := it.weightByQueue[gctx.Queue] - if queueWeight == 0 { - return 1 - } else { - fairShare := queueWeight / it.weightSum - used := ResourceListAsWeightedMillis(it.SchedulingContext.ResourceScarcity, it.buffer) - return (float64(used) / float64(it.totalResourcesAsWeightedMillis)) / fairShare - } + return qctx.TotalCostForQueueWithAllocation(it.buffer) } // Clear removes the first item in the iterator. diff --git a/internal/scheduler/queue_scheduler_test.go b/internal/scheduler/queue_scheduler_test.go index 6489e743b6a..84d42dd3e36 100644 --- a/internal/scheduler/queue_scheduler_test.go +++ b/internal/scheduler/queue_scheduler_test.go @@ -34,7 +34,7 @@ func TestQueueScheduler(t *testing.T) { // Map from queue to the priority factor associated with that queue. PriorityFactorByQueue map[string]float64 // Initial resource usage for all queues. - InitialAllocatedByQueueAndPriority map[string]schedulerobjects.QuantityByPriorityAndResourceType + InitialAllocatedByQueueAndPriorityClass map[string]schedulerobjects.QuantityByTAndResourceType[string] // Nodes to be considered by the scheduler. Nodes []*schedulerobjects.Node // Jobs to try scheduling. 
@@ -48,41 +48,41 @@ func TestQueueScheduler(t *testing.T) { SchedulingConfig: testfixtures.TestSchedulingConfig(), PriorityFactorByQueue: map[string]float64{"A": 1.0}, Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Jobs: testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), + Jobs: testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), ExpectedScheduledIndices: testfixtures.IntRange(0, 31), }, "simple failure": { SchedulingConfig: testfixtures.TestSchedulingConfig(), PriorityFactorByQueue: map[string]float64{"A": 1.0}, Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Jobs: testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 33), + Jobs: testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 33), ExpectedScheduledIndices: testfixtures.IntRange(0, 31), }, "multiple nodes": { SchedulingConfig: testfixtures.TestSchedulingConfig(), PriorityFactorByQueue: map[string]float64{"A": 1.0}, Nodes: testfixtures.N32CpuNodes(2, testfixtures.TestPriorities), - Jobs: testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 64), + Jobs: testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 64), ExpectedScheduledIndices: testfixtures.IntRange(0, 63), }, "preempt lower-priority jobs": { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Jobs: armadaslices.Concatenate(testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 1)), + Jobs: armadaslices.Concatenate(testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 1)), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: testfixtures.IntRange(0, 1), }, "no preemption of higher-priority jobs": { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Jobs: armadaslices.Concatenate(testfixtures.N32CpuJobs("A", testfixtures.PriorityClass1, 1), testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1)), + Jobs: armadaslices.Concatenate(testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass1, 1), testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1)), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: testfixtures.IntRange(0, 0), }, "unschedulable jobs do not block schedulable jobs": { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Jobs: armadaslices.Concatenate(testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 10), testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1)), + Jobs: armadaslices.Concatenate(testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 10), testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1)), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{0, 11}, }, @@ -90,9 +90,9 @@ func TestQueueScheduler(t *testing.T) { SchedulingConfig: testfixtures.WithMaxJobsToScheduleConfig(2, testfixtures.TestSchedulingConfig()), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Jobs: armadaslices.Concatenate( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 10), - testfixtures.N1CpuJobs("A", 
testfixtures.PriorityClass0, 3), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 10), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 3), ), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{0, 11}, @@ -103,22 +103,22 @@ func TestQueueScheduler(t *testing.T) { Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Jobs: armadaslices.Concatenate( testfixtures.WithGangAnnotationsJobs( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 2), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 2), ), testfixtures.WithGangAnnotationsJobs( - testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 2), + testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 2), ), testfixtures.WithGangAnnotationsJobs( - testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 2), + testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 2), ), testfixtures.WithGangAnnotationsJobs( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 2), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 2), ), testfixtures.WithGangAnnotationsJobs( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 2), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 2), ), testfixtures.WithGangAnnotationsJobs( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 2), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 2), ), ), PriorityFactorByQueue: map[string]float64{"A": 1}, @@ -132,89 +132,44 @@ func TestQueueScheduler(t *testing.T) { ), PriorityFactorByQueue: map[string]float64{"A": 1.0}, Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Jobs: testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), + Jobs: testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), ExpectedScheduledIndices: testfixtures.IntRange(0, 16), ExpectedNeverAttemptedIndices: testfixtures.IntRange(17, 31), }, "PerPriorityLimits": { SchedulingConfig: testfixtures.WithPerPriorityLimitsConfig( - map[int32]map[string]float64{ - 0: {"cpu": 1.0}, - 1: {"cpu": 15.0 / 32.0}, - 2: {"cpu": 10.0 / 32.0}, - 3: {"cpu": 3.0 / 32.0}, + map[string]map[string]float64{ + testfixtures.PriorityClass0: {"cpu": 1.0 / 32.0}, + testfixtures.PriorityClass1: {"cpu": 2.0 / 32.0}, + testfixtures.PriorityClass2: {"cpu": 3.0 / 32.0}, + testfixtures.PriorityClass3: {"cpu": 4.0 / 32.0}, }, testfixtures.TestSchedulingConfig(), ), PriorityFactorByQueue: map[string]float64{"A": 1.0}, Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Jobs: armadaslices.Concatenate( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass3, 4), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass2, 8), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 6), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 18), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 2), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 2), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass1, 3), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass2, 3), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass2, 3), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass3, 4), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass3, 4), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), ), ExpectedScheduledIndices: 
armadaslices.Concatenate( - testfixtures.IntRange(0, 2), - testfixtures.IntRange(4, 10), - testfixtures.IntRange(12, 16), - testfixtures.IntRange(18, 34), + testfixtures.IntRange(0, 0), + testfixtures.IntRange(3, 4), + testfixtures.IntRange(8, 10), + testfixtures.IntRange(14, 17), ), }, - "PerPriorityLimits equal MaximumResourceFractionToSchedule": { - SchedulingConfig: testfixtures.WithPerPriorityLimitsConfig( - map[int32]map[string]float64{ - 0: {"cpu": 0.9}, // 28 cpu - 1: {"cpu": 0.9}, - }, testfixtures.TestSchedulingConfig()), - Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Jobs: armadaslices.Concatenate(testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 5), testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 5)), - PriorityFactorByQueue: map[string]float64{"A": 1}, - InitialAllocatedByQueueAndPriority: map[string]schedulerobjects.QuantityByPriorityAndResourceType{ - "A": { - 0: schedulerobjects.ResourceList{ - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse("13"), - }, - }, - 1: schedulerobjects.ResourceList{ - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse("14"), - }, - }, - }, - }, - ExpectedScheduledIndices: []int{0}, - }, - "limit hit at higher priority doesn't block jobs at lower priority": { - SchedulingConfig: testfixtures.WithPerPriorityLimitsConfig( - map[int32]map[string]float64{ - 0: {"cpu": 0.9}, // 28 cpu - 1: {"cpu": 0.5}, // 14 cpu - }, testfixtures.TestSchedulingConfig()), - Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Jobs: armadaslices.Concatenate(testfixtures.N1CpuJobs("A", testfixtures.PriorityClass1, 1), testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 5)), - PriorityFactorByQueue: map[string]float64{"A": 1}, - InitialAllocatedByQueueAndPriority: map[string]schedulerobjects.QuantityByPriorityAndResourceType{ - "A": { - 0: schedulerobjects.ResourceList{ - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse("7"), // out of 28 - }, - }, - 1: schedulerobjects.ResourceList{ - Resources: map[string]resource.Quantity{ - "cpu": resource.MustParse("20"), // out of 14, i.e., over the limit - }, - }, - }, - }, - ExpectedScheduledIndices: []int{1}, - }, "fairness two queues": { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Jobs: armadaslices.Concatenate(testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 32)), + Jobs: armadaslices.Concatenate(testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 32)), PriorityFactorByQueue: map[string]float64{"A": 1, "B": 1}, ExpectedScheduledIndices: armadaslices.Concatenate(testfixtures.IntRange(0, 15), testfixtures.IntRange(32, 47)), }, @@ -222,9 +177,9 @@ func TestQueueScheduler(t *testing.T) { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Jobs: armadaslices.Concatenate( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), - testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 32), - testfixtures.N1CpuJobs("C", testfixtures.PriorityClass0, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), + testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 32), + testfixtures.N1Cpu4GiJobs("C", testfixtures.PriorityClass0, 32), ), PriorityFactorByQueue: map[string]float64{ "A": 1, @@ -241,8 +196,8 @@ func 
TestQueueScheduler(t *testing.T) { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(3, testfixtures.TestPriorities), Jobs: armadaslices.Concatenate( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 96), - testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 96), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 96), + testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 96), ), PriorityFactorByQueue: map[string]float64{ "A": 1, @@ -257,9 +212,9 @@ func TestQueueScheduler(t *testing.T) { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(3, testfixtures.TestPriorities), Jobs: armadaslices.Concatenate( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 96), - testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 96), - testfixtures.N1CpuJobs("C", testfixtures.PriorityClass0, 96), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 96), + testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 96), + testfixtures.N1Cpu4GiJobs("C", testfixtures.PriorityClass0, 96), ), PriorityFactorByQueue: map[string]float64{ "A": 1, @@ -276,16 +231,16 @@ func TestQueueScheduler(t *testing.T) { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Jobs: armadaslices.Concatenate( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 32), - testfixtures.N1CpuJobs("B", testfixtures.PriorityClass0, 32), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 32), + testfixtures.N1Cpu4GiJobs("B", testfixtures.PriorityClass0, 32), ), PriorityFactorByQueue: map[string]float64{ "A": 1, "B": 1, }, - InitialAllocatedByQueueAndPriority: map[string]schedulerobjects.QuantityByPriorityAndResourceType{ + InitialAllocatedByQueueAndPriorityClass: map[string]schedulerobjects.QuantityByTAndResourceType[string]{ "A": { - 0: schedulerobjects.ResourceList{ + testfixtures.PriorityClass0: schedulerobjects.ResourceList{ Resources: map[string]resource.Quantity{ "cpu": resource.MustParse("100"), }, @@ -305,7 +260,7 @@ func TestQueueScheduler(t *testing.T) { }, testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), ), - Jobs: testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), + Jobs: testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: nil, }, @@ -320,7 +275,7 @@ func TestQueueScheduler(t *testing.T) { }, testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), ), - Jobs: testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 2), + Jobs: testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 2), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{0}, }, @@ -335,21 +290,21 @@ func TestQueueScheduler(t *testing.T) { }, testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), ), - Jobs: testfixtures.N32CpuJobs("A", testfixtures.PriorityClass1, 1), + Jobs: testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass1, 1), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{0}, }, "respect taints": { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.NTainted32CpuNodes(1, testfixtures.TestPriorities), - Jobs: armadaslices.Concatenate(testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1)), + Jobs: armadaslices.Concatenate(testfixtures.N1Cpu4GiJobs("A", 
testfixtures.PriorityClass0, 1), testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1)), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{1}, }, "minimum job size": { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), - Jobs: armadaslices.Concatenate(testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1)), + Jobs: armadaslices.Concatenate(testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1)), PriorityFactorByQueue: map[string]float64{"A": 1}, MinimumJobSize: map[string]resource.Quantity{ "cpu": resource.MustParse("2"), @@ -360,8 +315,8 @@ func TestQueueScheduler(t *testing.T) { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N8GpuNodes(2, testfixtures.TestPriorities), Jobs: armadaslices.Concatenate( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N1GpuJobs("A", testfixtures.PriorityClass0, 1), ), PriorityFactorByQueue: map[string]float64{"A": 1}, @@ -374,8 +329,8 @@ func TestQueueScheduler(t *testing.T) { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N8GpuNodes(2, testfixtures.TestPriorities), Jobs: armadaslices.Concatenate( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N1GpuJobs("A", testfixtures.PriorityClass0, 1), ), PriorityFactorByQueue: map[string]float64{"A": 1}, @@ -387,7 +342,7 @@ func TestQueueScheduler(t *testing.T) { "taints and tolerations": { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.NTainted32CpuNodes(1, testfixtures.TestPriorities), - Jobs: armadaslices.Concatenate(testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1)), + Jobs: armadaslices.Concatenate(testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1)), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{1}, }, @@ -397,14 +352,14 @@ func TestQueueScheduler(t *testing.T) { testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), testfixtures.WithLabelsNodes(map[string]string{"foo": "foo"}, testfixtures.N32CpuNodes(1, testfixtures.TestPriorities)), ), - Jobs: testfixtures.WithNodeSelectorJobs(map[string]string{"foo": "foo"}, testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 2)), + Jobs: testfixtures.WithNodeSelectorJobs(map[string]string{"foo": "foo"}, testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 2)), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{0}, }, "taints and tolerations (indexed)": { SchedulingConfig: testfixtures.WithIndexedTaintsConfig([]string{"largeJobsOnly"}, testfixtures.TestSchedulingConfig()), Nodes: testfixtures.NTainted32CpuNodes(1, testfixtures.TestPriorities), - Jobs: armadaslices.Concatenate(testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N32CpuJobs("A", 
testfixtures.PriorityClass0, 1)), + Jobs: armadaslices.Concatenate(testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1)), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{1}, }, @@ -414,7 +369,7 @@ func TestQueueScheduler(t *testing.T) { testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), testfixtures.WithLabelsNodes(map[string]string{"foo": "foo"}, testfixtures.N32CpuNodes(1, testfixtures.TestPriorities))..., ), - Jobs: testfixtures.WithNodeSelectorJobs(map[string]string{"foo": "foo"}, testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 2)), + Jobs: testfixtures.WithNodeSelectorJobs(map[string]string{"foo": "foo"}, testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 2)), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{0}, }, @@ -422,9 +377,9 @@ func TestQueueScheduler(t *testing.T) { SchedulingConfig: testfixtures.WithMaxQueueLookbackConfig(3, testfixtures.TestSchedulingConfig()), Nodes: testfixtures.N32CpuNodes(1, testfixtures.TestPriorities), Jobs: armadaslices.Concatenate( - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 3), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 3), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), ), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{0}, @@ -433,7 +388,7 @@ func TestQueueScheduler(t *testing.T) { "gang success": { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(2, testfixtures.TestPriorities), - Jobs: testfixtures.WithGangAnnotationsJobs(testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 2)), + Jobs: testfixtures.WithGangAnnotationsJobs(testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 2)), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{0, 1}, }, @@ -441,9 +396,9 @@ func TestQueueScheduler(t *testing.T) { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(3, testfixtures.TestPriorities), Jobs: armadaslices.Concatenate( - testfixtures.WithAnnotationsJobs(map[string]string{configuration.GangIdAnnotation: "my-gang", configuration.GangCardinalityAnnotation: "2"}, testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1)), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.WithAnnotationsJobs(map[string]string{configuration.GangIdAnnotation: "my-gang", configuration.GangCardinalityAnnotation: "2"}, testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1)), + testfixtures.WithAnnotationsJobs(map[string]string{configuration.GangIdAnnotation: "my-gang", configuration.GangCardinalityAnnotation: "2"}, testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1)), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.WithAnnotationsJobs(map[string]string{configuration.GangIdAnnotation: "my-gang", configuration.GangCardinalityAnnotation: "2"}, testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1)), ), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{0, 1, 2}, @@ -451,7 +406,7 @@ func TestQueueScheduler(t *testing.T) { "gang failure": { SchedulingConfig: 
testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(2, testfixtures.TestPriorities), - Jobs: testfixtures.WithGangAnnotationsJobs(testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 3)), + Jobs: testfixtures.WithGangAnnotationsJobs(testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 3)), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: nil, }, @@ -459,9 +414,9 @@ func TestQueueScheduler(t *testing.T) { SchedulingConfig: testfixtures.TestSchedulingConfig(), Nodes: testfixtures.N32CpuNodes(2, testfixtures.TestPriorities), Jobs: armadaslices.Concatenate( - testfixtures.WithAnnotationsJobs(map[string]string{configuration.GangIdAnnotation: "my-gang", configuration.GangCardinalityAnnotation: "2"}, testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1)), - testfixtures.N1CpuJobs("A", testfixtures.PriorityClass0, 1), - testfixtures.WithAnnotationsJobs(map[string]string{configuration.GangIdAnnotation: "my-gang", configuration.GangCardinalityAnnotation: "2"}, testfixtures.N32CpuJobs("A", testfixtures.PriorityClass0, 1)), + testfixtures.WithAnnotationsJobs(map[string]string{configuration.GangIdAnnotation: "my-gang", configuration.GangCardinalityAnnotation: "2"}, testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1)), + testfixtures.N1Cpu4GiJobs("A", testfixtures.PriorityClass0, 1), + testfixtures.WithAnnotationsJobs(map[string]string{configuration.GangIdAnnotation: "my-gang", configuration.GangCardinalityAnnotation: "2"}, testfixtures.N32Cpu256GiJobs("A", testfixtures.PriorityClass0, 1)), ), PriorityFactorByQueue: map[string]float64{"A": 1}, ExpectedScheduledIndices: []int{1}, @@ -469,8 +424,14 @@ func TestQueueScheduler(t *testing.T) { } for name, tc := range tests { t.Run(name, func(t *testing.T) { - nodeDb, err := CreateNodeDb(tc.Nodes) + nodeDb, err := NewNodeDb() require.NoError(t, err) + txn := nodeDb.Txn(true) + for _, node := range tc.Nodes { + err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, nil, node) + require.NoError(t, err) + } + txn.Commit() if tc.TotalResources.Resources == nil { // Default to NodeDb total. 
tc.TotalResources = nodeDb.TotalResources() @@ -498,7 +459,8 @@ func TestQueueScheduler(t *testing.T) { tc.TotalResources, ) for queue, priorityFactor := range tc.PriorityFactorByQueue { - err := sctx.AddQueueSchedulingContext(queue, priorityFactor, tc.InitialAllocatedByQueueAndPriority[queue]) + weight := 1 / priorityFactor + err := sctx.AddQueueSchedulingContext(queue, weight, tc.InitialAllocatedByQueueAndPriorityClass[queue]) require.NoError(t, err) } constraints := schedulerconstraints.SchedulingConstraintsFromSchedulingConfig( @@ -595,12 +557,11 @@ func TestQueueScheduler(t *testing.T) { for _, qctx := range sctx.QueueSchedulingContexts { for _, jctx := range qctx.SuccessfulJobSchedulingContexts { assert.NotNil(t, jctx.PodSchedulingContext) - assert.NotNil(t, jctx.PodSchedulingContext.Node) - assert.Equal(t, result.NodeIdByJobId[jctx.JobId], jctx.PodSchedulingContext.Node.Id) + assert.Equal(t, result.NodeIdByJobId[jctx.JobId], jctx.PodSchedulingContext.NodeId) } for _, jctx := range qctx.UnsuccessfulJobSchedulingContexts { if jctx.PodSchedulingContext != nil { - assert.Nil(t, jctx.PodSchedulingContext.Node) + assert.Equal(t, "", jctx.PodSchedulingContext.NodeId) } } } @@ -622,7 +583,7 @@ func TestQueueScheduler(t *testing.T) { continue } assert.Equal(t, nodeDb.NumNodes(), pctx.NumNodes) - _, _, isGangJob, err := GangIdAndCardinalityFromLegacySchedulerJob(jctx.Job, nil) + _, _, isGangJob, err := GangIdAndCardinalityFromLegacySchedulerJob(jctx.Job) require.NoError(t, err) if !isGangJob { numExcludedNodes := 0 @@ -640,8 +601,8 @@ func TestQueueScheduler(t *testing.T) { } } -func CreateNodeDb(nodes []*schedulerobjects.Node) (*nodedb.NodeDb, error) { - db, err := nodedb.NewNodeDb( +func NewNodeDb() (*nodedb.NodeDb, error) { + nodeDb, err := nodedb.NewNodeDb( testfixtures.TestPriorityClasses, testfixtures.TestMaxExtraNodesToConsider, testfixtures.TestResources, @@ -651,8 +612,5 @@ func CreateNodeDb(nodes []*schedulerobjects.Node) (*nodedb.NodeDb, error) { if err != nil { return nil, err } - if err := db.UpsertMany(nodes); err != nil { - return nil, err - } - return db, nil + return nodeDb, nil } diff --git a/internal/scheduler/reports.go b/internal/scheduler/reports.go index 0fd5415de4d..aefef6eb884 100644 --- a/internal/scheduler/reports.go +++ b/internal/scheduler/reports.go @@ -136,13 +136,13 @@ func (repo *SchedulingContextRepository) addSchedulingContext(sctx *schedulercon mostRecentSuccessfulByExecutor := *repo.mostRecentSuccessfulByExecutor.Load() mostRecentSuccessfulByExecutor = maps.Clone(mostRecentSuccessfulByExecutor) - if !sctx.ScheduledResourcesByPriority.IsZero() { + if !sctx.ScheduledResourcesByPriorityClass.IsZero() { mostRecentSuccessfulByExecutor[sctx.ExecutorId] = sctx } mostRecentPreemptingByExecutor := *repo.mostRecentPreemptingByExecutor.Load() mostRecentPreemptingByExecutor = maps.Clone(mostRecentPreemptingByExecutor) - if !sctx.EvictedResourcesByPriority.IsZero() { + if !sctx.EvictedResourcesByPriorityClass.IsZero() { mostRecentPreemptingByExecutor[sctx.ExecutorId] = sctx } @@ -190,7 +190,7 @@ func (repo *SchedulingContextRepository) addSchedulingContextForQueues(sctx *sch mostRecentByExecutorByQueue[queue] = SchedulingContextByExecutor{executorId: sctx} } - if !qctx.ScheduledResourcesByPriority.IsZero() { + if !qctx.ScheduledResourcesByPriorityClass.IsZero() { if previous := mostRecentSuccessfulByExecutorByQueue[queue]; previous != nil { previous = maps.Clone(previous) previous[executorId] = sctx @@ -200,7 +200,7 @@ func (repo *SchedulingContextRepository) 
addSchedulingContextForQueues(sctx *sch } } - if !qctx.EvictedResourcesByPriority.IsZero() { + if !qctx.EvictedResourcesByPriorityClass.IsZero() { if previous := mostRecentPreemptingByExecutorByQueue[queue]; previous != nil { previous = maps.Clone(previous) previous[executorId] = sctx diff --git a/internal/scheduler/reports_test.go b/internal/scheduler/reports_test.go index 110ed1fe2d5..fc96cc1b25e 100644 --- a/internal/scheduler/reports_test.go +++ b/internal/scheduler/reports_test.go @@ -246,20 +246,17 @@ func withSuccessfulJobSchedulingContext(sctx *schedulercontext.SchedulingContext } qctx := sctx.QueueSchedulingContexts[queue] if qctx == nil { - if err := sctx.AddQueueSchedulingContext(queue, 1.0, make(schedulerobjects.QuantityByPriorityAndResourceType)); err != nil { + if err := sctx.AddQueueSchedulingContext(queue, 1.0, make(schedulerobjects.QuantityByTAndResourceType[string])); err != nil { panic(err) } qctx = sctx.QueueSchedulingContexts[queue] qctx.SchedulingContext = nil qctx.Created = time.Time{} } - qctx.SuccessfulJobSchedulingContexts[jobId] = &schedulercontext.JobSchedulingContext{ - ExecutorId: sctx.ExecutorId, - JobId: jobId, - } + qctx.SuccessfulJobSchedulingContexts[jobId] = &schedulercontext.JobSchedulingContext{JobId: jobId} rl := schedulerobjects.ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}} - qctx.ScheduledResourcesByPriority.AddResourceList(0, rl) - sctx.ScheduledResourcesByPriority.AddResourceList(0, rl) + qctx.ScheduledResourcesByPriorityClass.AddResourceList("foo", rl) + sctx.ScheduledResourcesByPriorityClass.AddResourceList("foo", rl) return sctx } @@ -269,7 +266,7 @@ func withPreemptingJobSchedulingContext(sctx *schedulercontext.SchedulingContext } qctx := sctx.QueueSchedulingContexts[queue] if qctx == nil { - if err := sctx.AddQueueSchedulingContext(queue, 1.0, make(schedulerobjects.QuantityByPriorityAndResourceType)); err != nil { + if err := sctx.AddQueueSchedulingContext(queue, 1.0, make(schedulerobjects.QuantityByTAndResourceType[string])); err != nil { panic(err) } qctx = sctx.QueueSchedulingContexts[queue] @@ -278,8 +275,8 @@ func withPreemptingJobSchedulingContext(sctx *schedulercontext.SchedulingContext } qctx.EvictedJobsById[jobId] = true rl := schedulerobjects.ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}} - qctx.EvictedResourcesByPriority.AddResourceList(0, rl) - sctx.EvictedResourcesByPriority.AddResourceList(0, rl) + qctx.EvictedResourcesByPriorityClass.AddResourceList("foo", rl) + sctx.EvictedResourcesByPriorityClass.AddResourceList("foo", rl) return sctx } @@ -289,18 +286,14 @@ func withUnsuccessfulJobSchedulingContext(sctx *schedulercontext.SchedulingConte } qctx := sctx.QueueSchedulingContexts[queue] if qctx == nil { - if err := sctx.AddQueueSchedulingContext(queue, 1.0, make(schedulerobjects.QuantityByPriorityAndResourceType)); err != nil { + if err := sctx.AddQueueSchedulingContext(queue, 1.0, make(schedulerobjects.QuantityByTAndResourceType[string])); err != nil { panic(err) } qctx = sctx.QueueSchedulingContexts[queue] qctx.SchedulingContext = nil qctx.Created = time.Time{} } - qctx.UnsuccessfulJobSchedulingContexts[jobId] = &schedulercontext.JobSchedulingContext{ - ExecutorId: sctx.ExecutorId, - JobId: jobId, - UnschedulableReason: "unknown", - } + qctx.UnsuccessfulJobSchedulingContexts[jobId] = &schedulercontext.JobSchedulingContext{JobId: jobId, UnschedulableReason: "unknown"} return sctx } diff --git a/internal/scheduler/scheduler.go 
b/internal/scheduler/scheduler.go index 24a44336a99..31003cf15c8 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -209,8 +209,8 @@ func (s *Scheduler) cycle(ctx context.Context, updateAll bool, leaderToken Leade } events = append(events, expirationEvents...) + // Schedule jobs. if s.clock.Now().Sub(s.previousSchedulingRoundEnd) > s.schedulePeriod { - // Schedule jobs. overallSchedulerResult, err := s.schedulingAlgo.Schedule(ctx, txn, s.jobDb) if err != nil { return err @@ -222,8 +222,6 @@ func (s *Scheduler) cycle(ctx context.Context, updateAll bool, leaderToken Leade } events = append(events, resultEvents...) s.previousSchedulingRoundEnd = s.clock.Now() - } else { - log.Infof("skipping scheduling new jobs this cycle as a scheduling round ran less than %s ago", s.schedulePeriod) } // Publish to Pulsar. @@ -264,7 +262,7 @@ func (s *Scheduler) syncState(ctx context.Context) ([]*jobdb.Job, error) { // Try and retrieve the job from the jobDb. If it doesn't exist then create it. job := s.jobDb.GetById(txn, dbJob.JobID) if job == nil { - job, err = s.createSchedulerJob(&dbJob) + job, err = s.schedulerJobFromDatabaseJob(&dbJob) if err != nil { return nil, err } @@ -328,11 +326,11 @@ func (s *Scheduler) syncState(ctx context.Context) ([]*jobdb.Job, error) { func (s *Scheduler) createSchedulingInfoWithNodeAntiAffinityForAttemptedRuns(job *jobdb.Job) (*schedulerobjects.JobSchedulingInfo, error) { newSchedulingInfo := proto.Clone(job.JobSchedulingInfo()).(*schedulerobjects.JobSchedulingInfo) newSchedulingInfo.Version = job.JobSchedulingInfo().Version + 1 - podSchedulingRequirement := PodRequirementFromJobSchedulingInfo(newSchedulingInfo) - if podSchedulingRequirement == nil { + podRequirements := newSchedulingInfo.GetPodRequirements() + if podRequirements == nil { return nil, errors.Errorf("no pod scheduling requirement found for job %s", job.GetId()) } - newAffinity := podSchedulingRequirement.Affinity + newAffinity := podRequirements.Affinity if newAffinity == nil { newAffinity = &v1.Affinity{} } @@ -345,9 +343,7 @@ func (s *Scheduler) createSchedulingInfoWithNodeAntiAffinityForAttemptedRuns(job } } } - podSchedulingRequirement.Affinity = newAffinity - podSchedulingRequirement.ClearCachedSchedulingKey() - + podRequirements.Affinity = newAffinity return newSchedulingInfo, nil } @@ -356,12 +352,9 @@ func (s *Scheduler) addNodeAntiAffinitiesForAttemptedRunsIfSchedulable(job *jobd if err != nil { return nil, false, err } - podSchedulingRequirement := PodRequirementFromJobSchedulingInfo(schedulingInfoWithNodeAntiAffinity) - if podSchedulingRequirement == nil { - return nil, false, errors.Errorf("no pod scheduling requirement found for job %s", job.GetId()) - } - isSchedulable, _ := s.submitChecker.CheckPodRequirements(podSchedulingRequirement) - return job.WithJobSchedulingInfo(schedulingInfoWithNodeAntiAffinity), isSchedulable, nil + job = job.WithJobSchedulingInfo(schedulingInfoWithNodeAntiAffinity) + isSchedulable, _ := s.submitChecker.CheckJobDbJobs([]*jobdb.Job{job}) + return job, isSchedulable, nil } // eventsFromSchedulerResult generates necessary EventSequences from the provided SchedulerResult. 
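[Illustrative aside, not part of the patch] addNodeAntiAffinitiesForAttemptedRunsIfSchedulable now hands whole jobdb jobs to the submit checker instead of an extracted PodRequirements. The checker interface itself is not in this hunk; its shape, inferred from the call sites above and the testSubmitChecker double in scheduler_test.go further down, is roughly:

    // Interface name and import paths are assumed; the signatures are taken from the
    // CheckJobDbJobs/CheckApiJobs call sites and the test double in this patch.
    import (
        "github.com/armadaproject/armada/internal/scheduler/jobdb"
        "github.com/armadaproject/armada/pkg/api"
    )

    type SubmitScheduleChecker interface {
        // CheckApiJobs reports whether the given API jobs would pass submit checks,
        // returning a human-readable reason when they would not.
        CheckApiJobs(jobs []*api.Job) (bool, string)
        // CheckJobDbJobs performs the same check for jobs already in the scheduler's jobDb.
        CheckJobDbJobs(jobs []*jobdb.Job) (bool, string)
    }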
@@ -591,7 +584,7 @@ func (s *Scheduler) generateUpdateMessagesFromJob(job *jobdb.Job, jobRunErrors m if lastRun.Returned() { errorMessage := fmt.Sprintf("Maximum number of attempts (%d) reached - this job will no longer be retried", s.maxAttemptedRuns) if job.NumAttempts() < s.maxAttemptedRuns { - errorMessage = fmt.Sprintf("Job was attempeted %d times, and has been tried once on all nodes it can run on - this job will no longer be retried", job.NumAttempts()) + errorMessage = fmt.Sprintf("Job was attempted %d times, and has been tried once on all nodes it can run on - this job will no longer be retried", job.NumAttempts()) } runError = &armadaevents.Error{ Terminal: true, @@ -817,8 +810,8 @@ func (s *Scheduler) ensureDbUpToDate(ctx context.Context, pollInterval time.Dura } } -// createSchedulerJob creates a new scheduler job from a database job. -func (s *Scheduler) createSchedulerJob(dbJob *database.Job) (*jobdb.Job, error) { +// schedulerJobFromDatabaseJob creates a new scheduler job from a database job. +func (s *Scheduler) schedulerJobFromDatabaseJob(dbJob *database.Job) (*jobdb.Job, error) { schedulingInfo := &schedulerobjects.JobSchedulingInfo{} err := proto.Unmarshal(dbJob.SchedulingInfo, schedulingInfo) if err != nil { @@ -892,7 +885,7 @@ func updateSchedulerRun(run *jobdb.JobRun, dbRun *database.Run) *jobdb.JobRun { return run } -// updateSchedulerJob updates the scheduler job (in-place) to match the database job +// updateSchedulerJob updates the scheduler job in-place to match the database job. func updateSchedulerJob(job *jobdb.Job, dbJob *database.Job) (*jobdb.Job, error) { if dbJob.CancelRequested && !job.CancelRequested() { job = job.WithCancelRequested(true) diff --git a/internal/scheduler/scheduler_test.go b/internal/scheduler/scheduler_test.go index d43dfaa89a0..35ef528c668 100644 --- a/internal/scheduler/scheduler_test.go +++ b/internal/scheduler/scheduler_test.go @@ -512,7 +512,7 @@ func TestScheduler_TestCycle(t *testing.T) { expectedAffinity := createAntiAffinity(t, nodeIdLabel, tc.expectedNodeAntiAffinities) assert.Equal(t, expectedAffinity, affinity) } - podRequirements := PodRequirementFromJobSchedulingInfo(job.JobSchedulingInfo()) + podRequirements := job.PodRequirements() assert.NotNil(t, podRequirements) expectedQueuedVersion := int32(1) @@ -819,12 +819,20 @@ type testSubmitChecker struct { checkSuccess bool } -func (t *testSubmitChecker) CheckPodRequirements(podRequirement *schedulerobjects.PodRequirements) (bool, string) { - return t.checkSuccess, "" +func (t *testSubmitChecker) CheckApiJobs(_ []*api.Job) (bool, string) { + reason := "" + if !t.checkSuccess { + reason = "CheckApiJobs failed" + } + return t.checkSuccess, reason } -func (t *testSubmitChecker) CheckApiJobs(jobs []*api.Job) (bool, string) { - return t.checkSuccess, "2" +func (t *testSubmitChecker) CheckJobDbJobs(_ []*jobdb.Job) (bool, string) { + reason := "" + if !t.checkSuccess { + reason = "CheckJobDbJobs failed" + } + return t.checkSuccess, reason } // Test implementations of the interfaces needed by the Scheduler diff --git a/internal/scheduler/schedulerapp.go b/internal/scheduler/schedulerapp.go index fd1b8ade516..090edc93305 100644 --- a/internal/scheduler/schedulerapp.go +++ b/internal/scheduler/schedulerapp.go @@ -3,6 +3,7 @@ package scheduler import ( "fmt" "net" + "net/http" "strings" "time" @@ -23,6 +24,7 @@ import ( "github.com/armadaproject/armada/internal/common/auth" dbcommon "github.com/armadaproject/armada/internal/common/database" grpcCommon 
"github.com/armadaproject/armada/internal/common/grpc" + "github.com/armadaproject/armada/internal/common/health" "github.com/armadaproject/armada/internal/common/pulsarutils" "github.com/armadaproject/armada/internal/common/stringinterner" schedulerconfig "github.com/armadaproject/armada/internal/scheduler/configuration" @@ -36,6 +38,17 @@ func Run(config schedulerconfig.Configuration) error { logrusLogger := log.NewEntry(log.StandardLogger()) ctx = ctxlogrus.ToContext(ctx, logrusLogger) + ////////////////////////////////////////////////////////////////////////// + // Health Checks + ////////////////////////////////////////////////////////////////////////// + mux := http.NewServeMux() + + startupCompleteCheck := health.NewStartupCompleteChecker() + healthChecks := health.NewMultiChecker(startupCompleteCheck) + health.SetupHttpMux(mux, healthChecks) + shutdownHttpServer := common.ServeHttp(uint16(config.Http.Port), mux) + defer shutdownHttpServer() + // List of services to run concurrently. // Because we want to start services only once all input validation has been completed, // we add all services to a slice and start them together at the end of this function. @@ -115,7 +128,7 @@ func Run(config schedulerconfig.Configuration) error { defer grpcServer.GracefulStop() lis, err := net.Listen("tcp", fmt.Sprintf(":%d", config.Grpc.Port)) if err != nil { - return errors.WithMessage(err, "error setting up grpc server") + return errors.WithMessage(err, "error setting up gRPC server") } allowedPcs := config.Scheduling.Preemption.AllowedPriorities() executorServer, err := NewExecutorApi( @@ -127,6 +140,7 @@ func Run(config schedulerconfig.Configuration) error { config.Scheduling.MaximumJobsToSchedule, config.Scheduling.Preemption.NodeIdLabel, config.Scheduling.Preemption.PriorityClassNameOverride, + config.Pulsar.MaxAllowedMessageSize, ) if err != nil { return errors.WithMessage(err, "error creating executorApi") @@ -205,6 +219,9 @@ func Run(config schedulerconfig.Configuration) error { g.Go(service) } + // Mark startup as complete, will allow the health check to return healthy + startupCompleteCheck.MarkComplete() + return g.Wait() } @@ -223,7 +240,11 @@ func createLeaderController(config schedulerconfig.LeaderConfig) (LeaderControll if err != nil { return nil, errors.Wrapf(err, "Error creating kubernetes client") } - return NewKubernetesLeaderController(config, clientSet.CoordinationV1()), nil + leaderController := NewKubernetesLeaderController(config, clientSet.CoordinationV1()) + leaderStatusMetrics := NewLeaderStatusMetricsCollector(config.PodName) + leaderController.RegisterListener(leaderStatusMetrics) + prometheus.MustRegister(leaderStatusMetrics) + return leaderController, nil default: return nil, errors.Errorf("%s is not a value leader mode", config.Mode) } diff --git a/internal/scheduler/schedulerobjects/nodematching.go b/internal/scheduler/schedulerobjects/nodematching.go index a9e9668a5d4..6db3402526a 100644 --- a/internal/scheduler/schedulerobjects/nodematching.go +++ b/internal/scheduler/schedulerobjects/nodematching.go @@ -124,34 +124,34 @@ func (nodeType *NodeType) PodRequirementsMet(req *PodRequirements) (bool, PodReq // - 1: Pod can be scheduled without preempting any running pods. // If the requirements are not met, it returns the reason why. // If the requirements can't be parsed, an error is returned. 
-func (node *Node) PodRequirementsMet(priority int32, req *PodRequirements) (bool, int, PodRequirementsNotMetReason, error) { - matches, reason, err := node.StaticPodRequirementsMet(req) +func PodRequirementsMet(taints []v1.Taint, labels map[string]string, totalResources ResourceList, allocatableResources ResourceList, req *PodRequirements) (bool, int, PodRequirementsNotMetReason, error) { + matches, reason, err := StaticPodRequirementsMet(taints, labels, totalResources, req) if !matches || err != nil { return matches, 0, reason, err } - return node.DynamicPodRequirementsMet(priority, req) + return DynamicPodRequirementsMet(allocatableResources, req) } // StaticPodRequirementsMet checks if a pod can be scheduled onto this node, // accounting for taints, node selectors, node affinity, and total resources available on the node. -func (node *Node) StaticPodRequirementsMet(req *PodRequirements) (bool, PodRequirementsNotMetReason, error) { - matches, reason, err := podTolerationRequirementsMet(node.GetTaints(), req) +func StaticPodRequirementsMet(taints []v1.Taint, labels map[string]string, totalResources ResourceList, req *PodRequirements) (bool, PodRequirementsNotMetReason, error) { + matches, reason, err := podTolerationRequirementsMet(taints, req) if !matches || err != nil { return matches, reason, err } - matches, reason, err = podNodeSelectorRequirementsMet(node.GetLabels(), nil, req) + matches, reason, err = podNodeSelectorRequirementsMet(labels, nil, req) if !matches || err != nil { return matches, reason, err } - matches, reason, err = podNodeAffinityRequirementsMet(node.GetLabels(), req) + matches, reason, err = podNodeAffinityRequirementsMet(labels, req) if !matches || err != nil { return matches, reason, err } for resource, required := range req.ResourceRequirements.Requests { - available := node.TotalResources.Get(string(resource)) + available := totalResources.Get(string(resource)) if required.Cmp(available) == 1 { return false, &InsufficientResources{ Resource: string(resource), @@ -166,8 +166,8 @@ func (node *Node) StaticPodRequirementsMet(req *PodRequirements) (bool, PodRequi // DynamicPodRequirementsMet checks if a pod can be scheduled onto this node, // accounting for resources allocated to pods already assigned to this node. 
-func (node *Node) DynamicPodRequirementsMet(priority int32, req *PodRequirements) (bool, int, PodRequirementsNotMetReason, error) { - matches, reason, err := podResourceRequirementsMet(priority, node.AllocatableByPriorityAndResource, req) +func DynamicPodRequirementsMet(allocatableResources ResourceList, req *PodRequirements) (bool, int, PodRequirementsNotMetReason, error) { + matches, reason, err := podResourceRequirementsMet(allocatableResources, req) return matches, SchedulableScore, reason, err } @@ -235,9 +235,9 @@ func podNodeAffinityRequirementsMet(nodeLabels map[string]string, req *PodRequir return true, nil, nil } -func podResourceRequirementsMet(priority int32, allocatableResources AllocatableByPriorityAndResourceType, req *PodRequirements) (bool, PodRequirementsNotMetReason, error) { +func podResourceRequirementsMet(allocatableResources ResourceList, req *PodRequirements) (bool, PodRequirementsNotMetReason, error) { for resource, required := range req.ResourceRequirements.Requests { - available := allocatableResources.Get(priority, string(resource)) + available := allocatableResources.Get(string(resource)) if required.Cmp(available) == 1 { return false, &InsufficientResources{ Resource: string(resource), diff --git a/internal/scheduler/schedulerobjects/nodematching_test.go b/internal/scheduler/schedulerobjects/nodematching_test.go index a64db33a018..6e2d836fbec 100644 --- a/internal/scheduler/schedulerobjects/nodematching_test.go +++ b/internal/scheduler/schedulerobjects/nodematching_test.go @@ -375,7 +375,7 @@ func TestNodeSchedulingRequirementsMet(t *testing.T) { } for name, tc := range tests { t.Run(name, func(t *testing.T) { - matches, _, reason, err := tc.node.PodRequirementsMet(tc.req.Priority, tc.req) + matches, _, reason, err := PodRequirementsMet(tc.node.Taints, tc.node.Labels, tc.node.TotalResources, tc.node.AllocatableByPriorityAndResource[tc.req.Priority], tc.req) assert.NoError(t, err) if tc.expectSuccess { // TODO: Test score set correctly. assert.True(t, matches) diff --git a/internal/scheduler/schedulerobjects/nodetype.go b/internal/scheduler/schedulerobjects/nodetype.go index bc14d91cb51..d52d35e36fb 100644 --- a/internal/scheduler/schedulerobjects/nodetype.go +++ b/internal/scheduler/schedulerobjects/nodetype.go @@ -12,10 +12,6 @@ type ( labelsFilterFunc func(key, value string) bool ) -func NewNodeTypeFromNode(node *v1.Node, indexedTaints map[string]interface{}, indexedLabels map[string]interface{}) *NodeType { - return NewNodeType(node.Spec.Taints, node.GetLabels(), indexedTaints, indexedLabels) -} - func NewNodeType(taints []v1.Taint, labels map[string]string, indexedTaints map[string]interface{}, indexedLabels map[string]interface{}) *NodeType { if taints == nil { taints = make([]v1.Taint, 0) diff --git a/internal/scheduler/schedulerobjects/podutils.go b/internal/scheduler/schedulerobjects/podutils.go index 287401d2826..9bd476ca99a 100644 --- a/internal/scheduler/schedulerobjects/podutils.go +++ b/internal/scheduler/schedulerobjects/podutils.go @@ -212,25 +212,6 @@ func (skg *PodRequirementsSerialiser) AppendResourceList(out []byte, resourceLis return out } -// ClearCachedSchedulingKey clears any cached scheduling keys. -// Necessary after changing scheduling requirements to avoid inconsistency. 
-func (jobSchedulingInfo *JobSchedulingInfo) ClearCachedSchedulingKey() { - if jobSchedulingInfo == nil { - return - } - for _, objReq := range jobSchedulingInfo.ObjectRequirements { - if req := objReq.GetPodRequirements(); req != nil { - req.ClearCachedSchedulingKey() - } - } -} - -// ClearCachedSchedulingKey clears any cached scheduling key. -// Necessary after changing scheduling requirements to avoid inconsistency. -func (req *PodRequirements) ClearCachedSchedulingKey() { - req.CachedSchedulingKey = nil -} - func lessToleration(a, b v1.Toleration) bool { if a.Key < b.Key { return true diff --git a/internal/scheduler/schedulerobjects/requirements.go b/internal/scheduler/schedulerobjects/requirements.go deleted file mode 100644 index d2bd19e7ea3..00000000000 --- a/internal/scheduler/schedulerobjects/requirements.go +++ /dev/null @@ -1,11 +0,0 @@ -package schedulerobjects - -func (info *JobSchedulingInfo) GetTotalResourceRequest() ResourceList { - rv := ResourceList{} - for _, oreq := range info.ObjectRequirements { - if preq := oreq.GetPodRequirements(); preq != nil { - rv.AddV1ResourceList(preq.ResourceRequirements.Requests) - } - } - return rv -} diff --git a/internal/scheduler/schedulerobjects/resourcelist.go b/internal/scheduler/schedulerobjects/resourcelist.go index af90c4ffc8a..9a3c67eb5e3 100644 --- a/internal/scheduler/schedulerobjects/resourcelist.go +++ b/internal/scheduler/schedulerobjects/resourcelist.go @@ -2,6 +2,7 @@ package schedulerobjects import ( "fmt" + math "math" "strings" v1 "k8s.io/api/core/v1" @@ -39,75 +40,77 @@ func V1ResourceListFromResourceList(rl ResourceList) v1.ResourceList { return rv } -type QuantityByPriorityAndResourceType map[int32]ResourceList +type QuantityByTAndResourceType[T comparable] map[T]ResourceList -func (a QuantityByPriorityAndResourceType) DeepCopy() QuantityByPriorityAndResourceType { - rv := make(QuantityByPriorityAndResourceType) - for p, rl := range a { - rv[p] = rl.DeepCopy() +// type QuantityByPriorityAndResourceType QuantityByTAndResourceType[int32] + +func (a QuantityByTAndResourceType[T]) Add(b QuantityByTAndResourceType[T]) { + for p, rlb := range b { + a.AddResourceList(p, rlb) + } +} + +func (a QuantityByTAndResourceType[T]) AddResourceList(t T, rlb ResourceList) { + rla := a[t] + rla.Add(rlb) + a[t] = rla +} + +func (a QuantityByTAndResourceType[T]) DeepCopy() QuantityByTAndResourceType[T] { + rv := make(QuantityByTAndResourceType[T]) + for t, rl := range a { + rv[t] = rl.DeepCopy() } return rv } -func (a QuantityByPriorityAndResourceType) String() string { +func (a QuantityByTAndResourceType[T]) String() string { var sb strings.Builder i := 0 sb.WriteString("{") - for p, rl := range a { + for t, rl := range a { if i < len(a)-1 { - sb.WriteString(fmt.Sprintf("%d: %s, ", p, rl.CompactString())) + sb.WriteString(fmt.Sprintf("%v: %s, ", t, rl.CompactString())) } else { - sb.WriteString(fmt.Sprintf("%d: %s", p, rl.CompactString())) + sb.WriteString(fmt.Sprintf("%v: %s", t, rl.CompactString())) } } sb.WriteString("}") return sb.String() } -func (a QuantityByPriorityAndResourceType) Add(b QuantityByPriorityAndResourceType) { - for p, rlb := range b { - a.AddResourceList(p, rlb) +func (a QuantityByTAndResourceType[T]) Sub(b QuantityByTAndResourceType[T]) { + for t, rlb := range b { + a.SubResourceList(t, rlb) } } -func (a QuantityByPriorityAndResourceType) Sub(b QuantityByPriorityAndResourceType) { - for p, rlb := range b { - a.SubResourceList(p, rlb) - } -} - -func (a QuantityByPriorityAndResourceType) AddResourceList(priority 
int32, rlb ResourceList) { - rla := a[priority] - rla.Add(rlb) - a[priority] = rla -} - -func (a QuantityByPriorityAndResourceType) AddV1ResourceList(priority int32, rlb v1.ResourceList) { - rla := a[priority] +func (a QuantityByTAndResourceType[T]) AddV1ResourceList(t T, rlb v1.ResourceList) { + rla := a[t] rla.AddV1ResourceList(rlb) - a[priority] = rla + a[t] = rla } -func (a QuantityByPriorityAndResourceType) SubResourceList(priority int32, rlb ResourceList) { - rla := a[priority] +func (a QuantityByTAndResourceType[T]) SubResourceList(t T, rlb ResourceList) { + rla := a[t] rla.Sub(rlb) - a[priority] = rla + a[t] = rla } -func (a QuantityByPriorityAndResourceType) SubV1ResourceList(priority int32, rlb v1.ResourceList) { - rla := a[priority] +func (a QuantityByTAndResourceType[T]) SubV1ResourceList(t T, rlb v1.ResourceList) { + rla := a[t] rla.SubV1ResourceList(rlb) - a[priority] = rla + a[t] = rla } -func (a QuantityByPriorityAndResourceType) Equal(b QuantityByPriorityAndResourceType) bool { - for p, rla := range a { - if !rla.Equal(b[p]) { +func (a QuantityByTAndResourceType[T]) Equal(b QuantityByTAndResourceType[T]) bool { + for t, rla := range a { + if !rla.Equal(b[t]) { return false } } - for p, rlb := range b { - if !rlb.Equal(a[p]) { + for t, rlb := range b { + if !rlb.Equal(a[t]) { return false } } @@ -115,7 +118,7 @@ func (a QuantityByPriorityAndResourceType) Equal(b QuantityByPriorityAndResource } // IsZero returns true if all quantities in a are zero. -func (a QuantityByPriorityAndResourceType) IsZero() bool { +func (a QuantityByTAndResourceType[T]) IsZero() bool { for _, rl := range a { if !rl.IsZero() { return false @@ -125,7 +128,7 @@ func (a QuantityByPriorityAndResourceType) IsZero() bool { } // IsStrictlyNonNegative returns true if there are no quantities in a with value less than zero. -func (a QuantityByPriorityAndResourceType) IsStrictlyNonNegative() bool { +func (a QuantityByTAndResourceType[T]) IsStrictlyNonNegative() bool { for _, rl := range a { if !rl.IsStrictlyNonNegative() { return false @@ -134,7 +137,7 @@ func (a QuantityByPriorityAndResourceType) IsStrictlyNonNegative() bool { return true } -func (a QuantityByPriorityAndResourceType) AggregateByResource() ResourceList { +func (a QuantityByTAndResourceType[T]) AggregateByResource() ResourceList { rv := NewResourceListWithDefaultSize() for _, rl := range a { rv.Add(rl) @@ -147,7 +150,7 @@ func (a QuantityByPriorityAndResourceType) AggregateByResource() ResourceList { // where p1, ..., pn are the priorities in a, for each resource set explicitly in rl. // // If necessary to add resources to make up the difference, those resources are added at priority p. -func (a QuantityByPriorityAndResourceType) MaxAggregatedByResource(p int32, rl ResourceList) { +func (a QuantityByTAndResourceType[T]) MaxAggregatedByResource(t T, rl ResourceList) { aggregate := a.AggregateByResource() var difference ResourceList for t, q := range rl.Resources { @@ -158,7 +161,7 @@ func (a QuantityByPriorityAndResourceType) MaxAggregatedByResource(p int32, rl R } } if len(difference.Resources) > 0 { - a.AddResourceList(p, difference) + a.AddResourceList(t, difference) } } @@ -275,7 +278,7 @@ func (a ResourceList) IsStrictlyNonNegative() bool { return true } -// IsStrictlyLessOrEqual returns true if all quantities in a are strictly less or equal than those in b. +// IsStrictlyLessOrEqual returns false if there is a quantity in b greater than that in a and true otherwise. 
func (a ResourceList) IsStrictlyLessOrEqual(b ResourceList) bool { for t, q := range b.Resources { if q.Cmp(a.Get(t)) == -1 { @@ -301,6 +304,17 @@ func (rl ResourceList) CompactString() string { return sb.String() } +// AsWeightedMillis returns the linear combination of the milli values in rl with given weights. +// This function overflows for values greater than MaxInt64. E.g., 1Pi is fine but not 10Pi. +func (rl *ResourceList) AsWeightedMillis(weights map[string]float64) int64 { + var rv int64 + for t, w := range weights { + q := rl.Get(t) + rv += int64(math.Round(float64(q.MilliValue()) * w)) + } + return rv +} + func (rl *ResourceList) initialise() { if rl.Resources == nil { rl.Resources = make(map[string]resource.Quantity) @@ -310,7 +324,7 @@ func (rl *ResourceList) initialise() { // AllocatableByPriorityAndResourceType accounts for resources that can be allocated to pods of a given priority. // E.g., AllocatableByPriorityAndResourceType[5]["cpu"] is the amount of CPU available to pods with priority 5, // where alloctable resources = unused resources + resources allocated to lower-priority pods. -type AllocatableByPriorityAndResourceType QuantityByPriorityAndResourceType +type AllocatableByPriorityAndResourceType QuantityByTAndResourceType[int32] func NewAllocatableByPriorityAndResourceType(priorities []int32, rl ResourceList) AllocatableByPriorityAndResourceType { rv := make(AllocatableByPriorityAndResourceType) @@ -366,7 +380,7 @@ func (m AllocatableByPriorityAndResourceType) MarkAllocatableV1ResourceList(p in // AllocatedByPriorityAndResourceType accounts for resources allocated to pods of a given priority or lower. // E.g., AllocatedByPriorityAndResourceType[5]["cpu"] is the amount of CPU allocated to pods with priority 5 or lower. -type AllocatedByPriorityAndResourceType QuantityByPriorityAndResourceType +type AllocatedByPriorityAndResourceType QuantityByTAndResourceType[int32] func NewAllocatedByPriorityAndResourceType(priorities []int32) AllocatedByPriorityAndResourceType { rv := make(AllocatedByPriorityAndResourceType) diff --git a/internal/scheduler/schedulerobjects/resourcelist_test.go b/internal/scheduler/schedulerobjects/resourcelist_test.go index 7308124ba83..f744686ea66 100644 --- a/internal/scheduler/schedulerobjects/resourcelist_test.go +++ b/internal/scheduler/schedulerobjects/resourcelist_test.go @@ -8,11 +8,11 @@ import ( "k8s.io/apimachinery/pkg/api/resource" ) -func TestQuantityByPriorityAndResourceTypeAdd(t *testing.T) { +func TestQuantityByTAndResourceTypeAdd(t *testing.T) { tests := map[string]struct { - a QuantityByPriorityAndResourceType - b QuantityByPriorityAndResourceType - expected QuantityByPriorityAndResourceType + a QuantityByTAndResourceType[int32] + b QuantityByTAndResourceType[int32] + expected QuantityByTAndResourceType[int32] }{ "nil and nil": { a: nil, @@ -20,34 +20,34 @@ func TestQuantityByPriorityAndResourceTypeAdd(t *testing.T) { expected: nil, }, "empty and nil": { - a: QuantityByPriorityAndResourceType{}, + a: QuantityByTAndResourceType[int32]{}, b: nil, - expected: QuantityByPriorityAndResourceType{}, + expected: QuantityByTAndResourceType[int32]{}, }, "nil and empty": { a: nil, - b: QuantityByPriorityAndResourceType{}, + b: QuantityByTAndResourceType[int32]{}, expected: nil, }, "matching": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("3")}}, }, - b: QuantityByPriorityAndResourceType{ + b: QuantityByTAndResourceType[int32]{ 
0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("1")}}, }, - expected: QuantityByPriorityAndResourceType{ + expected: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("4")}}, }, }, "mismatched resources": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("3")}}, }, - b: QuantityByPriorityAndResourceType{ + b: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"bar": resource.MustParse("1")}}, }, - expected: QuantityByPriorityAndResourceType{ + expected: QuantityByTAndResourceType[int32]{ 0: ResourceList{ Resources: map[string]resource.Quantity{ "foo": resource.MustParse("3"), @@ -57,13 +57,13 @@ func TestQuantityByPriorityAndResourceTypeAdd(t *testing.T) { }, }, "mismatched priorities": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("3")}}, }, - b: QuantityByPriorityAndResourceType{ + b: QuantityByTAndResourceType[int32]{ 1: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("1")}}, }, - expected: QuantityByPriorityAndResourceType{ + expected: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("3")}}, 1: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("1")}}, }, @@ -77,11 +77,11 @@ func TestQuantityByPriorityAndResourceTypeAdd(t *testing.T) { } } -func TestQuantityByPriorityAndResourceTypeSub(t *testing.T) { +func TestQuantityByTAndResourceTypeSub(t *testing.T) { tests := map[string]struct { - a QuantityByPriorityAndResourceType - b QuantityByPriorityAndResourceType - expected QuantityByPriorityAndResourceType + a QuantityByTAndResourceType[int32] + b QuantityByTAndResourceType[int32] + expected QuantityByTAndResourceType[int32] }{ "nil and nil": { a: nil, @@ -89,34 +89,34 @@ func TestQuantityByPriorityAndResourceTypeSub(t *testing.T) { expected: nil, }, "empty and nil": { - a: QuantityByPriorityAndResourceType{}, + a: QuantityByTAndResourceType[int32]{}, b: nil, - expected: QuantityByPriorityAndResourceType{}, + expected: QuantityByTAndResourceType[int32]{}, }, "nil and empty": { a: nil, - b: QuantityByPriorityAndResourceType{}, + b: QuantityByTAndResourceType[int32]{}, expected: nil, }, "matching": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("3")}}, }, - b: QuantityByPriorityAndResourceType{ + b: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("1")}}, }, - expected: QuantityByPriorityAndResourceType{ + expected: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("2")}}, }, }, "mismatched resources": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("3")}}, }, - b: QuantityByPriorityAndResourceType{ + b: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"bar": resource.MustParse("1")}}, }, - expected: QuantityByPriorityAndResourceType{ + expected: QuantityByTAndResourceType[int32]{ 0: ResourceList{ 
Resources: map[string]resource.Quantity{ "foo": resource.MustParse("3"), @@ -126,13 +126,13 @@ func TestQuantityByPriorityAndResourceTypeSub(t *testing.T) { }, }, "mismatched priorities": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("3")}}, }, - b: QuantityByPriorityAndResourceType{ + b: QuantityByTAndResourceType[int32]{ 1: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("1")}}, }, - expected: QuantityByPriorityAndResourceType{ + expected: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("3")}}, 1: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("-1")}}, }, @@ -146,66 +146,66 @@ func TestQuantityByPriorityAndResourceTypeSub(t *testing.T) { } } -func TestQuantityByPriorityAndResourceTypeEqual(t *testing.T) { +func TestQuantityByTAndResourceTypeEqual(t *testing.T) { tests := map[string]struct { - a QuantityByPriorityAndResourceType - b QuantityByPriorityAndResourceType + a QuantityByTAndResourceType[int32] + b QuantityByTAndResourceType[int32] expected bool }{ "both empty": { - a: QuantityByPriorityAndResourceType{}, - b: QuantityByPriorityAndResourceType{}, + a: QuantityByTAndResourceType[int32]{}, + b: QuantityByTAndResourceType[int32]{}, expected: true, }, "both with an empty map": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{}, }, - b: QuantityByPriorityAndResourceType{ + b: QuantityByTAndResourceType[int32]{ 0: ResourceList{}, }, expected: true, }, "one empty map": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{}, }, - b: QuantityByPriorityAndResourceType{}, + b: QuantityByTAndResourceType[int32]{}, expected: true, }, "zero equals empty": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{ Resources: map[string]resource.Quantity{ "foo": resource.MustParse("0"), }, }, }, - b: QuantityByPriorityAndResourceType{ + b: QuantityByTAndResourceType[int32]{ 0: ResourceList{}, }, expected: true, }, "zero equals missing": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{}, }, - b: QuantityByPriorityAndResourceType{}, + b: QuantityByTAndResourceType[int32]{}, expected: true, }, "zero equals missing with empty ResourceList": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{ Resources: map[string]resource.Quantity{ "foo": resource.MustParse("0"), }, }, }, - b: QuantityByPriorityAndResourceType{}, + b: QuantityByTAndResourceType[int32]{}, expected: true, }, "simple equal": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{ Resources: map[string]resource.Quantity{ "cpu": resource.MustParse("1"), @@ -214,7 +214,7 @@ func TestQuantityByPriorityAndResourceTypeEqual(t *testing.T) { }, }, }, - b: QuantityByPriorityAndResourceType{ + b: QuantityByTAndResourceType[int32]{ 0: ResourceList{ Resources: map[string]resource.Quantity{ "cpu": resource.MustParse("1"), @@ -226,7 +226,7 @@ func TestQuantityByPriorityAndResourceTypeEqual(t *testing.T) { expected: true, }, "equal with two priorities": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{ Resources: map[string]resource.Quantity{ "cpu": resource.MustParse("1"), @@ -242,7 
+242,7 @@ func TestQuantityByPriorityAndResourceTypeEqual(t *testing.T) { }, }, }, - b: QuantityByPriorityAndResourceType{ + b: QuantityByTAndResourceType[int32]{ 0: ResourceList{ Resources: map[string]resource.Quantity{ "cpu": resource.MustParse("1"), @@ -261,7 +261,7 @@ func TestQuantityByPriorityAndResourceTypeEqual(t *testing.T) { expected: true, }, "simple unequal": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{ Resources: map[string]resource.Quantity{ "cpu": resource.MustParse("1"), @@ -270,7 +270,7 @@ func TestQuantityByPriorityAndResourceTypeEqual(t *testing.T) { }, }, }, - b: QuantityByPriorityAndResourceType{ + b: QuantityByTAndResourceType[int32]{ 0: ResourceList{ Resources: map[string]resource.Quantity{ "cpu": resource.MustParse("1"), @@ -282,7 +282,7 @@ func TestQuantityByPriorityAndResourceTypeEqual(t *testing.T) { expected: false, }, "unequal differing priority": { - a: QuantityByPriorityAndResourceType{ + a: QuantityByTAndResourceType[int32]{ 0: ResourceList{ Resources: map[string]resource.Quantity{ "cpu": resource.MustParse("1"), @@ -291,7 +291,7 @@ func TestQuantityByPriorityAndResourceTypeEqual(t *testing.T) { }, }, }, - b: QuantityByPriorityAndResourceType{ + b: QuantityByTAndResourceType[int32]{ 1: ResourceList{ Resources: map[string]resource.Quantity{ "cpu": resource.MustParse("1"), @@ -311,9 +311,9 @@ func TestQuantityByPriorityAndResourceTypeEqual(t *testing.T) { } } -func TestQuantityByPriorityAndResourceTypeIsStrictlyNonNegative(t *testing.T) { +func TestQuantityByTAndResourceTypeIsStrictlyNonNegative(t *testing.T) { tests := map[string]struct { - m QuantityByPriorityAndResourceType + m QuantityByTAndResourceType[int32] expected bool }{ "nil": { @@ -321,23 +321,23 @@ func TestQuantityByPriorityAndResourceTypeIsStrictlyNonNegative(t *testing.T) { expected: true, }, "empty": { - m: QuantityByPriorityAndResourceType{}, + m: QuantityByTAndResourceType[int32]{}, expected: true, }, "simple zero": { - m: QuantityByPriorityAndResourceType{ + m: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("0")}}, }, expected: true, }, "simple positive": { - m: QuantityByPriorityAndResourceType{ + m: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("1")}}, }, expected: true, }, "simple positive and negative": { - m: QuantityByPriorityAndResourceType{ + m: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"foo": resource.MustParse("1")}}, 1: ResourceList{Resources: map[string]resource.Quantity{"bar": resource.MustParse("-1")}}, }, @@ -351,80 +351,80 @@ func TestQuantityByPriorityAndResourceTypeIsStrictlyNonNegative(t *testing.T) { } } -func TestQuantityByPriorityAndResourceTypeMaxAggregatedByResource(t *testing.T) { +func TestQuantityByTAndResourceTypeMaxAggregatedByResource(t *testing.T) { tests := map[string]struct { - q QuantityByPriorityAndResourceType + q QuantityByTAndResourceType[int32] p int32 rl ResourceList - expected QuantityByPriorityAndResourceType + expected QuantityByTAndResourceType[int32] }{ "no change": { - q: QuantityByPriorityAndResourceType{ + q: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, }, p: 1, rl: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, - expected: QuantityByPriorityAndResourceType{ + expected: 
QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, }, }, "empty": { - q: QuantityByPriorityAndResourceType{}, + q: QuantityByTAndResourceType[int32]{}, p: 0, rl: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, - expected: QuantityByPriorityAndResourceType{ + expected: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, }, }, "add same resource at same priority": { - q: QuantityByPriorityAndResourceType{ + q: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, }, p: 0, rl: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("2")}}, - expected: QuantityByPriorityAndResourceType{ + expected: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("2")}}, }, }, "add different resource at same priority": { - q: QuantityByPriorityAndResourceType{ + q: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, }, p: 0, rl: ResourceList{Resources: map[string]resource.Quantity{"memory": resource.MustParse("1Gi")}}, - expected: QuantityByPriorityAndResourceType{ + expected: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1"), "memory": resource.MustParse("1Gi")}}, }, }, "add same resource at different priority": { - q: QuantityByPriorityAndResourceType{ + q: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, }, p: 1, rl: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("2")}}, - expected: QuantityByPriorityAndResourceType{ + expected: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, 1: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, }, }, "add different resource at different priority": { - q: QuantityByPriorityAndResourceType{ + q: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, }, p: 1, rl: ResourceList{Resources: map[string]resource.Quantity{"memory": resource.MustParse("1Gi")}}, - expected: QuantityByPriorityAndResourceType{ + expected: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")}}, 1: ResourceList{Resources: map[string]resource.Quantity{"memory": resource.MustParse("1Gi")}}, }, }, "multiple resources": { - q: QuantityByPriorityAndResourceType{ + q: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("100m"), "memory": resource.MustParse("50Mi")}}, }, p: 1, rl: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("10"), "memory": resource.MustParse("4000Mi")}}, - expected: QuantityByPriorityAndResourceType{ + expected: QuantityByTAndResourceType[int32]{ 0: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("100m"), "memory": resource.MustParse("50Mi")}}, 1: ResourceList{Resources: map[string]resource.Quantity{"cpu": resource.MustParse("9900m"), "memory": resource.MustParse("3950Mi")}}, }, @@ -940,3 +940,34 @@ func 
BenchmarkResourceListZeroAdd(b *testing.B) { rla.Add(rlb) } } + +func BenchmarkQuantityByTAndResourceTypeAdd(b *testing.B) { + dst := make(QuantityByTAndResourceType[string], 3) + src := QuantityByTAndResourceType[string]{ + "1": ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("1"), + "bar": resource.MustParse("2"), + "baz": resource.MustParse("3"), + }, + }, + "2": ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("1"), + "bar": resource.MustParse("2"), + "baz": resource.MustParse("3"), + }, + }, + "3": ResourceList{ + Resources: map[string]resource.Quantity{ + "foo": resource.MustParse("1"), + "bar": resource.MustParse("2"), + "baz": resource.MustParse("3"), + }, + }, + } + b.ResetTimer() + for n := 0; n < b.N; n++ { + dst.Add(src) + } +} diff --git a/internal/scheduler/schedulerobjects/schedulerobjects.pb.go b/internal/scheduler/schedulerobjects/schedulerobjects.pb.go index ad9614deea0..18cc6ced935 100644 --- a/internal/scheduler/schedulerobjects/schedulerobjects.pb.go +++ b/internal/scheduler/schedulerobjects/schedulerobjects.pb.go @@ -443,13 +443,13 @@ func (m *NodeType) GetUnsetIndexedLabels() map[string]string { return nil } -// Captures the resource usage of a particular queue -// in a given cluster. +// Captures the resource usage of a particular queue in a given cluster. type QueueClusterResourceUsage struct { - Created time.Time `protobuf:"bytes,1,opt,name=created,proto3,stdtime" json:"created"` - Queue string `protobuf:"bytes,2,opt,name=queue,proto3" json:"queue,omitempty"` - ExecutorId string `protobuf:"bytes,3,opt,name=executorId,proto3" json:"executorId,omitempty"` - ResourcesByPriority map[int32]ResourceList `protobuf:"bytes,4,rep,name=resourcesByPriority,proto3" json:"resourcesByPriority" protobuf_key:"varint,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + Created time.Time `protobuf:"bytes,1,opt,name=created,proto3,stdtime" json:"created"` + Queue string `protobuf:"bytes,2,opt,name=queue,proto3" json:"queue,omitempty"` + ExecutorId string `protobuf:"bytes,3,opt,name=executorId,proto3" json:"executorId,omitempty"` + ResourcesByPriority map[int32]ResourceList `protobuf:"bytes,4,rep,name=resourcesByPriority,proto3" json:"resourcesByPriority" protobuf_key:"varint,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` // Deprecated: Do not use. + ResourcesByPriorityClassName map[string]ResourceList `protobuf:"bytes,5,rep,name=resourcesByPriorityClassName,proto3" json:"resourcesByPriorityClassName" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` } func (m *QueueClusterResourceUsage) Reset() { *m = QueueClusterResourceUsage{} } @@ -506,6 +506,7 @@ func (m *QueueClusterResourceUsage) GetExecutorId() string { return "" } +// Deprecated: Do not use. func (m *QueueClusterResourceUsage) GetResourcesByPriority() map[int32]ResourceList { if m != nil { return m.ResourcesByPriority @@ -513,6 +514,13 @@ func (m *QueueClusterResourceUsage) GetResourcesByPriority() map[int32]ResourceL return nil } +func (m *QueueClusterResourceUsage) GetResourcesByPriorityClassName() map[string]ResourceList { + if m != nil { + return m.ResourcesByPriorityClassName + } + return nil +} + // A collection of QueueClusterResourceUsage // This is only needed to brige the gap between the redis based scheduler and the new scheduler. 
type ClusterResourceUsageReport struct { @@ -989,6 +997,7 @@ func init() { proto.RegisterMapType((map[string]string)(nil), "schedulerobjects.NodeType.LabelsEntry") proto.RegisterMapType((map[string]string)(nil), "schedulerobjects.NodeType.UnsetIndexedLabelsEntry") proto.RegisterType((*QueueClusterResourceUsage)(nil), "schedulerobjects.QueueClusterResourceUsage") + proto.RegisterMapType((map[string]ResourceList)(nil), "schedulerobjects.QueueClusterResourceUsage.ResourcesByPriorityClassNameEntry") proto.RegisterMapType((map[int32]ResourceList)(nil), "schedulerobjects.QueueClusterResourceUsage.ResourcesByPriorityEntry") proto.RegisterType((*ClusterResourceUsageReport)(nil), "schedulerobjects.ClusterResourceUsageReport") proto.RegisterMapType((map[string]*QueueClusterResourceUsage)(nil), "schedulerobjects.ClusterResourceUsageReport.ResourcesByQueueEntry") @@ -1007,141 +1016,143 @@ func init() { } var fileDescriptor_97dadc5fbd620721 = []byte{ - // 2132 bytes of a gzipped FileDescriptorProto + // 2173 bytes of a gzipped FileDescriptorProto 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xc4, 0x19, 0x4d, 0x6f, 0x1b, 0xc7, - 0x55, 0x2b, 0x52, 0x12, 0x39, 0x94, 0x25, 0x6a, 0x64, 0xd9, 0x2b, 0xda, 0xe6, 0x32, 0x4c, 0x1a, - 0x28, 0x8d, 0x43, 0x36, 0x4e, 0x81, 0x1a, 0x6e, 0x2f, 0xa2, 0xa5, 0xd6, 0x94, 0x1d, 0x4a, 0x5e, - 0x4a, 0x2d, 0x5a, 0xa0, 0x59, 0x2c, 0xb9, 0x23, 0x7a, 0xa3, 0xe5, 0x0c, 0xbd, 0x3b, 0xab, 0x86, - 0x39, 0xb7, 0x87, 0x22, 0x40, 0x1a, 0xb4, 0x41, 0x11, 0xa0, 0x40, 0x8b, 0xdc, 0xfa, 0x0b, 0x7a, - 0xe9, 0xad, 0xa7, 0x1c, 0x73, 0xec, 0x89, 0x2d, 0xec, 0x1b, 0x8f, 0x45, 0x7f, 0x40, 0x31, 0x33, - 0xbb, 0xdc, 0xe1, 0xee, 0x52, 0x94, 0xd3, 0xba, 0x3e, 0x71, 0xe7, 0x7d, 0x7f, 0xcd, 0x9b, 0x37, - 0x43, 0x70, 0xcf, 0xc6, 0x14, 0xb9, 0xd8, 0x74, 0xea, 0x5e, 0xf7, 0x09, 0xb2, 0x7c, 0x07, 0xb9, - 0xd1, 0x17, 0xe9, 0x7c, 0x88, 0xba, 0xd4, 0x4b, 0x00, 0x6a, 0x03, 0x97, 0x50, 0x02, 0x8b, 0x71, - 0x78, 0x49, 0xeb, 0x11, 0xd2, 0x73, 0x50, 0x9d, 0xe3, 0x3b, 0xfe, 0x69, 0x9d, 0xda, 0x7d, 0xe4, - 0x51, 0xb3, 0x3f, 0x10, 0x2c, 0xa5, 0xea, 0xd9, 0x5d, 0xaf, 0x66, 0x93, 0xba, 0x39, 0xb0, 0xeb, - 0x5d, 0xe2, 0xa2, 0xfa, 0xf9, 0xbb, 0xf5, 0x1e, 0xc2, 0xc8, 0x35, 0x29, 0xb2, 0x02, 0x9a, 0xef, - 0x46, 0x34, 0x7d, 0xb3, 0xfb, 0xc4, 0xc6, 0xc8, 0x1d, 0xd6, 0x07, 0x67, 0x3d, 0xce, 0xe4, 0x22, - 0x8f, 0xf8, 0x6e, 0x17, 0x25, 0xb8, 0xde, 0xe9, 0xd9, 0xf4, 0x89, 0xdf, 0xa9, 0x75, 0x49, 0xbf, - 0xde, 0x23, 0x3d, 0x12, 0xd9, 0xc0, 0x56, 0x7c, 0xc1, 0xbf, 0x04, 0x79, 0xf5, 0xcf, 0x19, 0x90, - 0xdb, 0xff, 0x08, 0x75, 0x7d, 0x4a, 0x5c, 0x58, 0x01, 0x8b, 0xb6, 0xa5, 0x2a, 0x15, 0x65, 0x27, - 0xdf, 0x28, 0x8e, 0x47, 0xda, 0xaa, 0x6d, 0xdd, 0x26, 0x7d, 0x9b, 0xa2, 0xfe, 0x80, 0x0e, 0xf5, - 0x45, 0xdb, 0x82, 0x6f, 0x82, 0xec, 0x80, 0x10, 0x47, 0x5d, 0xe4, 0x34, 0x70, 0x3c, 0xd2, 0xd6, - 0xd8, 0x5a, 0xa2, 0xe2, 0x78, 0xb8, 0x0b, 0x96, 0x30, 0xb1, 0x90, 0xa7, 0x66, 0x2a, 0x99, 0x9d, - 0xc2, 0x9d, 0x6b, 0xb5, 0x44, 0xe8, 0x5a, 0xc4, 0x42, 0x8d, 0xcd, 0xf1, 0x48, 0x5b, 0xe7, 0x84, - 0x92, 0x04, 0xc1, 0x09, 0x3f, 0x00, 0x6b, 0x7d, 0x1b, 0xdb, 0x7d, 0xbf, 0x7f, 0x40, 0x3a, 0x6d, - 0xfb, 0x63, 0xa4, 0x66, 0x2b, 0xca, 0x4e, 0xe1, 0x4e, 0x39, 0x29, 0x4b, 0x0f, 0x82, 0xf1, 0xc8, - 0xf6, 0x68, 0xe3, 0xda, 0x57, 0x23, 0x6d, 0x81, 0x19, 0x36, 0xcd, 0xad, 0xc7, 0xd6, 0x4c, 0xbe, - 0x63, 0x7a, 0xf4, 0x64, 0x60, 0x99, 0x14, 0x1d, 0xdb, 0x7d, 0xa4, 0x2e, 0x71, 0xf9, 0xa5, 0x9a, - 0x48, 0x5e, 0x2d, 0x0c, 0x5c, 0xed, 0x38, 0x4c, 0x5e, 0xa3, 0x14, 0xca, 0x9e, 0xe6, 0xfc, 0xec, - 0x1f, 0x9a, 0xa2, 0xc7, 0x60, 0xf0, 0x10, 0x6c, 0xfa, 
0xd8, 0xf4, 0x3c, 0xbb, 0x87, 0x91, 0x65, - 0x7c, 0x48, 0x3a, 0x86, 0xeb, 0x63, 0x4f, 0xcd, 0x57, 0x32, 0x3b, 0xf9, 0x86, 0x36, 0x1e, 0x69, - 0x37, 0x22, 0xf4, 0x01, 0xe9, 0xe8, 0x3e, 0x96, 0x83, 0xb0, 0x91, 0x40, 0x56, 0xff, 0xbd, 0x05, - 0xb2, 0x2c, 0x6a, 0x97, 0x4b, 0x13, 0x36, 0xfb, 0x48, 0x5d, 0x8d, 0xd2, 0xc4, 0xd6, 0x72, 0x9a, - 0xd8, 0x1a, 0xde, 0x05, 0x80, 0x05, 0x7b, 0xaf, 0xf3, 0x10, 0x0d, 0x3d, 0x15, 0x56, 0x32, 0x3b, - 0xab, 0x0d, 0x75, 0x3c, 0xd2, 0xae, 0x46, 0x50, 0x89, 0x47, 0xa2, 0x85, 0xef, 0x83, 0x3c, 0xf3, - 0xd7, 0xf0, 0x10, 0xc2, 0xbc, 0x1a, 0x2e, 0x0e, 0xdc, 0xd5, 0x20, 0x70, 0x39, 0xc6, 0xd4, 0x46, - 0x08, 0xf3, 0x90, 0x4d, 0x56, 0xf0, 0x10, 0xe4, 0x99, 0x70, 0x83, 0x0e, 0x07, 0x48, 0xcd, 0x04, - 0xe2, 0x52, 0x6b, 0xe6, 0x78, 0x38, 0x40, 0x8d, 0x6b, 0xe3, 0x91, 0x06, 0x71, 0xb0, 0x92, 0x2c, - 0xcc, 0x85, 0x30, 0x78, 0x0f, 0xac, 0x4e, 0x04, 0x1a, 0xb6, 0xc5, 0x6b, 0x27, 0x1b, 0xf9, 0xc6, - 0x68, 0x9a, 0x56, 0xdc, 0x37, 0x01, 0x85, 0xbb, 0x60, 0x99, 0x9a, 0x36, 0xa6, 0x9e, 0xba, 0xc4, - 0xab, 0x77, 0xbb, 0x26, 0x76, 0x62, 0xcd, 0x1c, 0xd8, 0x35, 0xb6, 0x5b, 0x6b, 0xe7, 0xef, 0xd6, - 0x8e, 0x19, 0x45, 0x63, 0x2d, 0xf0, 0x2b, 0x60, 0xd0, 0x83, 0x5f, 0x78, 0x04, 0x96, 0x1d, 0xb3, - 0x83, 0x1c, 0x4f, 0x5d, 0xe6, 0x22, 0xaa, 0xe9, 0xce, 0xd4, 0x1e, 0x71, 0xa2, 0x7d, 0x4c, 0xdd, - 0x61, 0xe3, 0xea, 0x78, 0xa4, 0x15, 0x05, 0x97, 0x64, 0x58, 0x20, 0x07, 0x1a, 0x60, 0x9d, 0x12, - 0x6a, 0x3a, 0x46, 0xb8, 0xf3, 0x3d, 0x75, 0xe5, 0xc5, 0xf6, 0x03, 0x67, 0x0f, 0x51, 0x9e, 0x1e, - 0x5b, 0xc3, 0xbf, 0x28, 0xe0, 0x0d, 0xd3, 0x71, 0x48, 0xd7, 0xa4, 0x66, 0xc7, 0x41, 0x46, 0x67, - 0x68, 0x0c, 0x5c, 0x9b, 0xb8, 0x36, 0x1d, 0x1a, 0x26, 0xb6, 0x26, 0x7a, 0xd5, 0x1c, 0xf7, 0xe8, - 0x07, 0x33, 0x3c, 0xda, 0x8d, 0x44, 0x34, 0x86, 0x47, 0x81, 0x80, 0x5d, 0x6c, 0x85, 0x8a, 0x84, - 0xaf, 0x3b, 0x81, 0x51, 0x15, 0x73, 0x0e, 0xb9, 0x3e, 0x97, 0x02, 0xba, 0x60, 0xd3, 0xa3, 0x26, - 0xe5, 0x16, 0x07, 0xdb, 0x8c, 0x65, 0x3c, 0xcf, 0xcd, 0x7c, 0x7b, 0x86, 0x99, 0x6d, 0xc6, 0xd1, - 0x18, 0x8a, 0xbd, 0xd5, 0xb4, 0x84, 0x55, 0xd7, 0x03, 0xab, 0xd6, 0xbd, 0x69, 0xac, 0x1e, 0x07, - 0x40, 0x1f, 0x6c, 0x06, 0x76, 0x21, 0x2b, 0xd4, 0x6b, 0x5b, 0x2a, 0xe0, 0x3a, 0x6f, 0x5f, 0x1c, - 0x1a, 0x64, 0x71, 0x41, 0xa1, 0x52, 0x35, 0x50, 0x5a, 0x34, 0x63, 0x68, 0x3d, 0x01, 0x81, 0x14, - 0xc0, 0x29, 0xb5, 0x4f, 0x7d, 0xe4, 0x23, 0xb5, 0x70, 0x59, 0xad, 0x8f, 0x19, 0xf9, 0x6c, 0xad, - 0x1c, 0xad, 0x27, 0x20, 0xcc, 0x59, 0x74, 0x6e, 0x77, 0x69, 0xd4, 0xc6, 0x0c, 0xdb, 0xf2, 0xd4, - 0xb5, 0x0b, 0xd5, 0xee, 0x0b, 0x8e, 0x30, 0x62, 0x5e, 0x4c, 0x2d, 0x8a, 0xa1, 0xf5, 0x04, 0x04, - 0x7e, 0xa9, 0x80, 0x32, 0x26, 0xd8, 0x30, 0xdd, 0xbe, 0x69, 0x99, 0x46, 0xe4, 0x78, 0xb4, 0x03, - 0xae, 0x70, 0x13, 0xbe, 0x37, 0xc3, 0x84, 0x16, 0xc1, 0xbb, 0x9c, 0x77, 0x12, 0x82, 0x49, 0xb5, - 0x0b, 0x6b, 0x5e, 0x0f, 0xac, 0xb9, 0x81, 0x67, 0x53, 0xea, 0x17, 0x21, 0xe1, 0x2e, 0xb8, 0xe2, - 0xe3, 0x40, 0x3b, 0xab, 0x50, 0x75, 0xbd, 0xa2, 0xec, 0xe4, 0x1a, 0x37, 0xc6, 0x23, 0xed, 0xfa, - 0x14, 0x42, 0xda, 0xd1, 0xd3, 0x1c, 0xf0, 0x13, 0x05, 0x5c, 0x0f, 0x3d, 0x32, 0x7c, 0xcf, 0xec, - 0xa1, 0x28, 0xb3, 0x45, 0xee, 0xdf, 0x77, 0x66, 0xf8, 0x17, 0x9a, 0x71, 0xc2, 0x98, 0xa6, 0xb2, - 0x5b, 0x1d, 0x8f, 0xb4, 0xb2, 0x9b, 0x82, 0x96, 0xcc, 0xb8, 0x9a, 0x86, 0x67, 0xa7, 0x96, 0x8b, - 0x06, 0xc4, 0xa5, 0x36, 0xee, 0x19, 0x51, 0x4b, 0xde, 0xe0, 0x07, 0x09, 0x3f, 0xb5, 0x26, 0xe8, - 0x56, 0xb2, 0xff, 0x6e, 0x24, 0x90, 0x25, 0x13, 0x14, 0xa4, 0x26, 0x07, 0x5f, 0x07, 0x99, 0x33, - 0x34, 0x0c, 0x0e, 0xaf, 0x8d, 0xf1, 0x48, 0xbb, 0x72, 0x86, 0x86, 0x92, 0x04, 
0x86, 0x85, 0x6f, - 0x81, 0xa5, 0x73, 0xd3, 0xf1, 0x51, 0x30, 0x66, 0xf0, 0x29, 0x81, 0x03, 0xe4, 0x29, 0x81, 0x03, - 0xee, 0x2d, 0xde, 0x55, 0x4a, 0x7f, 0x50, 0xc0, 0xb7, 0x2e, 0xd5, 0x76, 0x64, 0xed, 0x4b, 0x33, - 0xb5, 0x37, 0x65, 0xed, 0xf3, 0xfb, 0xeb, 0x3c, 0xeb, 0x7e, 0xad, 0x80, 0xab, 0x69, 0xdd, 0xe6, - 0x72, 0xa1, 0x78, 0x20, 0x1b, 0xb3, 0x76, 0xe7, 0x56, 0xd2, 0x18, 0x21, 0x54, 0x68, 0x98, 0x67, - 0xcb, 0x27, 0x0a, 0xd8, 0x4a, 0xed, 0x42, 0x97, 0x33, 0xe6, 0x7f, 0x1c, 0x99, 0x98, 0x35, 0x51, - 0xfd, 0xbe, 0x12, 0x6b, 0xce, 0xc0, 0x56, 0x6a, 0xcf, 0xfa, 0x06, 0x25, 0x9b, 0x9b, 0xab, 0xec, - 0xf7, 0x0a, 0xa8, 0xcc, 0x6b, 0x4f, 0xaf, 0xa4, 0x5a, 0x7f, 0xa3, 0x80, 0xed, 0x99, 0x7d, 0xe5, - 0x55, 0xe4, 0xa5, 0xfa, 0xc7, 0x2c, 0xc8, 0x85, 0xdd, 0x84, 0x8d, 0xbe, 0x4d, 0x31, 0xfa, 0x66, - 0xc5, 0xe8, 0x3b, 0x35, 0xc4, 0x2d, 0x4e, 0x0d, 0x6f, 0x8b, 0xdf, 0x74, 0x78, 0x3b, 0x9e, 0x0c, - 0x6f, 0xe2, 0xf6, 0xf2, 0xe6, 0xec, 0x49, 0xf4, 0x05, 0x06, 0xb8, 0x5f, 0x2a, 0x00, 0xfa, 0xd8, - 0x43, 0xb4, 0x89, 0x2d, 0xf4, 0x11, 0xb2, 0x04, 0xa7, 0x9a, 0xe5, 0x2a, 0xee, 0x5c, 0xa0, 0xe2, - 0x24, 0xc1, 0x24, 0xd4, 0x55, 0xc6, 0x23, 0xed, 0x66, 0x52, 0xa2, 0xa4, 0x3a, 0x45, 0xdf, 0xff, - 0xa3, 0x1f, 0xf7, 0xc1, 0xf5, 0x19, 0x36, 0xbf, 0x0c, 0x75, 0xd5, 0x7f, 0x65, 0xc0, 0x36, 0xaf, - 0xd1, 0xfb, 0x8e, 0xef, 0x51, 0xe4, 0x4e, 0x95, 0x2f, 0x6c, 0x82, 0x95, 0xae, 0x8b, 0xd8, 0xee, - 0xe2, 0x5a, 0x2f, 0xbe, 0xa6, 0x6c, 0x06, 0x15, 0x11, 0xb2, 0xf0, 0x5b, 0x4a, 0xb8, 0x60, 0x76, - 0x89, 0x63, 0x59, 0xb2, 0xeb, 0x69, 0xec, 0x54, 0x15, 0x14, 0xec, 0x62, 0x85, 0x82, 0x5b, 0x75, - 0xd3, 0xe2, 0x17, 0x9a, 0xbc, 0xb8, 0x7c, 0x44, 0x50, 0xf9, 0xf2, 0x11, 0x41, 0xe1, 0xef, 0x14, - 0x76, 0x02, 0x07, 0x7d, 0x20, 0x3a, 0xca, 0x82, 0x3a, 0xd9, 0x4b, 0xd6, 0xc9, 0x4c, 0xd7, 0x27, - 0xdb, 0x4c, 0x12, 0x23, 0x2a, 0xe7, 0x46, 0xe0, 0x66, 0x9a, 0x22, 0x3d, 0x0d, 0x58, 0xfa, 0x54, - 0x01, 0xea, 0x2c, 0x71, 0xaf, 0xa2, 0x4f, 0x55, 0xff, 0x9a, 0x01, 0xa5, 0x34, 0xa7, 0x75, 0x3e, - 0x80, 0x4c, 0xde, 0x29, 0x94, 0x39, 0xef, 0x14, 0x52, 0x75, 0x2c, 0xfe, 0x97, 0xd5, 0xf1, 0xa9, - 0x02, 0x8a, 0x52, 0xe8, 0x78, 0x5a, 0x82, 0x06, 0xd2, 0x48, 0x3a, 0x3b, 0xdb, 0x76, 0x39, 0x6d, - 0xd2, 0x48, 0x57, 0x1e, 0x8f, 0xb4, 0x52, 0x5c, 0xbe, 0xe4, 0x4f, 0x42, 0x77, 0xe9, 0x0b, 0x05, - 0x6c, 0xa5, 0xca, 0xba, 0xdc, 0x2e, 0xfc, 0xf1, 0x74, 0xc2, 0xde, 0x7e, 0x81, 0xca, 0x9b, 0x9b, - 0xbd, 0x5f, 0x2d, 0x82, 0x55, 0x39, 0xdd, 0xf0, 0x03, 0x90, 0x8f, 0xa6, 0x7a, 0x85, 0x07, 0xed, - 0x9d, 0x8b, 0x2b, 0xa4, 0x16, 0x9b, 0xe5, 0x37, 0x82, 0xe4, 0x44, 0x72, 0xf4, 0xe8, 0xb3, 0xf4, - 0xb9, 0x02, 0xd6, 0x66, 0x9f, 0xae, 0xb3, 0x83, 0xf0, 0xd3, 0xe9, 0x20, 0xd4, 0xa4, 0xc3, 0x64, - 0xf2, 0x26, 0x57, 0x1b, 0x9c, 0xf5, 0xf8, 0xe9, 0x12, 0xaa, 0xab, 0x3d, 0xf6, 0x4d, 0x4c, 0x6d, - 0x3a, 0x9c, 0x1b, 0x87, 0xcf, 0x97, 0xc0, 0xc6, 0x01, 0xe9, 0xb4, 0x85, 0xa3, 0x36, 0xee, 0x35, - 0xf1, 0x29, 0x81, 0x77, 0x40, 0xce, 0xb1, 0x4f, 0x11, 0xb5, 0xfb, 0x88, 0x9b, 0x77, 0x45, 0xbc, - 0x77, 0x84, 0x30, 0xf9, 0xbd, 0x23, 0x84, 0xc1, 0x7b, 0x60, 0xd5, 0xa4, 0x46, 0x9f, 0x78, 0xd4, - 0x20, 0xb8, 0x1b, 0x8e, 0x21, 0xbc, 0xe5, 0x98, 0xf4, 0x7d, 0xe2, 0xd1, 0x43, 0xdc, 0x95, 0x39, - 0x41, 0x04, 0x85, 0xdf, 0x07, 0x85, 0x81, 0x8b, 0x18, 0xdc, 0x66, 0x57, 0x98, 0x0c, 0x67, 0xdd, - 0x1e, 0x8f, 0xb4, 0x2d, 0x09, 0x2c, 0xf1, 0xca, 0xd4, 0xf0, 0x01, 0x28, 0x76, 0x09, 0xee, 0xfa, - 0xae, 0x8b, 0x70, 0x77, 0x68, 0x78, 0xe6, 0xa9, 0x78, 0xa8, 0xcb, 0x35, 0x6e, 0x8d, 0x47, 0xda, - 0xb6, 0x84, 0x6b, 0x9b, 0xa7, 0xb2, 0x94, 0xf5, 0x18, 0x8a, 0x5d, 0x3d, 0x26, 0x0f, 0x0e, 0x5d, - 0xc7, 
0xf4, 0x3c, 0x83, 0xbf, 0x61, 0x2d, 0x47, 0x57, 0x8f, 0x10, 0x7d, 0x9f, 0x61, 0x5b, 0xd3, - 0x0f, 0x5a, 0x1b, 0x09, 0x24, 0x6c, 0x83, 0x82, 0xe7, 0x77, 0xfa, 0x36, 0x35, 0x78, 0x28, 0x57, - 0xe6, 0x6e, 0xf0, 0xf0, 0xa9, 0x04, 0x08, 0xb6, 0xc9, 0xd3, 0x9e, 0xb4, 0x66, 0xc9, 0x09, 0x35, - 0xa9, 0xb9, 0x28, 0x39, 0x21, 0x4c, 0x4e, 0x4e, 0x08, 0x83, 0xbf, 0x00, 0x9b, 0xa2, 0x84, 0x0d, - 0x17, 0x3d, 0xf5, 0x6d, 0x17, 0xf5, 0x51, 0xf4, 0xba, 0xf4, 0x46, 0xb2, 0xce, 0x0f, 0xf9, 0xaf, - 0x2e, 0xd1, 0x8a, 0xc3, 0x9e, 0x24, 0xe0, 0xf2, 0x61, 0x9f, 0xc4, 0xc2, 0x3a, 0x58, 0x39, 0x47, - 0xae, 0x67, 0x13, 0xac, 0xe6, 0xb9, 0xad, 0x5b, 0xe3, 0x91, 0xb6, 0x11, 0x80, 0x24, 0xde, 0x90, - 0xea, 0x5e, 0xf6, 0x8b, 0x2f, 0x35, 0xa5, 0xfa, 0x5b, 0x05, 0xc0, 0xa4, 0x0d, 0xd0, 0x01, 0xeb, - 0x03, 0x62, 0xc9, 0xa0, 0xe0, 0x48, 0x7d, 0x2d, 0xe9, 0xc2, 0xd1, 0x34, 0xa1, 0x28, 0x86, 0x18, - 0x77, 0x64, 0xc0, 0x83, 0x05, 0x3d, 0x2e, 0xba, 0xb1, 0x06, 0x56, 0xe5, 0x68, 0x55, 0xff, 0xb6, - 0x02, 0xd6, 0x63, 0x52, 0xa1, 0x27, 0x5e, 0xf9, 0xda, 0xc8, 0x41, 0x5d, 0x4a, 0xdc, 0xa0, 0x73, - 0xbc, 0x37, 0xd7, 0x1c, 0x3e, 0x5c, 0x85, 0x5c, 0xa2, 0x7f, 0x94, 0xc6, 0x23, 0xed, 0x9a, 0x2c, - 0x4c, 0x0a, 0xcf, 0x94, 0x12, 0x78, 0x04, 0x72, 0xe6, 0xe9, 0xa9, 0x8d, 0x59, 0x05, 0x88, 0xb6, - 0x70, 0x33, 0x6d, 0xc6, 0xdc, 0x0d, 0x68, 0x44, 0x7d, 0x84, 0x1c, 0x72, 0x7d, 0x84, 0x30, 0x78, - 0x02, 0x0a, 0x94, 0x38, 0xc8, 0x35, 0xa9, 0x4d, 0x70, 0x38, 0x75, 0x96, 0x53, 0x07, 0xd7, 0x09, - 0xd9, 0xe4, 0x34, 0x92, 0x59, 0x75, 0x79, 0x01, 0x09, 0x28, 0x98, 0x18, 0x13, 0x1a, 0x88, 0x5d, - 0x99, 0x35, 0x69, 0xc6, 0x83, 0xb3, 0x1b, 0x31, 0x89, 0xd8, 0xf0, 0x5e, 0x20, 0x89, 0x92, 0x7b, - 0x81, 0x04, 0x9e, 0xda, 0x1b, 0x59, 0x3e, 0x0d, 0xcc, 0xdf, 0x1b, 0x07, 0xa0, 0x18, 0xb6, 0x13, - 0x82, 0x8f, 0x88, 0x63, 0x77, 0x87, 0xfc, 0x21, 0x3e, 0x2f, 0x4e, 0xbc, 0x38, 0x4e, 0x3e, 0xf1, - 0xe2, 0x38, 0xf8, 0x31, 0x98, 0x3c, 0x6a, 0x4c, 0x55, 0xe9, 0x32, 0xcf, 0xd2, 0x4e, 0x5a, 0x40, - 0xf5, 0x14, 0xfa, 0xc6, 0xcd, 0x20, 0xb4, 0xa9, 0xd2, 0xf4, 0x54, 0x28, 0x6c, 0x83, 0xcd, 0xae, - 0xc9, 0x22, 0x1b, 0x35, 0xf3, 0x87, 0x48, 0xb4, 0x88, 0xd5, 0xc6, 0x6b, 0xe3, 0x91, 0x76, 0x2b, - 0x05, 0x2d, 0x79, 0x93, 0xc6, 0x5d, 0xea, 0x81, 0x8d, 0x44, 0xa5, 0xbe, 0x94, 0x91, 0xfd, 0x14, - 0x14, 0xe3, 0x59, 0x7f, 0x29, 0xb3, 0xfa, 0x9f, 0x14, 0xb0, 0x7d, 0xe4, 0x3b, 0x9e, 0xe9, 0xb6, - 0xc3, 0x2a, 0x3c, 0x20, 0x9d, 0x3d, 0x44, 0x4d, 0xdb, 0xf1, 0x98, 0x30, 0xfe, 0x24, 0x11, 0xe8, - 0xe4, 0xc2, 0x38, 0x40, 0x16, 0x26, 0x5e, 0x42, 0xdf, 0x02, 0x4b, 0x8f, 0xe3, 0xb3, 0x78, 0x7c, - 0x24, 0x12, 0x14, 0xf0, 0x36, 0x58, 0x66, 0x67, 0x2c, 0xa2, 0xc1, 0x1c, 0xce, 0xaf, 0x69, 0x02, - 0x22, 0x5f, 0xd3, 0x04, 0xe4, 0xdb, 0x87, 0xa0, 0x20, 0xbd, 0xa8, 0xc0, 0x02, 0x58, 0x39, 0x69, - 0x3d, 0x6c, 0x1d, 0xfe, 0xa4, 0x55, 0x5c, 0x60, 0x8b, 0xa3, 0xfd, 0xd6, 0x5e, 0xb3, 0xf5, 0xa3, - 0xa2, 0xc2, 0x16, 0xfa, 0x49, 0xab, 0xc5, 0x16, 0x8b, 0xf0, 0x0a, 0xc8, 0xb7, 0x4f, 0xee, 0xdf, - 0xdf, 0xdf, 0xdf, 0xdb, 0xdf, 0x2b, 0x66, 0x20, 0x00, 0xcb, 0x3f, 0xdc, 0x6d, 0x3e, 0xda, 0xdf, - 0x2b, 0x66, 0x1b, 0x3f, 0xff, 0xea, 0x59, 0x59, 0xf9, 0xfa, 0x59, 0x59, 0xf9, 0xe7, 0xb3, 0xb2, - 0xf2, 0xd9, 0xf3, 0xf2, 0xc2, 0xd7, 0xcf, 0xcb, 0x0b, 0x7f, 0x7f, 0x5e, 0x5e, 0xf8, 0xd9, 0x7d, - 0xe9, 0xaf, 0x3a, 0xf1, 0xc8, 0x39, 0x70, 0x09, 0xdb, 0x92, 0xc1, 0xaa, 0x7e, 0x89, 0xff, 0x24, - 0x3b, 0xcb, 0xfc, 0x1c, 0x7b, 0xef, 0x3f, 0x01, 0x00, 0x00, 0xff, 0xff, 0xfd, 0xfc, 0x96, 0xed, - 0xc1, 0x1c, 0x00, 0x00, + 0x55, 0x2b, 0x52, 0x12, 0x39, 0x94, 0x25, 0x6a, 0x64, 0xd9, 0x2b, 0xc6, 0xe6, 0xd2, 0x8a, 0x1b, + 0x28, 
0x8d, 0x43, 0x36, 0x4e, 0x81, 0x1a, 0x6e, 0x2f, 0xa2, 0xa5, 0xd6, 0xf4, 0x07, 0x25, 0x2f, + 0xa5, 0x16, 0x2d, 0xd0, 0x2c, 0x96, 0xdc, 0x11, 0xbd, 0xd1, 0x72, 0x86, 0xde, 0x9d, 0x55, 0xc3, + 0x9c, 0xdb, 0x43, 0x11, 0x20, 0x0d, 0x8a, 0xb4, 0x0d, 0x50, 0xa0, 0x45, 0x6e, 0xfd, 0x05, 0xed, + 0xa1, 0xb7, 0x9e, 0x7c, 0xcc, 0xb1, 0x27, 0xb6, 0xb0, 0x6f, 0x3c, 0xf7, 0x07, 0x14, 0x33, 0xb3, + 0xcb, 0x1d, 0xee, 0x2e, 0x45, 0x39, 0xa9, 0xab, 0x13, 0x39, 0xef, 0x7b, 0xde, 0x7b, 0xf3, 0xe6, + 0xbd, 0x59, 0x70, 0xd7, 0xc6, 0x14, 0xb9, 0xd8, 0x74, 0x6a, 0x5e, 0xe7, 0x29, 0xb2, 0x7c, 0x07, + 0xb9, 0xd1, 0x3f, 0xd2, 0xfe, 0x10, 0x75, 0xa8, 0x97, 0x00, 0x54, 0xfb, 0x2e, 0xa1, 0x04, 0x16, + 0xe3, 0xf0, 0x92, 0xd6, 0x25, 0xa4, 0xeb, 0xa0, 0x1a, 0xc7, 0xb7, 0xfd, 0xe3, 0x1a, 0xb5, 0x7b, + 0xc8, 0xa3, 0x66, 0xaf, 0x2f, 0x58, 0x4a, 0x5b, 0x27, 0x77, 0xbc, 0xaa, 0x4d, 0x6a, 0x66, 0xdf, + 0xae, 0x75, 0x88, 0x8b, 0x6a, 0xa7, 0xef, 0xd5, 0xba, 0x08, 0x23, 0xd7, 0xa4, 0xc8, 0x0a, 0x68, + 0xbe, 0x1b, 0xd1, 0xf4, 0xcc, 0xce, 0x53, 0x1b, 0x23, 0x77, 0x50, 0xeb, 0x9f, 0x74, 0x39, 0x93, + 0x8b, 0x3c, 0xe2, 0xbb, 0x1d, 0x94, 0xe0, 0x7a, 0xb7, 0x6b, 0xd3, 0xa7, 0x7e, 0xbb, 0xda, 0x21, + 0xbd, 0x5a, 0x97, 0x74, 0x49, 0x64, 0x03, 0x5b, 0xf1, 0x05, 0xff, 0x27, 0xc8, 0xb7, 0xfe, 0x92, + 0x01, 0xb9, 0xbd, 0x8f, 0x50, 0xc7, 0xa7, 0xc4, 0x85, 0x15, 0x30, 0x6f, 0x5b, 0xaa, 0x52, 0x51, + 0xb6, 0xf3, 0xf5, 0xe2, 0x68, 0xa8, 0x2d, 0xdb, 0xd6, 0x2d, 0xd2, 0xb3, 0x29, 0xea, 0xf5, 0xe9, + 0x40, 0x9f, 0xb7, 0x2d, 0xf8, 0x16, 0xc8, 0xf6, 0x09, 0x71, 0xd4, 0x79, 0x4e, 0x03, 0x47, 0x43, + 0x6d, 0x85, 0xad, 0x25, 0x2a, 0x8e, 0x87, 0x3b, 0x60, 0x01, 0x13, 0x0b, 0x79, 0x6a, 0xa6, 0x92, + 0xd9, 0x2e, 0xdc, 0xbe, 0x52, 0x4d, 0xb8, 0xae, 0x49, 0x2c, 0x54, 0x5f, 0x1f, 0x0d, 0xb5, 0x55, + 0x4e, 0x28, 0x49, 0x10, 0x9c, 0xf0, 0x03, 0xb0, 0xd2, 0xb3, 0xb1, 0xdd, 0xf3, 0x7b, 0x0f, 0x48, + 0xbb, 0x65, 0x7f, 0x8c, 0xd4, 0x6c, 0x45, 0xd9, 0x2e, 0xdc, 0x2e, 0x27, 0x65, 0xe9, 0x81, 0x33, + 0x1e, 0xd9, 0x1e, 0xad, 0x5f, 0x79, 0x3e, 0xd4, 0xe6, 0x98, 0x61, 0x93, 0xdc, 0x7a, 0x6c, 0xcd, + 0xe4, 0x3b, 0xa6, 0x47, 0x8f, 0xfa, 0x96, 0x49, 0xd1, 0xa1, 0xdd, 0x43, 0xea, 0x02, 0x97, 0x5f, + 0xaa, 0x8a, 0xe0, 0x55, 0x43, 0xc7, 0x55, 0x0f, 0xc3, 0xe0, 0xd5, 0x4b, 0xa1, 0xec, 0x49, 0xce, + 0xcf, 0xfe, 0xa5, 0x29, 0x7a, 0x0c, 0x06, 0xf7, 0xc1, 0xba, 0x8f, 0x4d, 0xcf, 0xb3, 0xbb, 0x18, + 0x59, 0xc6, 0x87, 0xa4, 0x6d, 0xb8, 0x3e, 0xf6, 0xd4, 0x7c, 0x25, 0xb3, 0x9d, 0xaf, 0x6b, 0xa3, + 0xa1, 0xf6, 0x46, 0x84, 0x7e, 0x40, 0xda, 0xba, 0x8f, 0x65, 0x27, 0xac, 0x25, 0x90, 0x5b, 0xff, + 0xd9, 0x00, 0x59, 0xe6, 0xb5, 0xf3, 0x85, 0x09, 0x9b, 0x3d, 0xa4, 0x2e, 0x47, 0x61, 0x62, 0x6b, + 0x39, 0x4c, 0x6c, 0x0d, 0xef, 0x00, 0xc0, 0x9c, 0xbd, 0xdb, 0x7e, 0x88, 0x06, 0x9e, 0x0a, 0x2b, + 0x99, 0xed, 0xe5, 0xba, 0x3a, 0x1a, 0x6a, 0x97, 0x23, 0xa8, 0xc4, 0x23, 0xd1, 0xc2, 0xc7, 0x20, + 0xcf, 0xf6, 0x6b, 0x78, 0x08, 0x61, 0x9e, 0x0d, 0x67, 0x3b, 0xee, 0x72, 0xe0, 0xb8, 0x1c, 0x63, + 0x6a, 0x21, 0x84, 0xb9, 0xcb, 0xc6, 0x2b, 0xb8, 0x0f, 0xf2, 0x4c, 0xb8, 0x41, 0x07, 0x7d, 0xa4, + 0x66, 0x02, 0x71, 0xa9, 0x39, 0x73, 0x38, 0xe8, 0xa3, 0xfa, 0x95, 0xd1, 0x50, 0x83, 0x38, 0x58, + 0x49, 0x16, 0xe6, 0x42, 0x18, 0xbc, 0x0b, 0x96, 0xc7, 0x02, 0x0d, 0xdb, 0xe2, 0xb9, 0x93, 0x8d, + 0xf6, 0xc6, 0x68, 0x1a, 0x56, 0x7c, 0x6f, 0x02, 0x0a, 0x77, 0xc0, 0x22, 0x35, 0x6d, 0x4c, 0x3d, + 0x75, 0x81, 0x67, 0xef, 0x66, 0x55, 0x9c, 0xc4, 0xaa, 0xd9, 0xb7, 0xab, 0xec, 0xb4, 0x56, 0x4f, + 0xdf, 0xab, 0x1e, 0x32, 0x8a, 0xfa, 0x4a, 0xb0, 0xaf, 0x80, 0x41, 0x0f, 0x7e, 0xe1, 0x01, 0x58, + 0x74, 0xcc, 0x36, 0x72, 0x3c, 
0x75, 0x91, 0x8b, 0xd8, 0x4a, 0xdf, 0x4c, 0xf5, 0x11, 0x27, 0xda, + 0xc3, 0xd4, 0x1d, 0xd4, 0x2f, 0x8f, 0x86, 0x5a, 0x51, 0x70, 0x49, 0x86, 0x05, 0x72, 0xa0, 0x01, + 0x56, 0x29, 0xa1, 0xa6, 0x63, 0x84, 0x27, 0xdf, 0x53, 0x97, 0x5e, 0xed, 0x3c, 0x70, 0xf6, 0x10, + 0xe5, 0xe9, 0xb1, 0x35, 0xfc, 0xab, 0x02, 0x6e, 0x9a, 0x8e, 0x43, 0x3a, 0x26, 0x35, 0xdb, 0x0e, + 0x32, 0xda, 0x03, 0xa3, 0xef, 0xda, 0xc4, 0xb5, 0xe9, 0xc0, 0x30, 0xb1, 0x35, 0xd6, 0xab, 0xe6, + 0xf8, 0x8e, 0x7e, 0x30, 0x65, 0x47, 0x3b, 0x91, 0x88, 0xfa, 0xe0, 0x20, 0x10, 0xb0, 0x83, 0xad, + 0x50, 0x91, 0xd8, 0xeb, 0x76, 0x60, 0x54, 0xc5, 0x9c, 0x41, 0xae, 0xcf, 0xa4, 0x80, 0x2e, 0x58, + 0xf7, 0xa8, 0x49, 0xb9, 0xc5, 0xc1, 0x31, 0x63, 0x11, 0xcf, 0x73, 0x33, 0xdf, 0x99, 0x62, 0x66, + 0x8b, 0x71, 0xd4, 0x07, 0xe2, 0x6c, 0x35, 0x2c, 0x61, 0xd5, 0xd5, 0xc0, 0xaa, 0x55, 0x6f, 0x12, + 0xab, 0xc7, 0x01, 0xd0, 0x07, 0xeb, 0x81, 0x5d, 0xc8, 0x0a, 0xf5, 0xda, 0x96, 0x0a, 0xb8, 0xce, + 0x5b, 0x67, 0xbb, 0x06, 0x59, 0x5c, 0x50, 0xa8, 0x54, 0x0d, 0x94, 0x16, 0xcd, 0x18, 0x5a, 0x4f, + 0x40, 0x20, 0x05, 0x70, 0x42, 0xed, 0x33, 0x1f, 0xf9, 0x48, 0x2d, 0x9c, 0x57, 0xeb, 0x13, 0x46, + 0x3e, 0x5d, 0x2b, 0x47, 0xeb, 0x09, 0x08, 0xdb, 0x2c, 0x3a, 0xb5, 0x3b, 0x34, 0x2a, 0x63, 0x86, + 0x6d, 0x79, 0xea, 0xca, 0x99, 0x6a, 0xf7, 0x04, 0x47, 0xe8, 0x31, 0x2f, 0xa6, 0x16, 0xc5, 0xd0, + 0x7a, 0x02, 0x02, 0xbf, 0x54, 0x40, 0x19, 0x13, 0x6c, 0x98, 0x6e, 0xcf, 0xb4, 0x4c, 0x23, 0xda, + 0x78, 0x74, 0x02, 0x2e, 0x71, 0x13, 0xbe, 0x37, 0xc5, 0x84, 0x26, 0xc1, 0x3b, 0x9c, 0x77, 0xec, + 0x82, 0x71, 0xb6, 0x0b, 0x6b, 0xde, 0x0c, 0xac, 0x79, 0x03, 0x4f, 0xa7, 0xd4, 0xcf, 0x42, 0xc2, + 0x1d, 0x70, 0xc9, 0xc7, 0x81, 0x76, 0x96, 0xa1, 0xea, 0x6a, 0x45, 0xd9, 0xce, 0xd5, 0xdf, 0x18, + 0x0d, 0xb5, 0xab, 0x13, 0x08, 0xe9, 0x44, 0x4f, 0x72, 0xc0, 0x4f, 0x14, 0x70, 0x35, 0xdc, 0x91, + 0xe1, 0x7b, 0x66, 0x17, 0x45, 0x91, 0x2d, 0xf2, 0xfd, 0x7d, 0x67, 0xca, 0xfe, 0x42, 0x33, 0x8e, + 0x18, 0xd3, 0x44, 0x74, 0xb7, 0x46, 0x43, 0xad, 0xec, 0xa6, 0xa0, 0x25, 0x33, 0x2e, 0xa7, 0xe1, + 0xd9, 0xad, 0xe5, 0xa2, 0x3e, 0x71, 0xa9, 0x8d, 0xbb, 0x46, 0x54, 0x92, 0xd7, 0xf8, 0x45, 0xc2, + 0x6f, 0xad, 0x31, 0xba, 0x99, 0xac, 0xbf, 0x6b, 0x09, 0x64, 0xc9, 0x04, 0x05, 0xa9, 0xc8, 0xc1, + 0x37, 0x41, 0xe6, 0x04, 0x0d, 0x82, 0xcb, 0x6b, 0x6d, 0x34, 0xd4, 0x2e, 0x9d, 0xa0, 0x81, 0x24, + 0x81, 0x61, 0xe1, 0xdb, 0x60, 0xe1, 0xd4, 0x74, 0x7c, 0x14, 0xb4, 0x19, 0xbc, 0x4b, 0xe0, 0x00, + 0xb9, 0x4b, 0xe0, 0x80, 0xbb, 0xf3, 0x77, 0x94, 0xd2, 0x1f, 0x15, 0xf0, 0xad, 0x73, 0x95, 0x1d, + 0x59, 0xfb, 0xc2, 0x54, 0xed, 0x0d, 0x59, 0xfb, 0xec, 0xfa, 0x3a, 0xcb, 0xba, 0x5f, 0x2b, 0xe0, + 0x72, 0x5a, 0xb5, 0x39, 0x9f, 0x2b, 0xee, 0xcb, 0xc6, 0xac, 0xdc, 0xbe, 0x9e, 0x34, 0x46, 0x08, + 0x15, 0x1a, 0x66, 0xd9, 0xf2, 0x89, 0x02, 0x36, 0x52, 0xab, 0xd0, 0xf9, 0x8c, 0xf9, 0x1f, 0x7b, + 0x26, 0x66, 0x4d, 0x94, 0xbf, 0x17, 0x62, 0xcd, 0x09, 0xd8, 0x48, 0xad, 0x59, 0x5f, 0x23, 0x65, + 0x73, 0x33, 0x95, 0xfd, 0x5e, 0x01, 0x95, 0x59, 0xe5, 0xe9, 0x42, 0xb2, 0xf5, 0x37, 0x0a, 0xd8, + 0x9c, 0x5a, 0x57, 0x2e, 0x22, 0x2e, 0x5b, 0x7f, 0xca, 0x82, 0x5c, 0x58, 0x4d, 0x58, 0xeb, 0xdb, + 0x10, 0xad, 0x6f, 0x56, 0xb4, 0xbe, 0x13, 0x4d, 0xdc, 0xfc, 0x44, 0xf3, 0x36, 0xff, 0x75, 0x9b, + 0xb7, 0xc3, 0x71, 0xf3, 0x26, 0xa6, 0x97, 0xb7, 0xa6, 0x77, 0xa2, 0xaf, 0xd0, 0xc0, 0xfd, 0x52, + 0x01, 0xd0, 0xc7, 0x1e, 0xa2, 0x0d, 0x6c, 0xa1, 0x8f, 0x90, 0x25, 0x38, 0xd5, 0x2c, 0x57, 0x71, + 0xfb, 0x0c, 0x15, 0x47, 0x09, 0x26, 0xa1, 0xae, 0x32, 0x1a, 0x6a, 0xd7, 0x92, 0x12, 0x25, 0xd5, + 0x29, 0xfa, 0xfe, 0x1f, 0xf5, 0xb8, 0x07, 0xae, 0x4e, 
0xb1, 0xf9, 0x75, 0xa8, 0xdb, 0x7a, 0xbe, + 0x08, 0x36, 0x79, 0x8e, 0xde, 0x73, 0x7c, 0x8f, 0x22, 0x77, 0x22, 0x7d, 0x61, 0x03, 0x2c, 0x75, + 0x5c, 0xc4, 0x4e, 0x17, 0xd7, 0x7a, 0xf6, 0x98, 0xb2, 0x1e, 0x64, 0x44, 0xc8, 0xc2, 0xa7, 0x94, + 0x70, 0xc1, 0xec, 0x12, 0xd7, 0xb2, 0x64, 0xd7, 0xb3, 0xd8, 0xad, 0x2a, 0x28, 0xd8, 0x60, 0x85, + 0x82, 0xa9, 0xba, 0x61, 0xf1, 0x81, 0x26, 0x2f, 0x86, 0x8f, 0x08, 0x2a, 0x0f, 0x1f, 0x11, 0x14, + 0xfe, 0x4e, 0x61, 0x37, 0x70, 0x50, 0x07, 0xa2, 0xab, 0x2c, 0xc8, 0x93, 0xdd, 0x64, 0x9e, 0x4c, + 0xdd, 0xfa, 0xf8, 0x98, 0x49, 0x62, 0x44, 0xe6, 0x5c, 0x0f, 0xb6, 0x99, 0xaa, 0x48, 0xd1, 0xd3, + 0xc0, 0xf0, 0x6f, 0x0a, 0xb8, 0x96, 0x02, 0xbf, 0xe7, 0x98, 0x9e, 0xd7, 0x34, 0xf9, 0xf4, 0xcc, + 0x0c, 0x7c, 0xfc, 0x0d, 0x0d, 0x1c, 0xcb, 0x13, 0x96, 0xde, 0x0c, 0x2c, 0x3d, 0x53, 0xb5, 0x7e, + 0x26, 0xb6, 0xf4, 0xa9, 0x02, 0xd4, 0x69, 0xae, 0xb8, 0x90, 0x1a, 0xfb, 0x07, 0x05, 0xdc, 0x98, + 0xb9, 0xf5, 0x0b, 0xa9, 0xb5, 0x7f, 0xcf, 0x80, 0x52, 0x5a, 0xa4, 0x74, 0xde, 0xd6, 0x8d, 0x5f, + 0x7f, 0x94, 0x19, 0xaf, 0x3f, 0xd2, 0x99, 0x9b, 0xff, 0x86, 0x67, 0xee, 0x53, 0x05, 0x14, 0xa5, + 0xe8, 0xf2, 0x5c, 0x0a, 0xca, 0x72, 0x3d, 0xb9, 0xd9, 0xe9, 0xb6, 0xcb, 0xb9, 0x26, 0x35, 0xca, + 0xe5, 0xd1, 0x50, 0x2b, 0xc5, 0xe5, 0x4b, 0xfb, 0x49, 0xe8, 0x2e, 0x7d, 0xa1, 0x80, 0x8d, 0x54, + 0x59, 0xe7, 0x0b, 0xd8, 0x8f, 0x27, 0x03, 0xf6, 0xce, 0x2b, 0x1c, 0x97, 0x99, 0xd1, 0xfb, 0xd5, + 0x3c, 0x58, 0x96, 0xc3, 0x0d, 0x3f, 0x00, 0xf9, 0x68, 0x56, 0x52, 0xb8, 0xd3, 0xde, 0x3d, 0x3b, + 0x43, 0xaa, 0xb1, 0x09, 0x69, 0x2d, 0x08, 0x4e, 0x24, 0x47, 0x8f, 0xfe, 0x96, 0x3e, 0x57, 0xc0, + 0xca, 0xf4, 0x9e, 0x65, 0xba, 0x13, 0x7e, 0x3a, 0xe9, 0x84, 0xaa, 0x74, 0x45, 0x8f, 0x5f, 0x3a, + 0xab, 0xfd, 0x93, 0x2e, 0xbf, 0xb3, 0x43, 0x75, 0xd5, 0x27, 0xbe, 0x89, 0xa9, 0x4d, 0x07, 0x33, + 0xfd, 0xf0, 0xf9, 0x02, 0x58, 0x7b, 0x40, 0xda, 0x2d, 0xb1, 0x51, 0x1b, 0x77, 0x1b, 0xf8, 0x98, + 0xc0, 0xdb, 0x20, 0xe7, 0xd8, 0xc7, 0x88, 0xda, 0x3d, 0xc4, 0xcd, 0xbb, 0x24, 0x5e, 0x91, 0x42, + 0x98, 0xfc, 0x8a, 0x14, 0xc2, 0xe0, 0x5d, 0xb0, 0x6c, 0x52, 0xa3, 0x47, 0x3c, 0x6a, 0x10, 0xdc, + 0x09, 0x9b, 0x3b, 0x5e, 0xc8, 0x4d, 0xfa, 0x98, 0x78, 0x74, 0x1f, 0x77, 0x64, 0x4e, 0x10, 0x41, + 0xe1, 0xf7, 0x41, 0xa1, 0xef, 0x22, 0x06, 0xb7, 0xd9, 0x60, 0x98, 0xe1, 0xac, 0x9b, 0xa3, 0xa1, + 0xb6, 0x21, 0x81, 0x25, 0x5e, 0x99, 0x1a, 0xde, 0x07, 0xc5, 0x0e, 0xc1, 0x1d, 0xdf, 0x75, 0x11, + 0xee, 0x0c, 0x0c, 0xcf, 0x3c, 0x16, 0xcf, 0x9f, 0xb9, 0xfa, 0xf5, 0xd1, 0x50, 0xdb, 0x94, 0x70, + 0x2d, 0xf3, 0x58, 0x96, 0xb2, 0x1a, 0x43, 0xb1, 0x81, 0x6e, 0xfc, 0x8c, 0xd3, 0x61, 0x15, 0xc6, + 0xe0, 0x2f, 0x83, 0x8b, 0xd1, 0x40, 0xd7, 0x8f, 0xd7, 0x1f, 0x79, 0xa0, 0x4b, 0x20, 0x61, 0x0b, + 0x14, 0x3c, 0xbf, 0xdd, 0xb3, 0xa9, 0xc1, 0x5d, 0xb9, 0x34, 0xf3, 0x80, 0x87, 0x0f, 0x50, 0x40, + 0xb0, 0x8d, 0x1f, 0x4c, 0xa5, 0x35, 0x0b, 0x4e, 0xa8, 0x49, 0xcd, 0x45, 0xc1, 0x09, 0x61, 0x72, + 0x70, 0x42, 0x18, 0xfc, 0x05, 0x58, 0x17, 0x29, 0x6c, 0xb8, 0xe8, 0x99, 0x6f, 0xbb, 0xa8, 0x87, + 0xa2, 0x37, 0xbb, 0x9b, 0xc9, 0x3c, 0xdf, 0xe7, 0xbf, 0xba, 0x44, 0x2b, 0x5a, 0x28, 0x92, 0x80, + 0xcb, 0x2d, 0x54, 0x12, 0x0b, 0x6b, 0x60, 0xe9, 0x14, 0xb9, 0x9e, 0x4d, 0xb0, 0x9a, 0xe7, 0xb6, + 0x6e, 0x8c, 0x86, 0xda, 0x5a, 0x00, 0x92, 0x78, 0x43, 0xaa, 0xbb, 0xd9, 0x2f, 0xbe, 0xd4, 0x94, + 0xad, 0xdf, 0x2a, 0x00, 0x26, 0x6d, 0x80, 0x0e, 0x58, 0xed, 0x13, 0x4b, 0x06, 0x05, 0x8d, 0xca, + 0x8d, 0xe4, 0x16, 0x0e, 0x26, 0x09, 0x45, 0x32, 0xc4, 0xb8, 0x23, 0x03, 0xee, 0xcf, 0xe9, 0x71, + 0xd1, 0xf5, 0x15, 0xb0, 0x2c, 0x7b, 0x6b, 0xeb, 0x1f, 0x4b, 0x60, 0x35, 0x26, 
0x15, 0x7a, 0xe2, + 0xed, 0xb4, 0x85, 0x1c, 0xd4, 0xa1, 0xc4, 0x0d, 0x2a, 0xc7, 0xfb, 0x33, 0xcd, 0xe1, 0x2d, 0x6b, + 0xc8, 0x25, 0xea, 0x47, 0x69, 0x34, 0xd4, 0xae, 0xc8, 0xc2, 0x24, 0xf7, 0x4c, 0x28, 0x81, 0x07, + 0x20, 0x67, 0x1e, 0x1f, 0xdb, 0x98, 0x65, 0x80, 0x28, 0x0b, 0xd7, 0xd2, 0x3a, 0xf7, 0x9d, 0x80, + 0x46, 0xe4, 0x47, 0xc8, 0x21, 0xe7, 0x47, 0x08, 0x83, 0x47, 0xa0, 0x40, 0x89, 0x83, 0x5c, 0x93, + 0xda, 0x04, 0x87, 0xbd, 0x7c, 0x39, 0x75, 0x1c, 0x18, 0x93, 0x8d, 0x6f, 0x23, 0x99, 0x55, 0x97, + 0x17, 0x90, 0x80, 0x82, 0x89, 0x31, 0xa1, 0x81, 0xd8, 0xa5, 0x69, 0xfd, 0x7b, 0xdc, 0x39, 0x3b, + 0x11, 0x93, 0xf0, 0x0d, 0xaf, 0x05, 0x92, 0x28, 0xb9, 0x16, 0x48, 0xe0, 0x89, 0xb3, 0x91, 0xe5, + 0x7d, 0xca, 0xec, 0xb3, 0xf1, 0x00, 0x14, 0xc3, 0x72, 0x42, 0xf0, 0x01, 0x71, 0xec, 0xce, 0x80, + 0x7f, 0xde, 0xc8, 0x8b, 0x1b, 0x2f, 0x8e, 0x93, 0x6f, 0xbc, 0x38, 0x0e, 0x7e, 0x0c, 0xc6, 0x4f, + 0x45, 0x13, 0x59, 0xba, 0xc8, 0xa3, 0xb4, 0x9d, 0xe6, 0x50, 0x3d, 0x85, 0xbe, 0x7e, 0x2d, 0x70, + 0x6d, 0xaa, 0x34, 0x3d, 0x15, 0x0a, 0x5b, 0x60, 0xbd, 0x63, 0x32, 0xcf, 0x46, 0xc5, 0xfc, 0x21, + 0x12, 0x25, 0x62, 0xb9, 0x7e, 0x63, 0x34, 0xd4, 0xae, 0xa7, 0xa0, 0xa5, 0xdd, 0xa4, 0x71, 0x97, + 0xba, 0x60, 0x2d, 0x91, 0xa9, 0xaf, 0x65, 0x10, 0x3a, 0x06, 0xc5, 0x78, 0xd4, 0x5f, 0xcb, 0x04, + 0xf4, 0x67, 0x05, 0x6c, 0x1e, 0xf8, 0x8e, 0x67, 0xba, 0xad, 0x30, 0x0b, 0x1f, 0x90, 0xf6, 0x2e, + 0xa2, 0xa6, 0xed, 0x78, 0x4c, 0x18, 0x7f, 0xe8, 0x09, 0x74, 0x72, 0x61, 0x1c, 0x20, 0x0b, 0x13, + 0xef, 0xcb, 0x6f, 0x83, 0x85, 0x27, 0xf1, 0x09, 0x27, 0xde, 0x12, 0x09, 0x0a, 0x78, 0x0b, 0x2c, + 0xb2, 0x3b, 0x16, 0xd1, 0x60, 0xba, 0xe1, 0xc3, 0xaf, 0x80, 0xc8, 0xc3, 0xaf, 0x80, 0x7c, 0x7b, + 0x1f, 0x14, 0xa4, 0x77, 0x2a, 0x58, 0x00, 0x4b, 0x47, 0xcd, 0x87, 0xcd, 0xfd, 0x9f, 0x34, 0x8b, + 0x73, 0x6c, 0x71, 0xb0, 0xd7, 0xdc, 0x6d, 0x34, 0x7f, 0x54, 0x54, 0xd8, 0x42, 0x3f, 0x6a, 0x36, + 0xd9, 0x62, 0x1e, 0x5e, 0x02, 0xf9, 0xd6, 0xd1, 0xbd, 0x7b, 0x7b, 0x7b, 0xbb, 0x7b, 0xbb, 0xc5, + 0x0c, 0x04, 0x60, 0xf1, 0x87, 0x3b, 0x8d, 0x47, 0x7b, 0xbb, 0xc5, 0x6c, 0xfd, 0xe7, 0xcf, 0x5f, + 0x94, 0x95, 0xaf, 0x5e, 0x94, 0x95, 0x7f, 0xbf, 0x28, 0x2b, 0x9f, 0xbd, 0x2c, 0xcf, 0x7d, 0xf5, + 0xb2, 0x3c, 0xf7, 0xcf, 0x97, 0xe5, 0xb9, 0x9f, 0xdd, 0x93, 0x3e, 0x80, 0x8a, 0xa7, 0xe3, 0xbe, + 0x4b, 0xd8, 0x91, 0x0c, 0x56, 0xb5, 0x73, 0x7c, 0xe9, 0x6d, 0x2f, 0xf2, 0x7b, 0xec, 0xfd, 0xff, + 0x06, 0x00, 0x00, 0xff, 0xff, 0xff, 0x66, 0x0a, 0x62, 0x17, 0x1e, 0x00, 0x00, } func (m *Executor) Marshal() (dAtA []byte, err error) { @@ -1616,6 +1627,30 @@ func (m *QueueClusterResourceUsage) MarshalToSizedBuffer(dAtA []byte) (int, erro _ = i var l int _ = l + if len(m.ResourcesByPriorityClassName) > 0 { + for k := range m.ResourcesByPriorityClassName { + v := m.ResourcesByPriorityClassName[k] + baseI := i + { + size, err := (&v).MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintSchedulerobjects(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x12 + i -= len(k) + copy(dAtA[i:], k) + i = encodeVarintSchedulerobjects(dAtA, i, uint64(len(k))) + i-- + dAtA[i] = 0xa + i = encodeVarintSchedulerobjects(dAtA, i, uint64(baseI-i)) + i-- + dAtA[i] = 0x2a + } + } if len(m.ResourcesByPriority) > 0 { for k := range m.ResourcesByPriority { v := m.ResourcesByPriority[k] @@ -1652,12 +1687,12 @@ func (m *QueueClusterResourceUsage) MarshalToSizedBuffer(dAtA []byte) (int, erro i-- dAtA[i] = 0x12 } - n12, err12 := github_com_gogo_protobuf_types.StdTimeMarshalTo(m.Created, dAtA[i-github_com_gogo_protobuf_types.SizeOfStdTime(m.Created):]) 
- if err12 != nil { - return 0, err12 + n13, err13 := github_com_gogo_protobuf_types.StdTimeMarshalTo(m.Created, dAtA[i-github_com_gogo_protobuf_types.SizeOfStdTime(m.Created):]) + if err13 != nil { + return 0, err13 } - i -= n12 - i = encodeVarintSchedulerobjects(dAtA, i, uint64(n12)) + i -= n13 + i = encodeVarintSchedulerobjects(dAtA, i, uint64(n13)) i-- dAtA[i] = 0xa return len(dAtA) - i, nil @@ -1709,12 +1744,12 @@ func (m *ClusterResourceUsageReport) MarshalToSizedBuffer(dAtA []byte) (int, err dAtA[i] = 0x1a } } - n14, err14 := github_com_gogo_protobuf_types.StdTimeMarshalTo(m.Created, dAtA[i-github_com_gogo_protobuf_types.SizeOfStdTime(m.Created):]) - if err14 != nil { - return 0, err14 + n15, err15 := github_com_gogo_protobuf_types.StdTimeMarshalTo(m.Created, dAtA[i-github_com_gogo_protobuf_types.SizeOfStdTime(m.Created):]) + if err15 != nil { + return 0, err15 } - i -= n14 - i = encodeVarintSchedulerobjects(dAtA, i, uint64(n14)) + i -= n15 + i = encodeVarintSchedulerobjects(dAtA, i, uint64(n15)) i-- dAtA[i] = 0x12 if len(m.Pool) > 0 { @@ -1804,12 +1839,12 @@ func (m *JobSchedulingInfo) MarshalToSizedBuffer(dAtA []byte) (int, error) { i-- dAtA[i] = 0x40 } - n16, err16 := github_com_gogo_protobuf_types.StdTimeMarshalTo(m.SubmitTime, dAtA[i-github_com_gogo_protobuf_types.SizeOfStdTime(m.SubmitTime):]) - if err16 != nil { - return 0, err16 + n17, err17 := github_com_gogo_protobuf_types.StdTimeMarshalTo(m.SubmitTime, dAtA[i-github_com_gogo_protobuf_types.SizeOfStdTime(m.SubmitTime):]) + if err17 != nil { + return 0, err17 } - i -= n16 - i = encodeVarintSchedulerobjects(dAtA, i, uint64(n16)) + i -= n17 + i = encodeVarintSchedulerobjects(dAtA, i, uint64(n17)) i-- dAtA[i] = 0x3a if len(m.PriorityClassName) > 0 { @@ -2307,6 +2342,15 @@ func (m *QueueClusterResourceUsage) Size() (n int) { n += mapEntrySize + 1 + sovSchedulerobjects(uint64(mapEntrySize)) } } + if len(m.ResourcesByPriorityClassName) > 0 { + for k, v := range m.ResourcesByPriorityClassName { + _ = k + _ = v + l = v.Size() + mapEntrySize := 1 + len(k) + sovSchedulerobjects(uint64(len(k))) + 1 + l + sovSchedulerobjects(uint64(l)) + n += mapEntrySize + 1 + sovSchedulerobjects(uint64(mapEntrySize)) + } + } return n } @@ -4663,6 +4707,135 @@ func (m *QueueClusterResourceUsage) Unmarshal(dAtA []byte) error { } m.ResourcesByPriority[mapkey] = *mapvalue iNdEx = postIndex + case 5: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field ResourcesByPriorityClassName", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSchedulerobjects + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthSchedulerobjects + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthSchedulerobjects + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + if m.ResourcesByPriorityClassName == nil { + m.ResourcesByPriorityClassName = make(map[string]ResourceList) + } + var mapkey string + mapvalue := &ResourceList{} + for iNdEx < postIndex { + entryPreIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSchedulerobjects + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + if fieldNum == 1 { + var stringLenmapkey uint64 + for shift 
:= uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSchedulerobjects + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLenmapkey |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLenmapkey := int(stringLenmapkey) + if intStringLenmapkey < 0 { + return ErrInvalidLengthSchedulerobjects + } + postStringIndexmapkey := iNdEx + intStringLenmapkey + if postStringIndexmapkey < 0 { + return ErrInvalidLengthSchedulerobjects + } + if postStringIndexmapkey > l { + return io.ErrUnexpectedEOF + } + mapkey = string(dAtA[iNdEx:postStringIndexmapkey]) + iNdEx = postStringIndexmapkey + } else if fieldNum == 2 { + var mapmsglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowSchedulerobjects + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + mapmsglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if mapmsglen < 0 { + return ErrInvalidLengthSchedulerobjects + } + postmsgIndex := iNdEx + mapmsglen + if postmsgIndex < 0 { + return ErrInvalidLengthSchedulerobjects + } + if postmsgIndex > l { + return io.ErrUnexpectedEOF + } + mapvalue = &ResourceList{} + if err := mapvalue.Unmarshal(dAtA[iNdEx:postmsgIndex]); err != nil { + return err + } + iNdEx = postmsgIndex + } else { + iNdEx = entryPreIndex + skippy, err := skipSchedulerobjects(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthSchedulerobjects + } + if (iNdEx + skippy) > postIndex { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + m.ResourcesByPriorityClassName[mapkey] = *mapvalue + iNdEx = postIndex default: iNdEx = preIndex skippy, err := skipSchedulerobjects(dAtA[iNdEx:]) diff --git a/internal/scheduler/schedulerobjects/schedulerobjects.proto b/internal/scheduler/schedulerobjects/schedulerobjects.proto index c9865a28d51..f87f88275e4 100644 --- a/internal/scheduler/schedulerobjects/schedulerobjects.proto +++ b/internal/scheduler/schedulerobjects/schedulerobjects.proto @@ -101,13 +101,13 @@ message NodeType { map unsetIndexedLabels = 4; } -// Captures the resource usage of a particular queue -// in a given cluster. +// Captures the resource usage of a particular queue in a given cluster. 
 message QueueClusterResourceUsage {
     google.protobuf.Timestamp created = 1 [(gogoproto.nullable) = false, (gogoproto.stdtime) = true];
     string queue = 2;
     string executorId = 3;
-    map<int32, ResourceList> resourcesByPriority = 4 [(gogoproto.nullable) = false];
+    map<int32, ResourceList> resourcesByPriority = 4 [(gogoproto.nullable) = false, deprecated = true];
+    map<string, ResourceList> resourcesByPriorityClassName = 5 [(gogoproto.nullable) = false];
 }

 // A collection of QueueClusterResourceUsage
diff --git a/internal/scheduler/schedulerobjects/schedulinginfo.go b/internal/scheduler/schedulerobjects/schedulinginfo.go
new file mode 100644
index 00000000000..ef69dfd7e3d
--- /dev/null
+++ b/internal/scheduler/schedulerobjects/schedulinginfo.go
@@ -0,0 +1,10 @@
+package schedulerobjects
+
+func (info *JobSchedulingInfo) GetPodRequirements() *PodRequirements {
+    for _, oreq := range info.ObjectRequirements {
+        if preq := oreq.GetPodRequirements(); preq != nil {
+            return preq
+        }
+    }
+    return nil
+}
diff --git a/internal/scheduler/scheduling_algo.go b/internal/scheduler/scheduling_algo.go
index 1025d6b21c6..0ad7039b0ba 100644
--- a/internal/scheduler/scheduling_algo.go
+++ b/internal/scheduler/scheduling_algo.go
@@ -146,7 +146,7 @@ func (l *FairSchedulingAlgo) Schedule(
         maps.Copy(overallSchedulerResult.NodeIdByJobId, schedulerResult.NodeIdByJobId)

         // Update accounting.
-        accounting.totalAllocationByPoolAndQueue[executor.Pool] = sctx.AllocatedByQueueAndPriority()
+        accounting.allocationByPoolAndQueueAndPriorityClass[executor.Pool] = sctx.AllocatedByQueueAndPriority()

         // Update result to mark this executor as scheduled
         l.previousScheduleClusterId = executor.Id
@@ -175,14 +175,15 @@ func (it *JobQueueIteratorAdapter) Next() (interfaces.LegacySchedulerJob, error)
 }

 type fairSchedulingAlgoContext struct {
-    priorityFactorByQueue         map[string]float64
-    totalCapacity                 schedulerobjects.ResourceList
-    jobsByExecutorId              map[string][]*jobdb.Job
-    nodeIdByJobId                 map[string]string
-    jobIdsByGangId                map[string]map[string]bool
-    gangIdByJobId                 map[string]string
-    totalAllocationByPoolAndQueue map[string]map[string]schedulerobjects.QuantityByPriorityAndResourceType
-    executors                     []*schedulerobjects.Executor
+    priorityFactorByQueue                    map[string]float64
+    isActiveByQueueName                      map[string]bool
+    totalCapacity                            schedulerobjects.ResourceList
+    jobsByExecutorId                         map[string][]*jobdb.Job
+    nodeIdByJobId                            map[string]string
+    jobIdsByGangId                           map[string]map[string]bool
+    gangIdByJobId                            map[string]string
+    allocationByPoolAndQueueAndPriorityClass map[string]map[string]schedulerobjects.QuantityByTAndResourceType[string]
+    executors                                []*schedulerobjects.Executor
 }

 // This function will return executors in the order they should be scheduled in
@@ -242,11 +243,13 @@ func (l *FairSchedulingAlgo) newFairSchedulingAlgoContext(ctx context.Context, t
     }

     // Create a map of jobs associated with each executor.
+    isActiveByQueueName := make(map[string]bool, len(queues))
     jobsByExecutorId := make(map[string][]*jobdb.Job)
     nodeIdByJobId := make(map[string]string)
     jobIdsByGangId := make(map[string]map[string]bool)
     gangIdByJobId := make(map[string]string)
     for _, job := range jobDb.GetAll(txn) {
+        isActiveByQueueName[job.Queue()] = true
         if job.Queued() {
             continue
         }
@@ -264,7 +267,7 @@ func (l *FairSchedulingAlgo) newFairSchedulingAlgoContext(ctx context.Context, t
         }
         jobsByExecutorId[executorId] = append(jobsByExecutorId[executorId], job)
         nodeIdByJobId[job.Id()] = nodeId
-        gangId, _, isGangJob, err := GangIdAndCardinalityFromLegacySchedulerJob(job, l.config.Preemption.PriorityClasses)
+        gangId, _, isGangJob, err := GangIdAndCardinalityFromLegacySchedulerJob(job)
         if err != nil {
             return nil, err
         }
@@ -280,21 +283,22 @@ func (l *FairSchedulingAlgo) newFairSchedulingAlgoContext(ctx context.Context, t
     }

     // Used to calculate fair share.
-    totalAllocationByPoolAndQueue := l.totalAllocationByPoolAndQueue(executors, jobsByExecutorId)
+    totalAllocationByPoolAndQueue := l.aggregateAllocationByPoolAndQueueAndPriorityClass(executors, jobsByExecutorId)

     // Filter out any executor that isn't acknowledging jobs in a timely fashion
     // Note that we do this after aggregating allocation across clusters for fair share.
     executors = l.filterLaggingExecutors(executors, jobsByExecutorId)

     return &fairSchedulingAlgoContext{
-        priorityFactorByQueue:         priorityFactorByQueue,
-        totalCapacity:                 totalCapacity,
-        jobsByExecutorId:              jobsByExecutorId,
-        nodeIdByJobId:                 nodeIdByJobId,
-        jobIdsByGangId:                jobIdsByGangId,
-        gangIdByJobId:                 gangIdByJobId,
-        totalAllocationByPoolAndQueue: totalAllocationByPoolAndQueue,
-        executors:                     executors,
+        priorityFactorByQueue:                    priorityFactorByQueue,
+        isActiveByQueueName:                      isActiveByQueueName,
+        totalCapacity:                            totalCapacity,
+        jobsByExecutorId:                         jobsByExecutorId,
+        nodeIdByJobId:                            nodeIdByJobId,
+        jobIdsByGangId:                           jobIdsByGangId,
+        gangIdByJobId:                            gangIdByJobId,
+        allocationByPoolAndQueueAndPriorityClass: totalAllocationByPoolAndQueue,
+        executors:                                executors,
     }, nil
 }

@@ -307,9 +311,9 @@ func (l *FairSchedulingAlgo) scheduleOnExecutor(
     db *jobdb.JobDb,
 ) (*SchedulerResult, *schedulercontext.SchedulingContext, error) {
     nodeDb, err := l.constructNodeDb(
-        executor.Nodes,
-        accounting.jobsByExecutorId[executor.Id],
         l.config.Preemption.PriorityClasses,
+        accounting.jobsByExecutorId[executor.Id],
+        executor.Nodes,
     )
     if err != nil {
         return nil, nil, err
     }
@@ -322,12 +326,23 @@ func (l *FairSchedulingAlgo) scheduleOnExecutor(
         l.config.ResourceScarcity,
         accounting.totalCapacity,
     )
+    if l.config.FairnessModel == configuration.DominantResourceFairness {
+        sctx.EnableDominantResourceFairness(l.config.DominantResourceFairnessResourcesToConsider)
+    }
     for queue, priorityFactor := range accounting.priorityFactorByQueue {
-        var allocatedByPriority schedulerobjects.QuantityByPriorityAndResourceType
-        if allocatedByQueueAndPriority := accounting.totalAllocationByPoolAndQueue[executor.Pool]; allocatedByQueueAndPriority != nil {
-            allocatedByPriority = allocatedByQueueAndPriority[queue]
+        if !accounting.isActiveByQueueName[queue] {
+            // To ensure fair share is computed only from active queues, i.e., queues with jobs queued or running.
+ continue + } + var allocatedByPriorityClass schedulerobjects.QuantityByTAndResourceType[string] + if allocatedByQueueAndPriorityClass := accounting.allocationByPoolAndQueueAndPriorityClass[executor.Pool]; allocatedByQueueAndPriorityClass != nil { + allocatedByPriorityClass = allocatedByQueueAndPriorityClass[queue] + } + var weight float64 = 1 + if priorityFactor > 0 { + weight = 1 / priorityFactor } - if err := sctx.AddQueueSchedulingContext(queue, priorityFactor, allocatedByPriority); err != nil { + if err := sctx.AddQueueSchedulingContext(queue, weight, allocatedByPriorityClass); err != nil { return nil, nil, err } } @@ -342,6 +357,7 @@ func (l *FairSchedulingAlgo) scheduleOnExecutor( constraints, l.config.Preemption.NodeEvictionProbability, l.config.Preemption.NodeOversubscriptionEvictionProbability, + l.config.Preemption.ProtectedFractionOfFairShare, &schedulerJobRepositoryAdapter{ txn: txn, db: db, @@ -416,48 +432,44 @@ func (repo *schedulerJobRepositoryAdapter) GetExistingJobsByIds(ids []string) ([ } // constructNodeDb constructs a node db with all jobs bound to it. -func (l *FairSchedulingAlgo) constructNodeDb(nodes []*schedulerobjects.Node, jobs []*jobdb.Job, priorityClasses map[string]configuration.PriorityClass) (*nodedb.NodeDb, error) { +func (l *FairSchedulingAlgo) constructNodeDb(priorityClasses map[string]configuration.PriorityClass, jobs []*jobdb.Job, nodes []*schedulerobjects.Node) (*nodedb.NodeDb, error) { + nodeDb, err := nodedb.NewNodeDb( + priorityClasses, + l.config.MaxExtraNodesToConsider, + l.indexedResources, + l.config.IndexedTaints, + l.config.IndexedNodeLabels, + ) + if err != nil { + return nil, err + } + txn := nodeDb.Txn(true) + defer txn.Abort() nodesByName := make(map[string]*schedulerobjects.Node, len(nodes)) for _, node := range nodes { nodesByName[node.Name] = node } + jobsByNodeName := make(map[string][]*jobdb.Job) for _, job := range jobs { if job.InTerminalState() || !job.HasRuns() { continue } - assignedNode := job.LatestRun().Node() - node, ok := nodesByName[assignedNode] - if !ok { + nodeName := job.LatestRun().Node() + if _, ok := nodesByName[nodeName]; !ok { log.Warnf( "job %s assigned to node %s on executor %s, but no such node found", - job.Id(), assignedNode, job.LatestRun().Executor(), + job.Id(), nodeName, job.LatestRun().Executor(), ) continue } - req := PodRequirementFromLegacySchedulerJob(job, l.config.Preemption.PriorityClasses) - if req == nil { - log.Errorf("no pod spec found for job %s", job.Id()) - continue - } - node, err := nodedb.BindPodToNode(req, node) - if err != nil { + jobsByNodeName[nodeName] = append(jobsByNodeName[nodeName], job) + } + for _, node := range nodes { + if err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, jobsByNodeName[node.Name], node); err != nil { return nil, err } - nodesByName[node.Name] = node - } - nodeDb, err := nodedb.NewNodeDb( - priorityClasses, - l.config.MaxExtraNodesToConsider, - l.indexedResources, - l.config.IndexedTaints, - l.config.IndexedNodeLabels, - ) - if err != nil { - return nil, err - } - if err := nodeDb.UpsertMany(maps.Values(nodesByName)); err != nil { - return nil, err } + txn.Commit() return nodeDb, nil } @@ -520,30 +532,22 @@ func (l *FairSchedulingAlgo) filterLaggingExecutors( return activeExecutors } -func (l *FairSchedulingAlgo) totalAllocationByPoolAndQueue(executors []*schedulerobjects.Executor, jobsByExecutorId map[string][]*jobdb.Job) map[string]map[string]schedulerobjects.QuantityByPriorityAndResourceType { - rv := 
make(map[string]map[string]schedulerobjects.QuantityByPriorityAndResourceType) +func (l *FairSchedulingAlgo) aggregateAllocationByPoolAndQueueAndPriorityClass(executors []*schedulerobjects.Executor, jobsByExecutorId map[string][]*jobdb.Job) map[string]map[string]schedulerobjects.QuantityByTAndResourceType[string] { + rv := make(map[string]map[string]schedulerobjects.QuantityByTAndResourceType[string]) for _, executor := range executors { allocationByQueue := rv[executor.Pool] if allocationByQueue == nil { - allocationByQueue = make(map[string]schedulerobjects.QuantityByPriorityAndResourceType) + allocationByQueue = make(map[string]schedulerobjects.QuantityByTAndResourceType[string]) rv[executor.Pool] = allocationByQueue } for _, job := range jobsByExecutorId[executor.Id] { queue := job.Queue() allocation := allocationByQueue[queue] if allocation == nil { - allocation = make(schedulerobjects.QuantityByPriorityAndResourceType) + allocation = make(schedulerobjects.QuantityByTAndResourceType[string]) allocationByQueue[queue] = allocation } - jobSchedulingInfo := job.JobSchedulingInfo() - if jobSchedulingInfo != nil { - priorityClass, ok := l.priorityClasses[jobSchedulingInfo.PriorityClassName] - if ok { - allocation.AddResourceList(priorityClass.Priority, jobSchedulingInfo.GetTotalResourceRequest()) - } else { - log.Errorf("job %s has unknown priority class name %s; ignoring the resources allocated to this job", job.Id(), jobSchedulingInfo.PriorityClassName) - } - } + allocation.AddV1ResourceList(job.GetPriorityClassName(), job.GetResourceRequirements().Requests) } } return rv diff --git a/internal/scheduler/scheduling_algo_test.go b/internal/scheduler/scheduling_algo_test.go index 99560a23d56..b27a4756f3f 100644 --- a/internal/scheduler/scheduling_algo_test.go +++ b/internal/scheduler/scheduling_algo_test.go @@ -2,6 +2,9 @@ package scheduler import ( "context" + "fmt" + "math" + "math/rand" "testing" "time" @@ -15,7 +18,6 @@ import ( "github.com/armadaproject/armada/internal/scheduler/database" "github.com/armadaproject/armada/internal/scheduler/jobdb" schedulermocks "github.com/armadaproject/armada/internal/scheduler/mocks" - "github.com/armadaproject/armada/internal/scheduler/nodedb" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" "github.com/armadaproject/armada/internal/scheduler/testfixtures" ) @@ -49,7 +51,7 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { }, queues: []*database.Queue{testfixtures.TestDbQueue()}, - queuedJobs: testfixtures.N16CpuJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), + queuedJobs: testfixtures.N16Cpu128GiJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), expectedScheduledIndices: map[string][]int{ "executor1": {0, 1}, "executor2": {2, 3}, @@ -64,7 +66,7 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { }, queues: []*database.Queue{testfixtures.TestDbQueue()}, - queuedJobs: testfixtures.N16CpuJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), + queuedJobs: testfixtures.N16Cpu128GiJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), expectedScheduledIndices: map[string][]int{ "executor1": {0, 1}, }, @@ -78,12 +80,12 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { }, queues: []*database.Queue{testfixtures.TestDbQueue()}, - existingJobs: testfixtures.N16CpuJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 2), + existingJobs: testfixtures.N16Cpu128GiJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 2), existingUnacknowledgedIndices: 
map[string]map[string][]int{ "executor1": {"executor1-node": {0, 1}}, }, - queuedJobs: testfixtures.N16CpuJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), + queuedJobs: testfixtures.N16Cpu128GiJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), expectedScheduledIndices: map[string][]int{ "executor2": {0, 1}, @@ -98,12 +100,12 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { }, queues: []*database.Queue{testfixtures.TestDbQueue()}, - existingJobs: testfixtures.N16CpuJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 2), + existingJobs: testfixtures.N16Cpu128GiJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 2), existingRunningIndices: map[string]map[string][]int{ "executor1": {"executor1-node": {0, 1}}, }, - queuedJobs: testfixtures.N16CpuJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), + queuedJobs: testfixtures.N16Cpu128GiJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), expectedScheduledIndices: map[string][]int{ "executor2": {0, 1}, @@ -111,8 +113,8 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { }, "user is at usage cap before scheduling": { schedulingConfig: testfixtures.WithPerPriorityLimitsConfig( - map[int32]map[string]float64{ - testfixtures.TestPriorityClasses[testfixtures.PriorityClass3].Priority: {"cpu": 0.5}, + map[string]map[string]float64{ + testfixtures.PriorityClass3: {"cpu": 0.5}, }, testfixtures.TestSchedulingConfig(), ), @@ -123,19 +125,19 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { }, queues: []*database.Queue{testfixtures.TestDbQueue()}, - existingJobs: testfixtures.N16CpuJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 2), + existingJobs: testfixtures.N16Cpu128GiJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 2), existingRunningIndices: map[string]map[string][]int{ "executor1": {"executor1-node": {0, 1}}, }, - queuedJobs: testfixtures.N16CpuJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), + queuedJobs: testfixtures.N16Cpu128GiJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), expectedScheduledIndices: nil, }, "user hits usage cap during scheduling": { schedulingConfig: testfixtures.WithPerPriorityLimitsConfig( - map[int32]map[string]float64{ - testfixtures.TestPriorityClasses[testfixtures.PriorityClass3].Priority: {"cpu": 0.5}, + map[string]map[string]float64{ + testfixtures.PriorityClass3: {"cpu": 0.5}, }, testfixtures.TestSchedulingConfig(), ), @@ -146,12 +148,12 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { }, queues: []*database.Queue{testfixtures.TestDbQueue()}, - existingJobs: testfixtures.N16CpuJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 1), + existingJobs: testfixtures.N16Cpu128GiJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 1), existingRunningIndices: map[string]map[string][]int{ "executor1": {"executor1-node": {0}}, }, - queuedJobs: testfixtures.N16CpuJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), + queuedJobs: testfixtures.N16Cpu128GiJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), expectedScheduledIndices: map[string][]int{ "executor1": {0}, @@ -174,13 +176,13 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { executors: []*schedulerobjects.Executor{}, queues: []*database.Queue{testfixtures.TestDbQueue()}, - queuedJobs: testfixtures.N16CpuJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), + queuedJobs: testfixtures.N16Cpu128GiJobs(testfixtures.TestQueue, testfixtures.PriorityClass3, 10), 
expectedScheduledIndices: nil, }, "computation of allocated resources does not confuse priority class with per-queue priority": { schedulingConfig: testfixtures.WithPerPriorityLimitsConfig( - map[int32]map[string]float64{ - testfixtures.TestPriorityClasses[testfixtures.PriorityClass3].Priority: {"cpu": 0.5}, + map[string]map[string]float64{ + testfixtures.PriorityClass3: {"cpu": 0.5}, }, testfixtures.TestSchedulingConfig(), ), @@ -188,7 +190,7 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { executors: []*schedulerobjects.Executor{testfixtures.Test1Node32CoreExecutor("executor1")}, queues: []*database.Queue{testfixtures.TestDbQueue()}, - existingJobs: []*jobdb.Job{testfixtures.Test16CpuJob(testfixtures.TestQueue, testfixtures.PriorityClass3).WithPriority(0)}, + existingJobs: []*jobdb.Job{testfixtures.Test16Cpu128GiJob(testfixtures.TestQueue, testfixtures.PriorityClass3).WithPriority(0)}, existingRunningIndices: map[string]map[string][]int{ "executor1": {"executor1-node": {0}}, }, @@ -199,7 +201,7 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { // than the priority class number of the two jobs (i.e., 3); if the scheduler were // to use the per-queue priority instead of the priority class number in its // accounting, then it would schedule this job. - testfixtures.Test16CpuJob(testfixtures.TestQueue, testfixtures.PriorityClass3).WithPriority(1), + testfixtures.Test16Cpu128GiJob(testfixtures.TestQueue, testfixtures.PriorityClass3).WithPriority(1), }, expectedScheduledIndices: nil, @@ -210,12 +212,12 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { executors: []*schedulerobjects.Executor{testfixtures.Test1Node32CoreExecutor("executor1")}, queues: []*database.Queue{{Name: "queue1", Weight: 100}}, - existingJobs: testfixtures.N16CpuJobs("queue1", testfixtures.PriorityClass0, 2), + existingJobs: testfixtures.N16Cpu128GiJobs("queue1", testfixtures.PriorityClass0, 2), existingRunningIndices: map[string]map[string][]int{ "executor1": {"executor1-node": {0, 1}}, }, - queuedJobs: testfixtures.N16CpuJobs("queue1", testfixtures.PriorityClass1, 2), + queuedJobs: testfixtures.N16Cpu128GiJobs("queue1", testfixtures.PriorityClass1, 2), expectedPreemptedIndices: []int{0, 1}, expectedScheduledIndices: map[string][]int{ @@ -228,12 +230,12 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { executors: []*schedulerobjects.Executor{testfixtures.Test1Node32CoreExecutor("executor1")}, queues: []*database.Queue{{Name: "queue1", Weight: 100}, {Name: "queue2", Weight: 100}}, - existingJobs: testfixtures.N16CpuJobs("queue1", testfixtures.PriorityClass0, 2), + existingJobs: testfixtures.N16Cpu128GiJobs("queue1", testfixtures.PriorityClass0, 2), existingRunningIndices: map[string]map[string][]int{ "executor1": {"executor1-node": {0, 1}}, }, - queuedJobs: testfixtures.N16CpuJobs("queue2", testfixtures.PriorityClass1, 2), + queuedJobs: testfixtures.N16Cpu128GiJobs("queue2", testfixtures.PriorityClass1, 2), expectedPreemptedIndices: []int{0, 1}, expectedScheduledIndices: map[string][]int{ @@ -246,12 +248,12 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { executors: []*schedulerobjects.Executor{testfixtures.Test1Node32CoreExecutor("executor1")}, queues: []*database.Queue{{Name: "queue1", Weight: 100}, {Name: "queue2", Weight: 100}}, - existingJobs: testfixtures.N16CpuJobs("queue1", testfixtures.PriorityClass0, 2), + existingJobs: testfixtures.N16Cpu128GiJobs("queue1", testfixtures.PriorityClass0, 2), existingRunningIndices: map[string]map[string][]int{ 
"executor1": {"executor1-node": {0, 1}}, }, - queuedJobs: testfixtures.N16CpuJobs("queue2", testfixtures.PriorityClass0, 2), + queuedJobs: testfixtures.N16Cpu128GiJobs("queue2", testfixtures.PriorityClass0, 2), expectedPreemptedIndices: []int{1}, expectedScheduledIndices: map[string][]int{ @@ -264,7 +266,7 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { executors: []*schedulerobjects.Executor{testfixtures.Test1Node32CoreExecutor("executor1")}, queues: []*database.Queue{{Name: "queue1", Weight: 100}}, - queuedJobs: testfixtures.WithGangAnnotationsJobs(testfixtures.N16CpuJobs("queue1", testfixtures.PriorityClass0, 2)), + queuedJobs: testfixtures.WithGangAnnotationsJobs(testfixtures.N16Cpu128GiJobs("queue1", testfixtures.PriorityClass0, 2)), expectedScheduledIndices: map[string][]int{ "executor1": {0, 1}, @@ -279,7 +281,7 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { }, queues: []*database.Queue{{Name: "queue1", Weight: 100}}, - queuedJobs: testfixtures.WithGangAnnotationsJobs(testfixtures.N16CpuJobs("queue1", testfixtures.PriorityClass0, 3)), + queuedJobs: testfixtures.WithGangAnnotationsJobs(testfixtures.N16Cpu128GiJobs("queue1", testfixtures.PriorityClass0, 3)), expectedScheduledIndices: nil, }, @@ -292,12 +294,12 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { }, queues: []*database.Queue{{Name: "queue1", Weight: 100}, {Name: "queue2", Weight: 100}}, - existingJobs: testfixtures.WithGangAnnotationsJobs(testfixtures.N16CpuJobs("queue1", testfixtures.PriorityClass0, 2)), + existingJobs: testfixtures.WithGangAnnotationsJobs(testfixtures.N16Cpu128GiJobs("queue1", testfixtures.PriorityClass0, 2)), existingRunningIndices: map[string]map[string][]int{ "executor1": {"executor1-node": {0, 1}}, }, - queuedJobs: testfixtures.N16CpuJobs("queue2", testfixtures.PriorityClass1, 4), + queuedJobs: testfixtures.N16Cpu128GiJobs("queue2", testfixtures.PriorityClass1, 4), expectedPreemptedIndices: []int{0, 1}, expectedScheduledIndices: map[string][]int{ @@ -311,12 +313,12 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { executors: []*schedulerobjects.Executor{testfixtures.Test1Node32CoreExecutor("executor1")}, queues: []*database.Queue{{Name: "queue1", Weight: 100}, {Name: "queue2", Weight: 100}}, - existingJobs: testfixtures.WithGangAnnotationsJobs(testfixtures.N16CpuJobs("queue1", testfixtures.PriorityClass0, 2)), + existingJobs: testfixtures.WithGangAnnotationsJobs(testfixtures.N16Cpu128GiJobs("queue1", testfixtures.PriorityClass0, 2)), existingRunningIndices: map[string]map[string][]int{ "executor1": {"executor1-node": {0, 1}}, }, - queuedJobs: testfixtures.N16CpuJobs("queue2", testfixtures.PriorityClass0, 1), + queuedJobs: testfixtures.N16Cpu128GiJobs("queue2", testfixtures.PriorityClass0, 1), expectedPreemptedIndices: []int{0, 1}, expectedScheduledIndices: map[string][]int{ @@ -366,19 +368,12 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { for executorId, jobsByNodeName := range tc.existingRunningIndices { for nodeName, jobIndices := range jobsByNodeName { node := nodes[executorId][nodeName] - for _, i := range jobIndices { job := tc.existingJobs[i].WithQueued(false).WithNewRun(executorId, nodeName) jobsToUpsert = append(jobsToUpsert, job) run := job.LatestRun() node.StateByJobRunId[run.Id().String()] = schedulerobjects.JobRunState_RUNNING - - req := PodRequirementFromLegacySchedulerJob(job, tc.schedulingConfig.Preemption.PriorityClasses) - node, err = nodedb.BindPodToNode(req, node) - require.NoError(t, err) } - - 
nodes[executorId][nodeName] = node } } @@ -422,15 +417,14 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { assert.Equal(t, len(jobIndices), sctx.NumScheduledJobs) expectedScheduledResources := schedulerobjects.ResourceList{} - expectedScheduledResourcesByPriority := schedulerobjects.QuantityByPriorityAndResourceType{} + expectedScheduledResourcesByPriorityClass := make(schedulerobjects.QuantityByTAndResourceType[string]) for _, i := range jobIndices { job := tc.queuedJobs[i] - req := job.JobSchedulingInfo().ObjectRequirements[0].GetPodRequirements() - expectedScheduledResources.AddV1ResourceList(req.ResourceRequirements.Requests) - expectedScheduledResourcesByPriority.AddV1ResourceList(req.Priority, req.ResourceRequirements.Requests) + expectedScheduledResources.AddV1ResourceList(job.GetResourceRequirements().Requests) + expectedScheduledResourcesByPriorityClass.AddV1ResourceList(job.GetPriorityClassName(), job.GetResourceRequirements().Requests) } - assert.Equal(t, expectedScheduledResources, sctx.ScheduledResources) - assert.Equal(t, expectedScheduledResourcesByPriority, sctx.ScheduledResourcesByPriority) + assert.True(t, expectedScheduledResources.Equal(sctx.ScheduledResources)) + assert.True(t, expectedScheduledResourcesByPriorityClass.Equal(sctx.ScheduledResourcesByPriorityClass)) } scheduledJobs := ScheduledJobsFromSchedulerResult[*jobdb.Job](schedulerResult) @@ -469,25 +463,24 @@ func TestLegacySchedulingAlgo_TestSchedule(t *testing.T) { assert.Equal(t, len(tc.expectedPreemptedIndices), numPreemptedJobs) expectedPreemptedResources := schedulerobjects.ResourceList{} - expectedPreemptedResourcesByPriority := schedulerobjects.QuantityByPriorityAndResourceType{} + expectedPreemptedResourcesByPriorityClass := make(schedulerobjects.QuantityByTAndResourceType[string]) for _, i := range tc.expectedPreemptedIndices { job := tc.existingJobs[i] - req := job.JobSchedulingInfo().ObjectRequirements[0].GetPodRequirements() - expectedPreemptedResources.AddV1ResourceList(req.ResourceRequirements.Requests) - expectedPreemptedResourcesByPriority.AddV1ResourceList(req.Priority, req.ResourceRequirements.Requests) + expectedPreemptedResources.AddV1ResourceList(job.GetResourceRequirements().Requests) + expectedPreemptedResourcesByPriorityClass.AddV1ResourceList(job.GetPriorityClassName(), job.GetResourceRequirements().Requests) } preemptedResources := schedulerobjects.ResourceList{} - preemptedResourcesByPriority := schedulerobjects.QuantityByPriorityAndResourceType{} + preemptedResourcesByPriority := make(schedulerobjects.QuantityByTAndResourceType[string]) for _, sctx := range schedulingContextByExecutor { for resourceType, quantity := range sctx.EvictedResources.Resources { preemptedResources.AddQuantity(resourceType, quantity) } - for p, rl := range sctx.EvictedResourcesByPriority { + for p, rl := range sctx.EvictedResourcesByPriorityClass { preemptedResourcesByPriority.AddResourceList(p, rl) } } - assert.Equal(t, expectedPreemptedResources, preemptedResources) - assert.Equal(t, expectedPreemptedResourcesByPriority, preemptedResourcesByPriority) + assert.True(t, expectedPreemptedResources.Equal(preemptedResources)) + assert.True(t, expectedPreemptedResourcesByPriorityClass.Equal(preemptedResourcesByPriority)) }) } } @@ -680,3 +673,35 @@ func TestLegacySchedulingAlgo_TestSchedule_ExecutorOrdering(t *testing.T) { }) } } + +func BenchmarkNodeDbConstruction(b *testing.B) { + for e := 1; e <= 4; e++ { + numNodes := int(math.Pow10(e)) + b.Run(fmt.Sprintf("%d nodes", numNodes), func(b 
*testing.B) { + jobs := testfixtures.N1Cpu4GiJobs("queue-alice", testfixtures.PriorityClass0, 32*numNodes) + nodes := testfixtures.N32CpuNodes(numNodes, testfixtures.TestPriorities) + for i, node := range nodes { + for j := 32 * i; j < 32*(i+1); j++ { + jobs[j] = jobs[j].WithNewRun("executor-01", node.Name) + } + } + rand.Shuffle(len(jobs), func(i, j int) { jobs[i], jobs[j] = jobs[j], jobs[i] }) + b.ResetTimer() + for n := 0; n < b.N; n++ { + b.StopTimer() + algo, err := NewFairSchedulingAlgo( + testfixtures.TestSchedulingConfig(), + time.Second*5, + nil, + nil, + nil, + ) + require.NoError(b, err) + b.StartTimer() + + _, err = algo.constructNodeDb(testfixtures.TestPriorityClasses, jobs, nodes) + require.NoError(b, err) + } + }) + } +} diff --git a/internal/scheduler/submitcheck.go b/internal/scheduler/submitcheck.go index 700cb541cb8..77acce6ac81 100644 --- a/internal/scheduler/submitcheck.go +++ b/internal/scheduler/submitcheck.go @@ -15,7 +15,10 @@ import ( "k8s.io/apimachinery/pkg/util/clock" "github.com/armadaproject/armada/internal/armada/configuration" + armadaslices "github.com/armadaproject/armada/internal/common/slices" + schedulercontext "github.com/armadaproject/armada/internal/scheduler/context" "github.com/armadaproject/armada/internal/scheduler/database" + "github.com/armadaproject/armada/internal/scheduler/jobdb" "github.com/armadaproject/armada/internal/scheduler/nodedb" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" "github.com/armadaproject/armada/pkg/api" @@ -34,8 +37,8 @@ type schedulingResult struct { const maxJobSchedulingResults = 10000 type SubmitScheduleChecker interface { - CheckPodRequirements(podRequirement *schedulerobjects.PodRequirements) (bool, string) CheckApiJobs(jobs []*api.Job) (bool, string) + CheckJobDbJobs(jobs []*jobdb.Job) (bool, string) } type SubmitChecker struct { @@ -132,51 +135,41 @@ func (srv *SubmitChecker) updateExecutors(ctx context.Context) { srv.jobSchedulingResultsCache.Purge() } -func (srv *SubmitChecker) CheckPodRequirements(req *schedulerobjects.PodRequirements) (bool, string) { - schedulingResult := srv.getSchedulingResult(req) - if !schedulingResult.isSchedulable { - return schedulingResult.isSchedulable, fmt.Sprintf("requirements unschedulable:\n%s", schedulingResult.reason) - } - return true, "" +func (srv *SubmitChecker) CheckApiJobs(jobs []*api.Job) (bool, string) { + return srv.check(schedulercontext.JobSchedulingContextsFromJobs(srv.priorityClasses, jobs)) } -func (srv *SubmitChecker) CheckApiJobs(jobs []*api.Job) (bool, string) { +func (srv *SubmitChecker) CheckJobDbJobs(jobs []*jobdb.Job) (bool, string) { + return srv.check(schedulercontext.JobSchedulingContextsFromJobs(srv.priorityClasses, jobs)) +} + +func (srv *SubmitChecker) check(jctxs []*schedulercontext.JobSchedulingContext) (bool, string) { // First, check if all jobs can be scheduled individually. - for i, job := range jobs { - req := PodRequirementFromLegacySchedulerJob(job, srv.priorityClasses) - schedulingResult := srv.getSchedulingResult(req) + for i, jctx := range jctxs { + schedulingResult := srv.getIndividualSchedulingResult(jctx) if !schedulingResult.isSchedulable { return schedulingResult.isSchedulable, fmt.Sprintf("%d-th job unschedulable:\n%s", i, schedulingResult.reason) } } // Then, check if all gangs can be scheduled. 
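The gang check below groups job scheduling contexts by their gang-id annotation using a generic helper from internal/common/slices (imported as armadaslices above). As a rough sketch only — the real armadaslices.GroupByFunc may well differ in signature and implementation details — such a helper boils down to:

    func GroupByFunc[K comparable, V any](s []V, keyOf func(V) K) map[K][]V {
    	rv := make(map[K][]V)
    	for _, v := range s {
    		key := keyOf(v)
    		rv[key] = append(rv[key], v)
    	}
    	return rv
    }

With the gang-id annotation as the key, jobs that carry no annotation all map to the empty key, which the loop below skips.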
- for gangId, jobs := range GroupJobsByAnnotation(srv.gangIdAnnotation, jobs) { + for gangId, jctxsInGang := range armadaslices.GroupByFunc( + jctxs, + func(jctx *schedulercontext.JobSchedulingContext) string { + return jctx.Job.GetAnnotations()[srv.gangIdAnnotation] + }, + ) { if gangId == "" { continue } - reqs := PodRequirementsFromLegacySchedulerJobs(jobs, srv.priorityClasses) - schedulingResult := srv.check(reqs) - if !schedulingResult.isSchedulable { + if schedulingResult := srv.getSchedulingResult(jctxsInGang); !schedulingResult.isSchedulable { return schedulingResult.isSchedulable, fmt.Sprintf("gang %s is unschedulable:\n%s", gangId, schedulingResult.reason) } } return true, "" } -func GroupJobsByAnnotation(annotation string, jobs []*api.Job) map[string][]*api.Job { - rv := make(map[string][]*api.Job) - for _, job := range jobs { - if len(job.Annotations) == 0 { - rv[""] = append(rv[""], job) - } else { - value := job.Annotations[annotation] - rv[value] = append(rv[value], job) - } - } - return rv -} - -func (srv *SubmitChecker) getSchedulingResult(req *schedulerobjects.PodRequirements) schedulingResult { +func (srv *SubmitChecker) getIndividualSchedulingResult(jctx *schedulercontext.JobSchedulingContext) schedulingResult { + req := jctx.PodRequirements srv.mu.Lock() schedulingKey := srv.schedulingKeyGenerator.Key( req.NodeSelector, @@ -190,7 +183,7 @@ func (srv *SubmitChecker) getSchedulingResult(req *schedulerobjects.PodRequireme if obj, ok := srv.jobSchedulingResultsCache.Get(schedulingKey); ok { result = obj.(schedulingResult) } else { - result = srv.check([]*schedulerobjects.PodRequirements{req}) + result = srv.getSchedulingResult([]*schedulercontext.JobSchedulingContext{jctx}) srv.jobSchedulingResultsCache.Add(schedulingKey, result) } if !result.isSchedulable { @@ -199,9 +192,9 @@ func (srv *SubmitChecker) getSchedulingResult(req *schedulerobjects.PodRequireme return schedulingResult{isSchedulable: true} } -// Check if a set of pods can be scheduled onto some cluster. -func (srv *SubmitChecker) check(reqs []*schedulerobjects.PodRequirements) schedulingResult { - if len(reqs) == 0 { +// Check if a set of jobs can be scheduled onto some cluster. +func (srv *SubmitChecker) getSchedulingResult(jctxs []*schedulercontext.JobSchedulingContext) schedulingResult { + if len(jctxs) == 0 { return schedulingResult{isSchedulable: true, reason: ""} } @@ -210,19 +203,23 @@ func (srv *SubmitChecker) check(reqs []*schedulerobjects.PodRequirements) schedu srv.mu.Lock() executorById := maps.Clone(srv.executorById) srv.mu.Unlock() - executorById = srv.filterStaleNodeDbs(executorById) + executorById = srv.filterStaleExecutors(executorById) if len(executorById) == 0 { return schedulingResult{isSchedulable: false, reason: "no executor clusters available"} } - canSchedule := false + isSchedulable := false var sb strings.Builder for id, executor := range executorById { nodeDb := executor.nodeDb txn := nodeDb.Txn(true) - reports, ok, err := nodeDb.ScheduleManyWithTxn(txn, reqs) + // TODO: This doesn't account for per-queue limits or the NodeUniformityLabel. + // We should create a GangScheduler for this instead. 
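	// Note: the write transaction opened above exists only so the gang's pods can be
	// trial-bound to nodes; it is aborted immediately after ScheduleManyWithTxn returns,
	// so this feasibility check never mutates the executor's node db.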
+ ok, err := nodeDb.ScheduleManyWithTxn(txn, jctxs) txn.Abort() + isSchedulable = isSchedulable || ok + sb.WriteString(id) if err != nil { sb.WriteString(err.Error()) @@ -230,31 +227,35 @@ func (srv *SubmitChecker) check(reqs []*schedulerobjects.PodRequirements) schedu continue } - canSchedule = canSchedule || ok numSuccessfullyScheduled := 0 - for _, report := range reports { - if report.Node != nil { + for _, jctx := range jctxs { + pctx := jctx.PodSchedulingContext + if pctx != nil && pctx.NodeId != "" { numSuccessfullyScheduled++ } } - if len(reqs) == 1 { + if len(jctxs) == 1 { sb.WriteString(":\n") - for _, report := range reports { - sb.WriteString(report.String()) + for _, jctx := range jctxs { + pctx := jctx.PodSchedulingContext + if pctx == nil { + continue + } + sb.WriteString(pctx.String()) sb.WriteString("\n") } sb.WriteString("---") sb.WriteString("\n") } else { sb.WriteString(":") - sb.WriteString(fmt.Sprintf(" %d out of %d pods schedulable\n", numSuccessfullyScheduled, len(reqs))) + sb.WriteString(fmt.Sprintf(" %d out of %d pods schedulable\n", numSuccessfullyScheduled, len(jctxs))) } } - return schedulingResult{isSchedulable: canSchedule, reason: sb.String()} + return schedulingResult{isSchedulable: isSchedulable, reason: sb.String()} } -func (srv *SubmitChecker) filterStaleNodeDbs(executorsById map[string]minimalExecutor) map[string]minimalExecutor { +func (srv *SubmitChecker) filterStaleExecutors(executorsById map[string]minimalExecutor) map[string]minimalExecutor { rv := make(map[string]minimalExecutor) for id, executor := range executorsById { if srv.clock.Since(executor.updateTime) < srv.executorTimeout { @@ -279,7 +280,14 @@ func (srv *SubmitChecker) constructNodeDb(nodes []*schedulerobjects.Node) (*node if err != nil { return nil, err } - err = nodeDb.UpsertMany(nodes) + txn := nodeDb.Txn(true) + defer txn.Abort() + for _, node := range nodes { + if err := nodeDb.CreateAndInsertWithJobDbJobsWithTxn(txn, nil, node); err != nil { + return nil, err + } + } + txn.Commit() if err != nil { return nil, err } diff --git a/internal/scheduler/submitcheck_test.go b/internal/scheduler/submitcheck_test.go index 9949090d491..a95f3d9abbf 100644 --- a/internal/scheduler/submitcheck_test.go +++ b/internal/scheduler/submitcheck_test.go @@ -15,13 +15,14 @@ import ( "github.com/armadaproject/armada/internal/armada/configuration" "github.com/armadaproject/armada/internal/common/util" + "github.com/armadaproject/armada/internal/scheduler/jobdb" schedulermocks "github.com/armadaproject/armada/internal/scheduler/mocks" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" "github.com/armadaproject/armada/internal/scheduler/testfixtures" "github.com/armadaproject/armada/pkg/api" ) -func TestSubmitChecker_CheckPodRequirements(t *testing.T) { +func TestSubmitChecker_CheckJobDbJobs(t *testing.T) { defaultTimeout := 15 * time.Minute baseTime := time.Now().UTC() expiredTime := baseTime.Add(-defaultTimeout).Add(-1 * time.Second) @@ -30,42 +31,42 @@ func TestSubmitChecker_CheckPodRequirements(t *testing.T) { executorTimout time.Duration config configuration.SchedulingConfig executors []*schedulerobjects.Executor - podRequirement *schedulerobjects.PodRequirements + job *jobdb.Job expectPass bool }{ "one job schedules": { executorTimout: defaultTimeout, config: testfixtures.TestSchedulingConfig(), executors: []*schedulerobjects.Executor{testExecutor(baseTime)}, - podRequirement: testfixtures.Test1CpuPodReqs("queue", util.ULID(), 1), + job: testfixtures.Test1Cpu4GiJob("queue", 
testfixtures.PriorityClass1), expectPass: true, }, "no jobs schedule due to resources": { executorTimout: defaultTimeout, config: testfixtures.TestSchedulingConfig(), executors: []*schedulerobjects.Executor{testExecutor(baseTime)}, - podRequirement: testfixtures.Test32CpuPodReqs("queue", util.ULID(), 1), + job: testfixtures.Test32Cpu256GiJob("queue", testfixtures.PriorityClass1), expectPass: false, }, "no jobs schedule due to selector": { executorTimout: defaultTimeout, config: testfixtures.TestSchedulingConfig(), executors: []*schedulerobjects.Executor{testExecutor(baseTime)}, - podRequirement: testfixtures.WithNodeSelectorPodReq(map[string]string{"foo": "bar"}, testfixtures.Test1CpuPodReqs("queue", util.ULID(), 1)), + job: testfixtures.WithNodeSelectorJob(map[string]string{"foo": "bar"}, testfixtures.Test1Cpu4GiJob("queue", testfixtures.PriorityClass1)), expectPass: false, }, "no jobs schedule due to executor timeout": { executorTimout: defaultTimeout, config: testfixtures.TestSchedulingConfig(), executors: []*schedulerobjects.Executor{testExecutor(expiredTime)}, - podRequirement: testfixtures.Test1CpuPodReqs("queue", util.ULID(), 1), + job: testfixtures.Test1Cpu4GiJob("queue", testfixtures.PriorityClass1), expectPass: false, }, "multiple executors, 1 expired": { executorTimout: defaultTimeout, config: testfixtures.TestSchedulingConfig(), executors: []*schedulerobjects.Executor{testExecutor(expiredTime), testExecutor(baseTime)}, - podRequirement: testfixtures.Test1CpuPodReqs("queue", util.ULID(), 1), + job: testfixtures.Test1Cpu4GiJob("queue", testfixtures.PriorityClass1), expectPass: true, }, } @@ -81,12 +82,12 @@ func TestSubmitChecker_CheckPodRequirements(t *testing.T) { submitCheck := NewSubmitChecker(tc.executorTimout, tc.config, mockExecutorRepo) submitCheck.clock = fakeClock submitCheck.updateExecutors(ctx) - result, msg := submitCheck.CheckPodRequirements(tc.podRequirement) - assert.Equal(t, tc.expectPass, result) + isSchedulable, reason := submitCheck.CheckJobDbJobs([]*jobdb.Job{tc.job}) + assert.Equal(t, tc.expectPass, isSchedulable) if !tc.expectPass { - assert.NotEqual(t, "", msg) + assert.NotEqual(t, "", reason) } - logrus.Info(msg) + logrus.Info(reason) }) } } diff --git a/internal/scheduler/testfixtures/testfixtures.go b/internal/scheduler/testfixtures/testfixtures.go index 898e30c0b88..6396103ca4a 100644 --- a/internal/scheduler/testfixtures/testfixtures.go +++ b/internal/scheduler/testfixtures/testfixtures.go @@ -18,7 +18,6 @@ import ( "github.com/armadaproject/armada/internal/armada/configuration" "github.com/armadaproject/armada/internal/common/util" - schedulerconfig "github.com/armadaproject/armada/internal/scheduler/configuration" "github.com/armadaproject/armada/internal/scheduler/database" "github.com/armadaproject/armada/internal/scheduler/jobdb" "github.com/armadaproject/armada/internal/scheduler/schedulerobjects" @@ -88,21 +87,33 @@ func ContextWithDefaultLogger(ctx context.Context) context.Context { func TestSchedulingConfig() configuration.SchedulingConfig { return configuration.SchedulingConfig{ - ResourceScarcity: map[string]float64{"cpu": 1, "memory": 0}, + ResourceScarcity: map[string]float64{"cpu": 1}, Preemption: configuration.PreemptionConfig{ PriorityClasses: maps.Clone(TestPriorityClasses), DefaultPriorityClass: TestDefaultPriorityClass, NodeEvictionProbability: 1.0, NodeOversubscriptionEvictionProbability: 1.0, }, - IndexedResources: TestResources, + IndexedResources: TestResources, + IndexedNodeLabels: TestIndexedNodeLabels, + 
DominantResourceFairnessResourcesToConsider: TestResourceNames, ExecutorTimeout: 15 * time.Minute, MaxUnacknowledgedJobsPerExecutor: math.MaxInt, } } -func WithMaxUnacknowledgedJobsPerExecutor(i uint, config configuration.SchedulingConfig) configuration.SchedulingConfig { - config.MaxUnacknowledgedJobsPerExecutor = i +func WithMaxUnacknowledgedJobsPerExecutorConfig(v uint, config configuration.SchedulingConfig) configuration.SchedulingConfig { + config.MaxUnacknowledgedJobsPerExecutor = v + return config +} + +func WithProtectedFractionOfFairShareConfig(v float64, config configuration.SchedulingConfig) configuration.SchedulingConfig { + config.Preemption.ProtectedFractionOfFairShare = v + return config +} + +func WithDominantResourceFairnessConfig(config configuration.SchedulingConfig) configuration.SchedulingConfig { + config.FairnessModel = configuration.DominantResourceFairness return config } @@ -126,12 +137,18 @@ func WithRoundLimitsPoolConfig(limits map[string]map[string]float64, config conf return config } -func WithPerPriorityLimitsConfig(limits map[int32]map[string]float64, config configuration.SchedulingConfig) configuration.SchedulingConfig { - for k, v := range config.Preemption.PriorityClasses { - config.Preemption.PriorityClasses[k] = configuration.PriorityClass{ - Priority: v.Priority, - Preemptible: v.Preemptible, - MaximumResourceFractionPerQueue: limits[v.Priority], +func WithPerPriorityLimitsConfig(limits map[string]map[string]float64, config configuration.SchedulingConfig) configuration.SchedulingConfig { + for priorityClassName, limit := range limits { + priorityClass, ok := config.Preemption.PriorityClasses[priorityClassName] + if !ok { + panic(fmt.Sprintf("no priority class with name %s", priorityClassName)) + } + // We need to make a copy to avoid mutating the priorityClasses, which are used by other tests too. 
+ config.Preemption.PriorityClasses[priorityClassName] = configuration.PriorityClass{ + Priority: priorityClass.Priority, + Preemptible: priorityClass.Preemptible, + MaximumResourceFractionPerQueue: limit, + MaximumResourceFractionPerQueueByPool: priorityClass.MaximumResourceFractionPerQueueByPool, } } return config @@ -210,8 +227,20 @@ func WithNodeSelectorPodReq(selector map[string]string, req *schedulerobjects.Po return req } -func WithNodeAffinityPodReqs(nodeSelectorTerms []v1.NodeSelectorTerm, reqs []*schedulerobjects.PodRequirements) []*schedulerobjects.PodRequirements { - for _, req := range reqs { +func WithNodeUniformityLabelAnnotationJobs(label string, jobs []*jobdb.Job) []*jobdb.Job { + for _, job := range jobs { + req := job.PodRequirements() + if req.Annotations == nil { + req.Annotations = make(map[string]string) + } + req.Annotations[configuration.GangNodeUniformityLabelAnnotation] = label + } + return jobs +} + +func WithNodeAffinityJobs(nodeSelectorTerms []v1.NodeSelectorTerm, jobs []*jobdb.Job) []*jobdb.Job { + for _, job := range jobs { + req := job.PodRequirements() if req.Affinity == nil { req.Affinity = &v1.Affinity{} } @@ -226,7 +255,7 @@ func WithNodeAffinityPodReqs(nodeSelectorTerms []v1.NodeSelectorTerm, reqs []*sc nodeSelectorTerms..., ) } - return reqs + return jobs } func WithGangAnnotationsPodReqs(reqs []*schedulerobjects.PodRequirements) []*schedulerobjects.PodRequirements { @@ -248,25 +277,34 @@ func WithAnnotationsPodReqs(annotations map[string]string, reqs []*schedulerobje return reqs } -func WithRequestsPodReqs(rl schedulerobjects.ResourceList, reqs []*schedulerobjects.PodRequirements) []*schedulerobjects.PodRequirements { - for _, req := range reqs { - maps.Copy( - req.ResourceRequirements.Requests, - schedulerobjects.V1ResourceListFromResourceList(rl), - ) +func WithRequestsJobs(rl schedulerobjects.ResourceList, jobs []*jobdb.Job) []*jobdb.Job { + for _, job := range jobs { + for _, req := range job.JobSchedulingInfo().GetObjectRequirements() { + maps.Copy( + req.GetPodRequirements().ResourceRequirements.Requests, + schedulerobjects.V1ResourceListFromResourceList(rl), + ) + } } - return reqs + return jobs } func WithNodeSelectorJobs(selector map[string]string, jobs []*jobdb.Job) []*jobdb.Job { for _, job := range jobs { - for _, req := range job.GetRequirements(nil).GetObjectRequirements() { + for _, req := range job.JobSchedulingInfo().GetObjectRequirements() { req.GetPodRequirements().NodeSelector = maps.Clone(selector) } } return jobs } +func WithNodeSelectorJob(selector map[string]string, job *jobdb.Job) *jobdb.Job { + for _, req := range job.JobSchedulingInfo().GetObjectRequirements() { + req.GetPodRequirements().NodeSelector = maps.Clone(selector) + } + return job +} + func WithGangAnnotationsJobs(jobs []*jobdb.Job) []*jobdb.Job { gangId := uuid.NewString() gangCardinality := fmt.Sprintf("%d", len(jobs)) @@ -278,7 +316,7 @@ func WithGangAnnotationsJobs(jobs []*jobdb.Job) []*jobdb.Job { func WithAnnotationsJobs(annotations map[string]string, jobs []*jobdb.Job) []*jobdb.Job { for _, job := range jobs { - for _, req := range job.GetRequirements(nil).GetObjectRequirements() { + for _, req := range job.JobSchedulingInfo().GetObjectRequirements() { if req.GetPodRequirements().Annotations == nil { req.GetPodRequirements().Annotations = make(map[string]string) } @@ -288,26 +326,34 @@ func WithAnnotationsJobs(annotations map[string]string, jobs []*jobdb.Job) []*jo return jobs } -func N1CpuJobs(queue string, priorityClassName string, n int) []*jobdb.Job { 
+func N1Cpu4GiJobs(queue string, priorityClassName string, n int) []*jobdb.Job { rv := make([]*jobdb.Job, n) for i := 0; i < n; i++ { - rv[i] = Test1CpuJob(queue, priorityClassName) + rv[i] = Test1Cpu4GiJob(queue, priorityClassName) } return rv } -func N16CpuJobs(queue string, priorityClassName string, n int) []*jobdb.Job { +func N1Cpu16GiJobs(queue string, priorityClassName string, n int) []*jobdb.Job { rv := make([]*jobdb.Job, n) for i := 0; i < n; i++ { - rv[i] = Test16CpuJob(queue, priorityClassName) + rv[i] = Test1Cpu16GiJob(queue, priorityClassName) } return rv } -func N32CpuJobs(queue string, priorityClassName string, n int) []*jobdb.Job { +func N16Cpu128GiJobs(queue string, priorityClassName string, n int) []*jobdb.Job { rv := make([]*jobdb.Job, n) for i := 0; i < n; i++ { - rv[i] = Test32CpuJob(queue, priorityClassName) + rv[i] = Test16Cpu128GiJob(queue, priorityClassName) + } + return rv +} + +func N32Cpu256GiJobs(queue string, priorityClassName string, n int) []*jobdb.Job { + rv := make([]*jobdb.Job, n) + for i := 0; i < n; i++ { + rv[i] = Test32Cpu256GiJob(queue, priorityClassName) } return rv } @@ -357,19 +403,24 @@ func TestJob(queue string, jobId ulid.ULID, priorityClassName string, req *sched ) } -func Test1CpuJob(queue string, priorityClassName string) *jobdb.Job { +func Test1Cpu4GiJob(queue string, priorityClassName string) *jobdb.Job { + jobId := util.ULID() + return TestJob(queue, jobId, priorityClassName, Test1Cpu4GiPodReqs(queue, jobId, extractPriority(priorityClassName))) +} + +func Test1Cpu16GiJob(queue string, priorityClassName string) *jobdb.Job { jobId := util.ULID() - return TestJob(queue, jobId, priorityClassName, Test1CpuPodReqs(queue, jobId, extractPriority(priorityClassName))) + return TestJob(queue, jobId, priorityClassName, Test1Cpu16GiPodReqs(queue, jobId, extractPriority(priorityClassName))) } -func Test16CpuJob(queue string, priorityClassName string) *jobdb.Job { +func Test16Cpu128GiJob(queue string, priorityClassName string) *jobdb.Job { jobId := util.ULID() - return TestJob(queue, jobId, priorityClassName, Test16CpuPodReqs(queue, jobId, extractPriority(priorityClassName))) + return TestJob(queue, jobId, priorityClassName, Test16Cpu128GiPodReqs(queue, jobId, extractPriority(priorityClassName))) } -func Test32CpuJob(queue string, priorityClassName string) *jobdb.Job { +func Test32Cpu256GiJob(queue string, priorityClassName string) *jobdb.Job { jobId := util.ULID() - return TestJob(queue, jobId, priorityClassName, Test32CpuPodReqs(queue, jobId, extractPriority(priorityClassName))) + return TestJob(queue, jobId, priorityClassName, Test32Cpu256GiPodReqs(queue, jobId, extractPriority(priorityClassName))) } func Test1GpuJob(queue string, priorityClassName string) *jobdb.Job { @@ -380,7 +431,7 @@ func Test1GpuJob(queue string, priorityClassName string) *jobdb.Job { func N1CpuPodReqs(queue string, priority int32, n int) []*schedulerobjects.PodRequirements { rv := make([]*schedulerobjects.PodRequirements, n) for i := 0; i < n; i++ { - rv[i] = Test1CpuPodReqs(queue, util.ULID(), priority) + rv[i] = Test1Cpu4GiPodReqs(queue, util.ULID(), priority) } return rv } @@ -388,7 +439,7 @@ func N1CpuPodReqs(queue string, priority int32, n int) []*schedulerobjects.PodRe func N16CpuPodReqs(queue string, priority int32, n int) []*schedulerobjects.PodRequirements { rv := make([]*schedulerobjects.PodRequirements, n) for i := 0; i < n; i++ { - rv[i] = Test16CpuPodReqs(queue, util.ULID(), priority) + rv[i] = Test16Cpu128GiPodReqs(queue, util.ULID(), priority) } return rv } @@ 
-396,7 +447,7 @@ func N16CpuPodReqs(queue string, priority int32, n int) []*schedulerobjects.PodR func N32CpuPodReqs(queue string, priority int32, n int) []*schedulerobjects.PodRequirements { rv := make([]*schedulerobjects.PodRequirements, n) for i := 0; i < n; i++ { - rv[i] = Test32CpuPodReqs(queue, util.ULID(), priority) + rv[i] = Test32Cpu256GiPodReqs(queue, util.ULID(), priority) } return rv } @@ -413,15 +464,12 @@ func TestPodReqs(queue string, jobId ulid.ULID, priority int32, requests v1.Reso return &schedulerobjects.PodRequirements{ Priority: priority, ResourceRequirements: v1.ResourceRequirements{Requests: requests}, - Annotations: map[string]string{ - schedulerconfig.JobIdAnnotation: jobId.String(), - schedulerconfig.QueueAnnotation: queue, - }, - NodeSelector: make(map[string]string), + Annotations: make(map[string]string), + NodeSelector: make(map[string]string), } } -func Test1CpuPodReqs(queue string, jobId ulid.ULID, priority int32) *schedulerobjects.PodRequirements { +func Test1Cpu4GiPodReqs(queue string, jobId ulid.ULID, priority int32) *schedulerobjects.PodRequirements { return TestPodReqs( queue, jobId, @@ -433,7 +481,19 @@ func Test1CpuPodReqs(queue string, jobId ulid.ULID, priority int32) *schedulerob ) } -func Test16CpuPodReqs(queue string, jobId ulid.ULID, priority int32) *schedulerobjects.PodRequirements { +func Test1Cpu16GiPodReqs(queue string, jobId ulid.ULID, priority int32) *schedulerobjects.PodRequirements { + return TestPodReqs( + queue, + jobId, + priority, + v1.ResourceList{ + "cpu": resource.MustParse("1"), + "memory": resource.MustParse("16Gi"), + }, + ) +} + +func Test16Cpu128GiPodReqs(queue string, jobId ulid.ULID, priority int32) *schedulerobjects.PodRequirements { req := TestPodReqs( queue, jobId, @@ -452,7 +512,7 @@ func Test16CpuPodReqs(queue string, jobId ulid.ULID, priority int32) *schedulero return req } -func Test32CpuPodReqs(queue string, jobId ulid.ULID, priority int32) *schedulerobjects.PodRequirements { +func Test32Cpu256GiPodReqs(queue string, jobId ulid.ULID, priority int32) *schedulerobjects.PodRequirements { req := TestPodReqs( queue, jobId, @@ -500,10 +560,7 @@ func TestUnitReqs(priority int32) *schedulerobjects.PodRequirements { "memory": resource.MustParse("1Gi"), }, }, - Annotations: map[string]string{ - schedulerconfig.JobIdAnnotation: util.NewULID(), - schedulerconfig.QueueAnnotation: TestQueue, - }, + Annotations: make(map[string]string), NodeSelector: make(map[string]string), } } diff --git a/internal/scheduleringester/dbops.go b/internal/scheduleringester/dbops.go index f4b448dec47..b4300cc868a 100644 --- a/internal/scheduleringester/dbops.go +++ b/internal/scheduleringester/dbops.go @@ -99,7 +99,7 @@ func AppendDbOperation(ops []DbOperation, op DbOperation) []DbOperation { break } } - return discardNilOps(ops) // TODO: Can be made more efficient. 
+ return discardNilOps(ops) } func discardNilOps(ops []DbOperation) []DbOperation { diff --git a/internal/scheduleringester/instructions.go b/internal/scheduleringester/instructions.go index 3ff86553121..fa0e70c09a9 100644 --- a/internal/scheduleringester/instructions.go +++ b/internal/scheduleringester/instructions.go @@ -48,7 +48,7 @@ func NewInstructionConverter( func (c *InstructionConverter) Convert(_ context.Context, sequencesWithIds *ingest.EventSequencesWithIds) *DbOperationsWithMessageIds { operations := make([]DbOperation, 0) for _, es := range sequencesWithIds.EventSequences { - for _, op := range c.convertSequence(es) { + for _, op := range c.dbOperationsFromEventSequence(es) { operations = AppendDbOperation(operations, op) } } @@ -58,14 +58,13 @@ func (c *InstructionConverter) Convert(_ context.Context, sequencesWithIds *inge } } -func (c *InstructionConverter) convertSequence(es *armadaevents.EventSequence) []DbOperation { +func (c *InstructionConverter) dbOperationsFromEventSequence(es *armadaevents.EventSequence) []DbOperation { meta := eventSequenceCommon{ queue: es.Queue, jobset: es.JobSetName, user: es.UserId, groups: es.Groups, } - operations := make([]DbOperation, 0, len(es.Events)) for idx, event := range es.Events { eventTime := time.Now().UTC() @@ -117,7 +116,7 @@ func (c *InstructionConverter) convertSequence(es *armadaevents.EventSequence) [ } if err != nil { c.metrics.RecordPulsarMessageError(metrics.PulsarMessageErrorProcessing) - log.WithError(err).Warnf("Could not convert event at index %d.", idx) + log.WithError(err).Errorf("Could not convert event at index %d.", idx) } else { operations = append(operations, operationsFromEvent...) } @@ -148,7 +147,7 @@ func (c *InstructionConverter) handleSubmitJob(job *armadaevents.SubmitJob, subm // Produce a minimal representation of the job for the scheduler. // To avoid the scheduler needing to load the entire job spec. - schedulingInfo, err := c.schedulingInfoFromSubmitJob(job) + schedulingInfo, err := c.schedulingInfoFromSubmitJob(job, submitTime) if err != nil { return nil, err } @@ -357,13 +356,15 @@ func (c *InstructionConverter) handlePartitionMarker(pm *armadaevents.PartitionM // schedulingInfoFromSubmitJob returns a minimal representation of a job // containing only the info needed by the scheduler. -func (c *InstructionConverter) schedulingInfoFromSubmitJob(submitJob *armadaevents.SubmitJob) (*schedulerobjects.JobSchedulingInfo, error) { +func (c *InstructionConverter) schedulingInfoFromSubmitJob(submitJob *armadaevents.SubmitJob, submitTime time.Time) (*schedulerobjects.JobSchedulingInfo, error) { // Component common to all jobs. 
schedulingInfo := &schedulerobjects.JobSchedulingInfo{ Lifetime: submitJob.Lifetime, AtMostOnce: submitJob.AtMostOnce, Preemptible: submitJob.Preemptible, ConcurrencySafe: submitJob.ConcurrencySafe, + SubmitTime: submitTime, + Priority: submitJob.Priority, Version: 0, } @@ -371,12 +372,16 @@ func (c *InstructionConverter) schedulingInfoFromSubmitJob(submitJob *armadaeven switch object := submitJob.MainObject.Object.(type) { case *armadaevents.KubernetesMainObject_PodSpec: podSpec := object.PodSpec.PodSpec - requirements := &schedulerobjects.ObjectRequirements_PodRequirements{ - PodRequirements: adapters.PodRequirementsFromPodSpec(podSpec, c.priorityClasses), - } + schedulingInfo.PriorityClassName = podSpec.PriorityClassName + podRequirements := adapters.PodRequirementsFromPodSpec(podSpec, c.priorityClasses) + podRequirements.Annotations = submitJob.ObjectMeta.Annotations schedulingInfo.ObjectRequirements = append( schedulingInfo.ObjectRequirements, - &schedulerobjects.ObjectRequirements{Requirements: requirements}, + &schedulerobjects.ObjectRequirements{ + Requirements: &schedulerobjects.ObjectRequirements_PodRequirements{ + PodRequirements: podRequirements, + }, + }, ) default: return nil, errors.Errorf("unsupported object type %T", object) diff --git a/internal/scheduleringester/instructions_test.go b/internal/scheduleringester/instructions_test.go index 45debe5c30d..99ecde22f1a 100644 --- a/internal/scheduleringester/instructions_test.go +++ b/internal/scheduleringester/instructions_test.go @@ -201,7 +201,7 @@ func TestConvertSequence(t *testing.T) { t.Run(name, func(t *testing.T) { converter := InstructionConverter{m, f.PriorityClasses, compressor} es := f.NewEventSequence(tc.events...) - results := converter.convertSequence(es) + results := converter.dbOperationsFromEventSequence(es) assertOperationsEqual(t, tc.expected, results) }) } @@ -272,11 +272,14 @@ func assertErrorMessagesEqual(t *testing.T, expectedBytes []byte, actualBytes [] func getExpectedSubmitMessageSchedulingInfo(t *testing.T) *schedulerobjects.JobSchedulingInfo { expectedSubmitSchedulingInfo := &schedulerobjects.JobSchedulingInfo{ - Lifetime: 0, - AtMostOnce: true, - Preemptible: true, - ConcurrencySafe: true, - Version: 0, + Lifetime: 0, + AtMostOnce: true, + Preemptible: true, + ConcurrencySafe: true, + Version: 0, + PriorityClassName: "test-priority", + Priority: 3, + SubmitTime: f.BaseTime, ObjectRequirements: []*schedulerobjects.ObjectRequirements{ { Requirements: &schedulerobjects.ObjectRequirements_PodRequirements{ diff --git a/internal/scheduleringester/schedulerdb.go b/internal/scheduleringester/schedulerdb.go index ad34ada01c9..9b944836c2e 100644 --- a/internal/scheduleringester/schedulerdb.go +++ b/internal/scheduleringester/schedulerdb.go @@ -42,10 +42,9 @@ func NewSchedulerDb( } } -// Store persists all operations in the database. Note that: -// - this function will retry until it either succeeds or a terminal error is encountered -// - this function will take out a postgres lock to ensure that other ingesters are not writing to the database -// at the same time (for details, see acquireLock()) +// Store persists all operations in the database. +// This function retires until it either succeeds or encounters a terminal error. +// This function locks the postgres table to avoid write conflicts; see acquireLock() for details. 
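// In outline (a sketch only; the body below is authoritative):
//
//	return ingest.WithRetry(func() (bool, error) {
//		err := s.db.BeginTxFunc(ctx, txOptions, func(tx pgx.Tx) error {
//			if err := s.acquireLock(lockCtx, tx); err != nil {
//				return err // could not serialise against other ingesters
//			}
//			for _, dbOp := range instructions.Ops {
//				if err := s.WriteDbOp(ctx, tx, dbOp); err != nil {
//					return err
//				}
//			}
//			return nil
//		})
//		return true, err
//	}, s.initialBackOff, s.maxBackOff)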
func (s *SchedulerDb) Store(ctx context.Context, instructions *DbOperationsWithMessageIds) error { return ingest.WithRetry(func() (bool, error) { err := s.db.BeginTxFunc(ctx, pgx.TxOptions{ @@ -53,36 +52,38 @@ func (s *SchedulerDb) Store(ctx context.Context, instructions *DbOperationsWithM AccessMode: pgx.ReadWrite, DeferrableMode: pgx.Deferrable, }, func(tx pgx.Tx) error { - // First acquire the write lock lockCtx, cancel := context.WithTimeout(ctx, s.lockTimeout) defer cancel() - err := s.acquireLock(lockCtx, tx) - if err != nil { + // The lock is released automatically on transaction rollback/commit. + if err := s.acquireLock(lockCtx, tx); err != nil { return err } - // Now insert the ops for _, dbOp := range instructions.Ops { - err := s.WriteDbOp(ctx, tx, dbOp) - if err != nil { + if err := s.WriteDbOp(ctx, tx, dbOp); err != nil { return err } } - return err + return nil }) return true, err }, s.initialBackOff, s.maxBackOff) } -// acquireLock acquires the armada_scheduleringester_lock, which prevents two ingesters writing to the db at the same -// time. This is necessary because: -// - when rows are inserted into the database they are stamped with a sequence number -// - the scheduler relies on this sequence number increasing to ensure it has fetched all updated rows -// - concurrent transactions will result in sequence numbers being interleaved across transactions. -// - the interleaved sequences may result in the scheduler seeing sequence numbers that do not strictly increase over time. +// acquireLock acquires a postgres advisory lock, thus preventing concurrent writes. +// This is necessary to ensure sequence numbers assigned to each inserted row are monotonically increasing. +// Such a sequence number is assigned to each inserted row by a postgres function. +// +// Hence, if rows are inserted across multiple transactions concurrently, +// sequence numbers may be interleaved between transactions and the slower transaction may insert +// rows with sequence numbers smaller than those already written. +// +// The scheduler relies on these sequence numbers to only fetch new or updated rows in each update cycle. func (s *SchedulerDb) acquireLock(ctx context.Context, tx pgx.Tx) error { const lockId = 8741339439634283896 - _, err := tx.Exec(ctx, "SELECT pg_advisory_xact_lock($1)", lockId) - return errors.Wrapf(err, "Could not obtain lock") + if _, err := tx.Exec(ctx, "SELECT pg_advisory_xact_lock($1)", lockId); err != nil { + return errors.Wrapf(err, "could not obtain lock") + } + return nil } func (s *SchedulerDb) WriteDbOp(ctx context.Context, tx pgx.Tx, op DbOperation) error { diff --git a/magefiles/main.go b/magefiles/main.go index 498594ca333..5a074eebd49 100644 --- a/magefiles/main.go +++ b/magefiles/main.go @@ -167,11 +167,14 @@ func LocalDevStop() { // Build the lookout UI from internal/lookout/ui func UI() error { + timeTaken := time.Now() mg.Deps(yarnCheck) mg.Deps(yarnInstall) mg.Deps(yarnOpenAPI) mg.Deps(yarnBuild) + + fmt.Println("Time to build UI:", time.Since(timeTaken)) return nil } diff --git a/makefile b/makefile index 28d16d7e25b..81c86bbe1ef 100644 --- a/makefile +++ b/makefile @@ -402,14 +402,7 @@ tests: gotestsum docker run -d --name=postgres $(DOCKER_NET) -p 5432:5432 -e POSTGRES_PASSWORD=psw postgres:14.2 sleep 3 function tearDown { docker rm -f redis postgres; }; trap tearDown EXIT - $(GOTESTSUM) -- $(shell go list ./internal/... 
| grep -v 'jobservice/repository') \ - -coverprofile internal_coverage.xml -v 2>&1 | tee test_reports/internal.txt - env JSDBTYPE=sqlite $(GOTESTSUM) -- -v \ - ./internal/jobservice/repository/... 2>&1 | tee -a test_reports/internal.txt - env JSDBTYPE=postgres $(GOTESTSUM) -- -v \ - ./internal/jobservice/repository/... 2>&1 | tee -a test_reports/internal.txt - $(GOTESTSUM) -- -coverprofile pkg_coverage.xml -v ./pkg... 2>&1 | tee test_reports/pkg.txt - $(GOTESTSUM) -- -coverprofile cmd_coverage.xml -v ./cmd... 2>&1 | tee test_reports/cmd.txt + $(GOTESTSUM) --format short-verbose --junitfile test-reports/unit-tests.xml --jsonfile test-reports/unit-tests.json -- -coverprofile=test-reports/coverage.out -covermode=atomic ./cmd/... ./pkg/... $(go list ./internal/... | grep -v 'jobservice/repository') .ONESHELL: lint-fix: diff --git a/pkg/api/util.go b/pkg/api/util.go index 45e047bfe36..96fc848519e 100644 --- a/pkg/api/util.go +++ b/pkg/api/util.go @@ -102,11 +102,22 @@ func JobRunStateFromApiJobState(s JobState) schedulerobjects.JobRunState { return schedulerobjects.JobRunState_UNKNOWN } -func NewNodeTypeFromNodeInfo(nodeInfo *NodeInfo, indexedTaints map[string]interface{}, indexedLabels map[string]interface{}) *schedulerobjects.NodeType { - return schedulerobjects.NewNodeType(nodeInfo.GetTaints(), nodeInfo.GetLabels(), indexedTaints, indexedLabels) +func (job *Job) GetPerQueuePriority() uint32 { + priority := job.Priority + if priority < 0 { + return 0 + } + if priority > math.MaxUint32 { + return math.MaxUint32 + } + return uint32(math.Round(priority)) } -func (job *Job) GetRequirements(priorityClasses map[string]configuration.PriorityClass) *schedulerobjects.JobSchedulingInfo { +func (job *Job) GetSubmitTime() time.Time { + return job.Created +} + +func (job *Job) GetPodRequirements(priorityClasses map[string]configuration.PriorityClass) *schedulerobjects.PodRequirements { podSpec := job.GetMainPodSpec() priority, ok := PriorityFromPodSpec(podSpec, priorityClasses) @@ -121,7 +132,8 @@ func (job *Job) GetRequirements(priorityClasses map[string]configuration.Priorit if podSpec.PreemptionPolicy != nil { preemptionPolicy = string(*podSpec.PreemptionPolicy) } - podRequirements := &schedulerobjects.PodRequirements{ + + return &schedulerobjects.PodRequirements{ NodeSelector: podSpec.NodeSelector, Affinity: podSpec.Affinity, Tolerations: podSpec.Tolerations, @@ -130,18 +142,6 @@ func (job *Job) GetRequirements(priorityClasses map[string]configuration.Priorit PreemptionPolicy: preemptionPolicy, ResourceRequirements: job.GetResourceRequirements(), } - return &schedulerobjects.JobSchedulingInfo{ - PriorityClassName: podSpec.PriorityClassName, - Priority: LogSubmitPriorityFromApiPriority(job.GetPriority()), - SubmitTime: job.GetCreated(), - ObjectRequirements: []*schedulerobjects.ObjectRequirements{ - { - Requirements: &schedulerobjects.ObjectRequirements_PodRequirements{ - PodRequirements: podRequirements, - }, - }, - }, - } } // SchedulingResourceRequirementsFromPodSpec returns resource requests and limits necessary for scheduling a pod. @@ -249,19 +249,6 @@ func (job *Job) GetJobSet() string { return job.JobSetId } -// LogSubmitPriorityFromApiPriority returns the uint32 representation of the priority included with a submitted job, -// or an error if the conversion fails. 
-func LogSubmitPriorityFromApiPriority(priority float64) uint32 { - if priority < 0 { - priority = 0 - } - if priority > math.MaxUint32 { - priority = math.MaxUint32 - } - priority = math.Round(priority) - return uint32(priority) -} - func (job *Job) GetMainPodSpec() *v1.PodSpec { if job.PodSpec != nil { return job.PodSpec diff --git a/pkg/api/util_test.go b/pkg/api/util_test.go index 13147463501..6b49ee91c20 100644 --- a/pkg/api/util_test.go +++ b/pkg/api/util_test.go @@ -264,10 +264,10 @@ func QuantityWithMilliValue(v int64) resource.Quantity { return q } -func TestJobGetRequirements(t *testing.T) { +func TestJobGetPodRequirements(t *testing.T) { tests := map[string]struct { job *Job - expected *schedulerobjects.JobSchedulingInfo + expected *schedulerobjects.PodRequirements }{ "queue priority": { job: &Job{ @@ -277,20 +277,11 @@ func TestJobGetRequirements(t *testing.T) { // PriorityClassName: , }, }, - expected: &schedulerobjects.JobSchedulingInfo{ - Priority: 10, - ObjectRequirements: []*schedulerobjects.ObjectRequirements{ - { - &schedulerobjects.ObjectRequirements_PodRequirements{ - PodRequirements: &schedulerobjects.PodRequirements{ - PreemptionPolicy: string(v1.PreemptLowerPriority), - ResourceRequirements: v1.ResourceRequirements{ - Requests: make(v1.ResourceList), - Limits: make(v1.ResourceList), - }, - }, - }, - }, + expected: &schedulerobjects.PodRequirements{ + PreemptionPolicy: string(v1.PreemptLowerPriority), + ResourceRequirements: v1.ResourceRequirements{ + Requests: make(v1.ResourceList), + Limits: make(v1.ResourceList), }, }, }, @@ -300,21 +291,12 @@ func TestJobGetRequirements(t *testing.T) { PriorityClassName: PriorityClass1, }, }, - expected: &schedulerobjects.JobSchedulingInfo{ - PriorityClassName: PriorityClass1, - ObjectRequirements: []*schedulerobjects.ObjectRequirements{ - { - &schedulerobjects.ObjectRequirements_PodRequirements{ - PodRequirements: &schedulerobjects.PodRequirements{ - Priority: 1, - PreemptionPolicy: string(v1.PreemptLowerPriority), - ResourceRequirements: v1.ResourceRequirements{ - Requests: make(v1.ResourceList), - Limits: make(v1.ResourceList), - }, - }, - }, - }, + expected: &schedulerobjects.PodRequirements{ + Priority: 1, + PreemptionPolicy: string(v1.PreemptLowerPriority), + ResourceRequirements: v1.ResourceRequirements{ + Requests: make(v1.ResourceList), + Limits: make(v1.ResourceList), }, }, }, @@ -324,19 +306,11 @@ func TestJobGetRequirements(t *testing.T) { PreemptionPolicy: pointerFromValue(v1.PreemptNever), }, }, - expected: &schedulerobjects.JobSchedulingInfo{ - ObjectRequirements: []*schedulerobjects.ObjectRequirements{ - { - &schedulerobjects.ObjectRequirements_PodRequirements{ - PodRequirements: &schedulerobjects.PodRequirements{ - PreemptionPolicy: string(v1.PreemptNever), - ResourceRequirements: v1.ResourceRequirements{ - Requests: make(v1.ResourceList), - Limits: make(v1.ResourceList), - }, - }, - }, - }, + expected: &schedulerobjects.PodRequirements{ + PreemptionPolicy: string(v1.PreemptNever), + ResourceRequirements: v1.ResourceRequirements{ + Requests: make(v1.ResourceList), + Limits: make(v1.ResourceList), }, }, }, @@ -366,41 +340,33 @@ func TestJobGetRequirements(t *testing.T) { }, }, }, - expected: &schedulerobjects.JobSchedulingInfo{ - ObjectRequirements: []*schedulerobjects.ObjectRequirements{ - { - &schedulerobjects.ObjectRequirements_PodRequirements{ - PodRequirements: &schedulerobjects.PodRequirements{ - NodeSelector: map[string]string{"label": "value"}, - Affinity: &v1.Affinity{ - NodeAffinity: &v1.NodeAffinity{ - 
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ - NodeSelectorTerms: []v1.NodeSelectorTerm{ - { - MatchExpressions: []v1.NodeSelectorRequirement{ - { - Key: "affinityKey", - }, - }, - }, - }, + expected: &schedulerobjects.PodRequirements{ + NodeSelector: map[string]string{"label": "value"}, + Affinity: &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: "affinityKey", }, }, }, - Tolerations: []v1.Toleration{ - { - Key: "tolerationKey", - }, - }, - PreemptionPolicy: string(v1.PreemptLowerPriority), - ResourceRequirements: v1.ResourceRequirements{ - Requests: make(v1.ResourceList), - Limits: make(v1.ResourceList), - }, }, }, }, }, + Tolerations: []v1.Toleration{ + { + Key: "tolerationKey", + }, + }, + PreemptionPolicy: string(v1.PreemptLowerPriority), + ResourceRequirements: v1.ResourceRequirements{ + Requests: make(v1.ResourceList), + Limits: make(v1.ResourceList), + }, }, }, "annotations": { @@ -408,20 +374,12 @@ func TestJobGetRequirements(t *testing.T) { Annotations: map[string]string{"key": "value"}, PodSpec: &v1.PodSpec{}, }, - expected: &schedulerobjects.JobSchedulingInfo{ - ObjectRequirements: []*schedulerobjects.ObjectRequirements{ - { - &schedulerobjects.ObjectRequirements_PodRequirements{ - PodRequirements: &schedulerobjects.PodRequirements{ - Annotations: map[string]string{"key": "value"}, - PreemptionPolicy: string(v1.PreemptLowerPriority), - ResourceRequirements: v1.ResourceRequirements{ - Requests: make(v1.ResourceList), - Limits: make(v1.ResourceList), - }, - }, - }, - }, + expected: &schedulerobjects.PodRequirements{ + Annotations: map[string]string{"key": "value"}, + PreemptionPolicy: string(v1.PreemptLowerPriority), + ResourceRequirements: v1.ResourceRequirements{ + Requests: make(v1.ResourceList), + Limits: make(v1.ResourceList), }, }, }, @@ -438,19 +396,11 @@ func TestJobGetRequirements(t *testing.T) { }, }, }, - expected: &schedulerobjects.JobSchedulingInfo{ - ObjectRequirements: []*schedulerobjects.ObjectRequirements{ - { - &schedulerobjects.ObjectRequirements_PodRequirements{ - PodRequirements: &schedulerobjects.PodRequirements{ - PreemptionPolicy: string(v1.PreemptLowerPriority), - ResourceRequirements: v1.ResourceRequirements{ - Requests: v1.ResourceList{"foo": QuantityWithMilliValue(1000)}, - Limits: v1.ResourceList{"bar": QuantityWithMilliValue(2000)}, - }, - }, - }, - }, + expected: &schedulerobjects.PodRequirements{ + PreemptionPolicy: string(v1.PreemptLowerPriority), + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{"foo": QuantityWithMilliValue(1000)}, + Limits: v1.ResourceList{"bar": QuantityWithMilliValue(2000)}, }, }, }, @@ -461,19 +411,11 @@ func TestJobGetRequirements(t *testing.T) { Requests: v1.ResourceList{"foo": resource.MustParse("1")}, }, }, - expected: &schedulerobjects.JobSchedulingInfo{ - ObjectRequirements: []*schedulerobjects.ObjectRequirements{ - { - &schedulerobjects.ObjectRequirements_PodRequirements{ - PodRequirements: &schedulerobjects.PodRequirements{ - PreemptionPolicy: string(v1.PreemptLowerPriority), - ResourceRequirements: v1.ResourceRequirements{ - Requests: v1.ResourceList{"foo": resource.MustParse("1")}, - Limits: nil, - }, - }, - }, - }, + expected: &schedulerobjects.PodRequirements{ + PreemptionPolicy: string(v1.PreemptLowerPriority), + ResourceRequirements: v1.ResourceRequirements{ + Requests: 
v1.ResourceList{"foo": resource.MustParse("1")}, + Limits: nil, }, }, }, @@ -484,26 +426,18 @@ func TestJobGetRequirements(t *testing.T) { Limits: v1.ResourceList{"foo": resource.MustParse("1")}, }, }, - expected: &schedulerobjects.JobSchedulingInfo{ - ObjectRequirements: []*schedulerobjects.ObjectRequirements{ - { - &schedulerobjects.ObjectRequirements_PodRequirements{ - PodRequirements: &schedulerobjects.PodRequirements{ - PreemptionPolicy: string(v1.PreemptLowerPriority), - ResourceRequirements: v1.ResourceRequirements{ - Requests: nil, - Limits: v1.ResourceList{"foo": resource.MustParse("1")}, - }, - }, - }, - }, + expected: &schedulerobjects.PodRequirements{ + PreemptionPolicy: string(v1.PreemptLowerPriority), + ResourceRequirements: v1.ResourceRequirements{ + Requests: nil, + Limits: v1.ResourceList{"foo": resource.MustParse("1")}, }, }, }, } for name, tc := range tests { t.Run(name, func(t *testing.T) { - assert.Equal(t, tc.expected, tc.job.GetRequirements(TestPriorityClasses)) + assert.Equal(t, tc.expected, tc.job.GetPodRequirements(TestPriorityClasses)) }) } } diff --git a/third_party/airflow/pyproject.toml b/third_party/airflow/pyproject.toml index e9eab1dd64b..4df39bb2167 100644 --- a/third_party/airflow/pyproject.toml +++ b/third_party/airflow/pyproject.toml @@ -21,7 +21,8 @@ readme = "README.md" [project.optional-dependencies] format = ["black==23.3.0", "flake8==6.0.0", "pylint==2.17.4"] test = ["pytest==7.3.1", "coverage>=6.5.0", "pytest-asyncio==0.21.0"] -docs = ["sphinx", "sphinx-jekyll-builder"] +# note(JayF): sphinx-jekyll-builder was broken by sphinx-markdown-builder 0.6 -- so pin to 0.5.5 +docs = ["sphinx==7.0.1", "sphinx-jekyll-builder==0.3.0", "sphinx-toolbox==3.2.0b1", "sphinx-markdown-builder==0.5.5"] [build-system] requires = ["setuptools"] From 2ac8d91fbc07d1e9ea325e2d7ecb2e3bb1e310a8 Mon Sep 17 00:00:00 2001 From: Dejan Pejchev Date: Wed, 5 Jul 2023 17:17:06 +0200 Subject: [PATCH 07/14] rename main to master branch in release rc job --- .github/workflows/release-rc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-rc.yml b/.github/workflows/release-rc.yml index 8e57272fb69..9179749e952 100644 --- a/.github/workflows/release-rc.yml +++ b/.github/workflows/release-rc.yml @@ -27,7 +27,7 @@ jobs: ref='${{ github.event.workflow_run.head_branch }}' sha='${{ github.event.workflow_run.head_sha }}' case $ref in - main) + master) [ $(git branch --contains=$sha main | wc -l) -eq 1 ] && [ $(git rev-list --count $sha..main) -le 2 ] ;; From 5d398f9b731fd161f0911c92a25443d9f423045e Mon Sep 17 00:00:00 2001 From: Dejan Zele Pejchev Date: Wed, 5 Jul 2023 17:20:44 +0200 Subject: [PATCH 08/14] Feat/release ci fix (#45) --- .github/workflows/release-rc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-rc.yml b/.github/workflows/release-rc.yml index 8e57272fb69..9179749e952 100644 --- a/.github/workflows/release-rc.yml +++ b/.github/workflows/release-rc.yml @@ -27,7 +27,7 @@ jobs: ref='${{ github.event.workflow_run.head_branch }}' sha='${{ github.event.workflow_run.head_sha }}' case $ref in - main) + master) [ $(git branch --contains=$sha main | wc -l) -eq 1 ] && [ $(git rev-list --count $sha..main) -le 2 ] ;; From 7183abef7f07f81c95eb1d6cbe77c316ffe12f9a Mon Sep 17 00:00:00 2001 From: Dejan Pejchev Date: Wed, 5 Jul 2023 17:56:48 +0200 Subject: [PATCH 09/14] change release jobs to target master --- .github/workflows/release-rc.yml | 4 ++-- .github/workflows/release.yml | 2 +- 2 files changed, 
3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release-rc.yml b/.github/workflows/release-rc.yml index 9179749e952..0c915281e51 100644 --- a/.github/workflows/release-rc.yml +++ b/.github/workflows/release-rc.yml @@ -28,8 +28,8 @@ jobs: sha='${{ github.event.workflow_run.head_sha }}' case $ref in master) - [ $(git branch --contains=$sha main | wc -l) -eq 1 ] && - [ $(git rev-list --count $sha..main) -le 2 ] + [ $(git branch --contains=$sha master | wc -l) -eq 1 ] && + [ $(git rev-list --count $sha..master) -le 2 ] ;; *) false diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a00fe10def2..0e783211ee1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -35,7 +35,7 @@ jobs: exit 1 fi [ $(git rev-parse refs/tags/$ref) == $sha ] && - [ $(git branch --contains=$sha main | wc -l) -eq 1 ] + [ $(git branch --contains=$sha master | wc -l) -eq 1 ] ;; *) false From b6becaa1465342a61f7ad57250b626d4f049b4d7 Mon Sep 17 00:00:00 2001 From: Dejan Pejchev Date: Wed, 5 Jul 2023 20:25:46 +0200 Subject: [PATCH 10/14] final polish for release workflow --- .github/workflows/ci.yml | 2 ++ .github/workflows/release-rc.yml | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cb3a5723961..576405791ec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,6 +2,8 @@ name: CI on: push: + tags: + - v* branches-ignore: - gh-pages pull_request: diff --git a/.github/workflows/release-rc.yml b/.github/workflows/release-rc.yml index 0c915281e51..bf87ad1b66d 100644 --- a/.github/workflows/release-rc.yml +++ b/.github/workflows/release-rc.yml @@ -12,7 +12,7 @@ permissions: jobs: validate: - if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' + if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' name: "Validate revision" runs-on: ubuntu-22.04 @@ -40,7 +40,7 @@ jobs: exit 1 fi release: - if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' + if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' name: Release needs: validate runs-on: "ubuntu-22.04" From 284d27a2a77bbf124392070a77e266f2d84b86fe Mon Sep 17 00:00:00 2001 From: Dejan Pejchev Date: Wed, 5 Jul 2023 20:29:56 +0200 Subject: [PATCH 11/14] remove redundat validate workflow --- .github/workflows/release-validate.yml | 56 -------------------------- 1 file changed, 56 deletions(-) delete mode 100644 .github/workflows/release-validate.yml diff --git a/.github/workflows/release-validate.yml b/.github/workflows/release-validate.yml deleted file mode 100644 index be90f63d1ee..00000000000 --- a/.github/workflows/release-validate.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: Validate Release - -on: - push: - tags: - - v* - -permissions: - contents: write - -jobs: - compare_tags: - if: github.repository_owner == 'armadaproject' - name: "Compare tags" - runs-on: ubuntu-22.04 - - steps: - - name: "Checkout" - uses: "actions/checkout@v3" - with: - fetch-depth: 0 - - - name: Compare tags - env: - ALLOWED_BRANCH: "master" - run: | - ref=${{ github.ref }} - tag=${ref#refs/tags/} - echo "Current tag: $tag" - sha=${{ github.sha }} - echo "Current sha: $sha" - result=0 - case $tag in - v?*) - latest_tag_commit=$(git rev-parse refs/tags/$tag^{}) - git 
branch --contains=$sha $ALLOWED_BRANCH >> /dev/null - branch_contains_commit=$? - - if [[ $branch_contains_commit -eq 0 && "$latest_tag_commit" == "$sha" ]]; then - result=0 - else - result=1 - fi - ;; - *) - echo "Invalid tag $tag" - result=1 - ;; - esac - if [ $result -ne 0 ]; then - echo "Latest tag ($tag) does not match the current commit ($sha)." - echo "::error ::Invalid ref $ref $sha" - exit 1 - else - echo "Latest tag ($tag) matches the current commit ($sha)." - fi From bd57eb54cb8565b76305e1649ce09fd0bd67c874 Mon Sep 17 00:00:00 2001 From: Dejan Pejchev Date: Thu, 6 Jul 2023 15:02:40 +0200 Subject: [PATCH 12/14] fix minor issues in release workflows --- .github/workflows/release-rc.yml | 18 ++++++++---------- .github/workflows/release.yml | 31 ++++++++++++++----------------- 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/.github/workflows/release-rc.yml b/.github/workflows/release-rc.yml index bf87ad1b66d..c4dcc8053bd 100644 --- a/.github/workflows/release-rc.yml +++ b/.github/workflows/release-rc.yml @@ -22,25 +22,23 @@ jobs: with: fetch-depth: 0 + # The given ref should belong to the master branch. + # If it's master, it shouldn't be more than 2 commits away (in case another push happened in the meantime). + # If it starts with 'v', it should be a tag and belong to the master branch. + # Anything else is invalid. - name: Validate ref run: | ref='${{ github.event.workflow_run.head_branch }}' sha='${{ github.event.workflow_run.head_sha }}' - case $ref in - master) - [ $(git branch --contains=$sha master | wc -l) -eq 1 ] && - [ $(git rev-list --count $sha..master) -le 2 ] - ;; - *) - false - ;; - esac + + [ "$ref" == "master" ] && + [ $(git branch --contains=$sha master | wc -l) -eq 1 ] && + [ $(git rev-list --count $sha..master) -le 2 ] if [ $? -ne 0 ]; then echo "::error ::Invalid ref $ref $sha" exit 1 fi release: - if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' name: Release needs: validate runs-on: "ubuntu-22.04" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0e783211ee1..98b29b6d7c0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -22,32 +22,29 @@ jobs: with: fetch-depth: 0 + # The given ref should belong to the master branch. + # If it's master, it shouldn't be more than 2 commits away (in case another push happened in the meantime). + # If it starts with 'v', it should be a tag and belong to the master branch. + # Anything else is invalid. - name: Validate ref run: | ref='${{ github.event.workflow_run.head_branch }}' sha='${{ github.event.workflow_run.head_sha }}' - case $ref in - v?*) - semver_pattern="^v[0-9]+\.[0-9]+\.[0-9]+$" - # Check if the tag/branch name matches the semver pattern - if [[ ! $ref =~ $semver_pattern ]]; then - echo "::error ::Invalid ref $ref. It must be in semver format vX.Y.Z!" - exit 1 - fi - [ $(git rev-parse refs/tags/$ref) == $sha ] && - [ $(git branch --contains=$sha master | wc -l) -eq 1 ] - ;; - *) - false - ;; - esac + + semver_pattern="^v[0-9]+\.[0-9]+\.[0-9]+$" + # Check if the tag/branch name matches the semver pattern + if [[ ! $ref =~ $semver_pattern ]]; then + echo "::error ::Invalid ref $ref $sha: must be in semver format vX.Y.Z!" + exit 1 + fi + + [ $(git rev-parse refs/tags/$ref) == $sha ] && + [ $(git branch --contains=$sha master | wc -l) -eq 1 ] if [ $? 
-ne 0 ]; then echo "::error ::Invalid ref $ref $sha" exit 1 fi - release: - if: github.event.workflow_run.event == 'push' && github.event.workflow_run.conclusion == 'success' && github.repository_owner == 'armadaproject' name: "Release" needs: validate runs-on: ubuntu-22.04 From 0d0901f153e380ddeec1e12bd2311d47bfa12386 Mon Sep 17 00:00:00 2001 From: Dejan Pejchev Date: Thu, 6 Jul 2023 15:10:39 +0200 Subject: [PATCH 13/14] fix error comments in release workflow --- .github/workflows/release-rc.yml | 2 +- .github/workflows/release.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release-rc.yml b/.github/workflows/release-rc.yml index c4dcc8053bd..4b943ba8ef5 100644 --- a/.github/workflows/release-rc.yml +++ b/.github/workflows/release-rc.yml @@ -35,7 +35,7 @@ jobs: [ $(git branch --contains=$sha master | wc -l) -eq 1 ] && [ $(git rev-list --count $sha..master) -le 2 ] if [ $? -ne 0 ]; then - echo "::error ::Invalid ref $ref $sha" + echo "::error ::Invalid ref $ref $sha: must be a merge to master branch and not more than 2 commits away" exit 1 fi release: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 98b29b6d7c0..063adba0700 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -34,14 +34,14 @@ jobs: semver_pattern="^v[0-9]+\.[0-9]+\.[0-9]+$" # Check if the tag/branch name matches the semver pattern if [[ ! $ref =~ $semver_pattern ]]; then - echo "::error ::Invalid ref $ref $sha: must be in semver format vX.Y.Z!" + echo "::error ::Invalid ref $ref $sha: must be in semver format vX.Y.Z" exit 1 fi [ $(git rev-parse refs/tags/$ref) == $sha ] && [ $(git branch --contains=$sha master | wc -l) -eq 1 ] if [ $? -ne 0 ]; then - echo "::error ::Invalid ref $ref $sha" + echo "::error ::Invalid ref $ref $sha: must be a tag and belong to the master branch" exit 1 fi release: From 9699756bc7e1d241710a35a1a48b8d7521bfe539 Mon Sep 17 00:00:00 2001 From: Dejan Pejchev Date: Thu, 6 Jul 2023 16:51:36 +0200 Subject: [PATCH 14/14] update release workflow and polish it --- .github/workflows/release-rc.yml | 1 - .github/workflows/release.yml | 13 +++---------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/.github/workflows/release-rc.yml b/.github/workflows/release-rc.yml index 4b943ba8ef5..2aca8fb1f67 100644 --- a/.github/workflows/release-rc.yml +++ b/.github/workflows/release-rc.yml @@ -24,7 +24,6 @@ jobs: # The given ref should belong to the master branch. # If it's master, it shouldn't be more than 2 commits away (in case another push happened in the meantime). - # If it starts with 'v', it should be a tag and belong to the master branch. # Anything else is invalid. - name: Validate ref run: | diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 063adba0700..798ecc0e448 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -23,25 +23,18 @@ jobs: fetch-depth: 0 # The given ref should belong to the master branch. - # If it's master, it shouldn't be more than 2 commits away (in case another push happened in the meantime). - # If it starts with 'v', it should be a tag and belong to the master branch. + # If it starts with 'v', it should be a tag, belong to the master branch and match the semver regex. # Anything else is invalid. 
- name: Validate ref run: | ref='${{ github.event.workflow_run.head_branch }}' sha='${{ github.event.workflow_run.head_sha }}' - semver_pattern="^v[0-9]+\.[0-9]+\.[0-9]+$" - # Check if the tag/branch name matches the semver pattern - if [[ ! $ref =~ $semver_pattern ]]; then - echo "::error ::Invalid ref $ref $sha: must be in semver format vX.Y.Z" - exit 1 - fi - + [[ $ref =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]] && [ $(git rev-parse refs/tags/$ref) == $sha ] && [ $(git branch --contains=$sha master | wc -l) -eq 1 ] if [ $? -ne 0 ]; then - echo "::error ::Invalid ref $ref $sha: must be a tag and belong to the master branch" + echo "::error ::Invalid ref $ref $sha: must be a tag, belong to the master branch and match the semver regex" exit 1 fi release:
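
The "Validate ref" step that the series converges on performs three checks: the workflow_run head branch must be a semver tag (vX.Y.Z), the tag must resolve to the run's head commit, and that commit must be reachable from master. The sketch below mirrors those checks as a standalone script so the logic can be exercised against a local clone; the argument handling and error messages are illustrative assumptions, not part of the workflows above.

#!/usr/bin/env bash
# Sketch of the release.yml "Validate ref" step for local testing.
# $1 and $2 stand in for github.event.workflow_run.head_branch / head_sha.
set -euo pipefail

ref="${1:?usage: $0 <tag> <sha>}"
sha="${2:?usage: $0 <tag> <sha>}"

# The ref must look like a plain semver release tag, e.g. v1.2.3.
if [[ ! $ref =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
  echo "invalid ref $ref: not in vX.Y.Z form" >&2
  exit 1
fi

# The tag must resolve to the commit the CI run reported.
# (Annotated tags would need refs/tags/$ref^{} to dereference to the commit,
# as the removed release-validate.yml did with $tag^{}.)
if [ "$(git rev-parse "refs/tags/$ref")" != "$sha" ]; then
  echo "invalid ref $ref: tag does not point at $sha" >&2
  exit 1
fi

# The commit must be reachable from master.
if [ "$(git branch --contains="$sha" master | wc -l)" -ne 1 ]; then
  echo "invalid ref $ref: $sha is not on master" >&2
  exit 1
fi

echo "ok: $ref / $sha would pass release validation"

Run it from a clone with full history (the workflows check out with fetch-depth: 0 for the same reason), passing the candidate tag and the head commit SHA reported by the CI run.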