Skip to content

Commit

Permalink
Merge branch 'master' into f/chrisma/remove-uuid-event-ingester
Browse files Browse the repository at this point in the history
  • Loading branch information
d80tb7 committed Aug 24, 2024
2 parents 88e1aa3 + 523771f commit 574ecbe
Show file tree
Hide file tree
Showing 35 changed files with 1,357 additions and 1,157 deletions.
17 changes: 1 addition & 16 deletions config/scheduler/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,28 +9,13 @@ queueRefreshPeriod: 10s
disableSubmitCheck: false
metrics:
port: 9000
jobStateMetricsResetInterval: 12h
refreshInterval: 30s
metrics:
scheduleCycleTimeHistogramSettings:
start: 10.0
factor: 1.1
count: 110
reconcileCycleTimeHistogramSettings:
start: 10.0
factor: 1.1
count: 110
schedulerMetrics:
trackedResourceNames:
- "cpu"
- "memory"
- "ephemeral-storage"
- "nvidia.com/gpu"
resourceRenaming:
nvidia.com/gpu: "gpu"
amd.com/gpu: "gpu"
ephemeral-storage: "ephemeralStorage"
matchedRegexIndexByErrorMessageCacheSize: 100
resetInterval: "1h"
pulsar:
URL: "pulsar://pulsar:6650"
jobsetEventsTopic: "events"
Expand Down
42 changes: 39 additions & 3 deletions docs/python_airflow_operator.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ This class provides integration with Airflow and Armada
## armada.operators.armada module


### _class_ armada.operators.armada.ArmadaOperator(name, channel_args, armada_queue, job_request, job_set_prefix='', lookout_url_template=None, poll_interval=30, container_logs=None, k8s_token_retriever=None, deferrable=False, job_acknowledgement_timeout=300, \*\*kwargs)
### _class_ armada.operators.armada.ArmadaOperator(name, channel_args, armada_queue, job_request, job_set_prefix='', lookout_url_template=None, poll_interval=30, container_logs=None, k8s_token_retriever=None, deferrable=False, job_acknowledgement_timeout=300, dry_run=False, \*\*kwargs)
Bases: `BaseOperator`, `LoggingMixin`

An Airflow operator that manages Job submission to Armada.
Expand All @@ -33,7 +33,7 @@ and handles job cancellation if the Airflow task is killed.
* **armada_queue** (*str*) –


* **job_request** (*JobSubmitRequestItem*) –
* **job_request** (*JobSubmitRequestItem** | **Callable**[**[**Context**, **jinja2.Environment**]**, **JobSubmitRequestItem**]*) –


* **job_set_prefix** (*Optional**[**str**]*) –
Expand All @@ -57,6 +57,9 @@ and handles job cancellation if the Airflow task is killed.
* **job_acknowledgement_timeout** (*int*) –


* **dry_run** (*bool*) –



#### execute(context)
Submits the job to Armada and polls for completion.
Expand Down Expand Up @@ -91,6 +94,8 @@ operator needs to be cleaned up, or it will leave ghost processes behind.



#### operator_extra_links(_: Collection[BaseOperatorLink_ _ = (LookoutLink(),_ )

#### _property_ pod_manager(_: KubernetesPodLogManage_ )

#### render_template_fields(context, jinja_env=None)
Expand Down Expand Up @@ -136,7 +141,7 @@ Initializes a new ArmadaOperator.
* **armada_queue** (*str*) – The name of the Armada queue to which the job will be submitted.


* **job_request** (*JobSubmitRequestItem*) – The job to be submitted to Armada.
* **job_request** (*JobSubmitRequestItem** | **Callable**[**[**Context**, **jinja2.Environment**]**, **JobSubmitRequestItem**]*) – The job to be submitted to Armada.


* **job_set_prefix** (*Optional**[**str**]*) – A string to prepend to the jobSet name.
Expand All @@ -160,8 +165,39 @@ for asynchronous execution.
:param job_acknowledgement_timeout: The timeout in seconds to wait for a job to be
acknowledged by Armada.
:type job_acknowledgement_timeout: int
:param dry_run: Run Operator in dry-run mode - render Armada request and terminate.
:type dry_run: bool
:param kwargs: Additional keyword arguments to pass to the BaseOperator.


### _class_ armada.operators.armada.LookoutLink()
Bases: `BaseOperatorLink`


#### get_link(operator, \*, ti_key)
Link to external system.

Note: The old signature of this function was `(self, operator, dttm: datetime)`. That is still
supported at runtime but is deprecated.


* **Parameters**


* **operator** (*BaseOperator*) – The Airflow operator object this link is associated to.


* **ti_key** (*TaskInstanceKey*) – TaskInstance ID to return link for.



* **Returns**

link to external system



#### name(_ = 'Lookout_ )
## armada.triggers.armada module

## armada.auth module
Expand Down
7 changes: 3 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ require (
github.com/go-openapi/spec v0.20.14
github.com/gogo/protobuf v1.3.2
github.com/golang/protobuf v1.5.4
github.com/google/go-cmp v0.5.9 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/uuid v1.6.0
github.com/grpc-ecosystem/go-grpc-middleware v1.4.0
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0
Expand All @@ -35,7 +35,7 @@ require (
github.com/oklog/ulid v1.3.1
github.com/patrickmn/go-cache v2.1.0+incompatible
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.17.0
github.com/prometheus/client_golang v1.19.1
github.com/renstrom/shortuuid v3.0.0+incompatible
github.com/sirupsen/logrus v1.9.3
github.com/spf13/cobra v1.8.0
Expand Down Expand Up @@ -78,7 +78,7 @@ require (
github.com/magefile/mage v1.14.0
github.com/minio/highwayhash v1.0.2
github.com/openconfig/goyang v1.2.0
github.com/prometheus/common v0.45.0
github.com/prometheus/common v0.48.0
github.com/redis/go-redis/extra/redisprometheus/v9 v9.0.5
github.com/redis/go-redis/v9 v9.5.1
github.com/segmentio/fasthash v1.0.3
Expand Down Expand Up @@ -161,7 +161,6 @@ require (
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-isatty v0.0.18 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
github.com/microcosm-cc/bluemonday v1.0.25 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
Expand Down
13 changes: 6 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,9 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g=
github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
Expand Down Expand Up @@ -372,8 +373,6 @@ github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZ
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mattn/go-zglob v0.0.4 h1:LQi2iOm0/fGgu80AioIJ/1j9w9Oh+9DZ39J4VAGzHQM=
github.com/mattn/go-zglob v0.0.4/go.mod h1:MxxjyoXXnMxfIpxTK2GAkw1w8glPsQILx3N5wrKakiY=
github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg=
github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k=
github.com/microcosm-cc/bluemonday v1.0.25 h1:4NEwSfiJ+Wva0VxN5B8OwMicaJvD8r9tlJWm9rtloEg=
github.com/microcosm-cc/bluemonday v1.0.25/go.mod h1:ZIOjCQp1OrzBBPIJmfX4qDYFuhU02nx4bn030ixfHLE=
github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g=
Expand Down Expand Up @@ -437,13 +436,13 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pquerna/cachecontrol v0.1.0 h1:yJMy84ti9h/+OEWa752kBTKv4XC30OtVVHYv/8cTqKc=
github.com/pquerna/cachecontrol v0.1.0/go.mod h1:NrUG3Z7Rdu85UNR3vm7SOsl1nFIeSiQnrHV5K9mBcUI=
github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q=
github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY=
github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE=
github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw=
github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI=
github.com/prometheus/common v0.45.0 h1:2BGz0eBc2hdMDLnO/8n0jeB3oPrt2D08CekT0lneoxM=
github.com/prometheus/common v0.45.0/go.mod h1:YJmSTw9BoKxJplESWWxlbyttQR4uaEcGyv9MZjVOJsY=
github.com/prometheus/common v0.48.0 h1:QO8U2CdOzSn1BBsmXJXduaaW+dY/5QLjfB8svtSzKKE=
github.com/prometheus/common v0.48.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5EC6ILDTlAPc=
github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo=
github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo=
github.com/redis/go-redis/extra/redisprometheus/v9 v9.0.5 h1:kvl0LOTQD23VR1R7A9vDti9msfV6mOE2+j6ngYkFsfg=
Expand Down
47 changes: 11 additions & 36 deletions internal/scheduler/configuration/configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,7 @@ type Configuration struct {
// Configuration controlling leader election
Leader LeaderConfig
// Configuration controlling metrics
Metrics LegacyMetricsConfig
// Configuration for new scheduler metrics.
// Due to replace metrics configured via the above entry.
SchedulerMetrics MetricsConfig
Metrics MetricsConfig
// Scheduler configuration (this is shared with the old scheduler)
Scheduling SchedulingConfig
Auth authconfig.AuthConfig
Expand Down Expand Up @@ -69,28 +66,6 @@ func (c Configuration) Validate() error {
return validate.Struct(c)
}

type MetricsConfig struct {
// If true, disable metric collection and publishing.
Disabled bool
// Regexes used for job error categorisation.
// Specifically, the subCategory label for job failure counters is the first regex that matches the job error.
// If no regex matches, the subCategory label is the empty string.
TrackedErrorRegexes []string
// Metrics are exported for these resources.
TrackedResourceNames []v1.ResourceName
// Optionally rename resources in exported metrics.
// E.g., if ResourceRenaming["nvidia.com/gpu"] = "gpu", then metrics for resource "nvidia.com/gpu" use resource name "gpu" instead.
// This can be used to avoid illegal Prometheus metric names (e.g., for "nvidia.com/gpu" as "/" is not allowed).
// Allowed characters in resource names are [a-zA-Z_:][a-zA-Z0-9_:]*
// It can also be used to track multiple resources within the same metric, e.g., "nvidia.com/gpu" and "amd.com/gpu".
ResourceRenaming map[v1.ResourceName]string
// The first matching regex of each error message is cached in an LRU cache.
// This setting controls the cache size.
MatchedRegexIndexByErrorMessageCacheSize uint64
// Reset metrics this often. Resetting periodically ensures inactive time series are garbage-collected.
ResetInterval time.Duration
}

type LeaderConfig struct {
// Valid modes are "standalone" or "kubernetes"
Mode string `validate:"required"`
Expand Down Expand Up @@ -128,16 +103,16 @@ type HttpConfig struct {
Port int `validate:"required"`
}

// TODO: ALl this needs to be unified with MetricsConfig
type LegacyMetricsConfig struct {
Port uint16
RefreshInterval time.Duration
Metrics SchedulerMetricsConfig
}

type SchedulerMetricsConfig struct {
ScheduleCycleTimeHistogramSettings HistogramConfig
ReconcileCycleTimeHistogramSettings HistogramConfig
type MetricsConfig struct {
Port uint16
RefreshInterval time.Duration
JobStateMetricsResetInterval time.Duration
// Regexes used for job error categorisation.
// Specifically, the subCategory label for job failure counters is the first regex that matches the job error.
// If no regex matches, the subCategory label is the empty string.
TrackedErrorRegexes []string
// Metrics are exported for these resources.
TrackedResourceNames []v1.ResourceName
}

type HistogramConfig struct {
Expand Down
14 changes: 14 additions & 0 deletions internal/scheduler/context/scheduling.go
Original file line number Diff line number Diff line change
Expand Up @@ -378,3 +378,17 @@ func (sctx *SchedulingContext) AllocatedByQueueAndPriority() map[string]schedule
}
return rv
}

// FairnessError returns the cumulative delta between adjusted fair share and actual share for all users who
// are below their fair share
func (sctx *SchedulingContext) FairnessError() float64 {
fairnessError := 0.0
for _, qctx := range sctx.QueueSchedulingContexts {
actualShare := sctx.FairnessCostProvider.UnweightedCostFromQueue(qctx)
delta := qctx.AdjustedFairShare - actualShare
if delta > 0 {
fairnessError += delta
}
}
return fairnessError
}
87 changes: 72 additions & 15 deletions internal/scheduler/context/scheduling_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package context

import (
"fmt"
"testing"

"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -58,21 +59,11 @@ func TestSchedulingContextAccounting(t *testing.T) {
}

func TestCalculateFairShares(t *testing.T) {
zeroCpu := schedulerobjects.ResourceList{
Resources: map[string]resource.Quantity{"cpu": resource.MustParse("0")},
}
oneCpu := schedulerobjects.ResourceList{
Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")},
}
fortyCpu := schedulerobjects.ResourceList{
Resources: map[string]resource.Quantity{"cpu": resource.MustParse("40")},
}
oneHundredCpu := schedulerobjects.ResourceList{
Resources: map[string]resource.Quantity{"cpu": resource.MustParse("100")},
}
oneThousandCpu := schedulerobjects.ResourceList{
Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1000")},
}
zeroCpu := cpu(0)
oneCpu := cpu(1)
fortyCpu := cpu(40)
oneHundredCpu := cpu(100)
oneThousandCpu := cpu(1000)
tests := map[string]struct {
availableResources schedulerobjects.ResourceList
queueCtxs map[string]*QueueSchedulingContext
Expand Down Expand Up @@ -208,6 +199,66 @@ func TestCalculateFairShares(t *testing.T) {
}
}

func TestCalculateFairnessError(t *testing.T) {
tests := map[string]struct {
availableResources schedulerobjects.ResourceList
queueCtxs map[string]*QueueSchedulingContext
expected float64
}{
"one queue, no error": {
availableResources: cpu(100),
queueCtxs: map[string]*QueueSchedulingContext{
"queueA": {Allocated: cpu(50), AdjustedFairShare: 0.5},
},
expected: 0,
},
"two queues, no error": {
availableResources: cpu(100),
queueCtxs: map[string]*QueueSchedulingContext{
"queueA": {Allocated: cpu(50), AdjustedFairShare: 0.5},
"queueB": {Allocated: cpu(50), AdjustedFairShare: 0.5},
},
expected: 0,
},
"one queue with error": {
availableResources: cpu(100),
queueCtxs: map[string]*QueueSchedulingContext{
"queueA": {Allocated: cpu(40), AdjustedFairShare: 0.5},
},
expected: 0.1,
},
"two queues with error": {
availableResources: cpu(100),
queueCtxs: map[string]*QueueSchedulingContext{
"queueA": {Allocated: cpu(40), AdjustedFairShare: 0.5},
"queueB": {Allocated: cpu(10), AdjustedFairShare: 0.5},
},
expected: 0.5,
},
"above fair share is not counted": {
availableResources: cpu(100),
queueCtxs: map[string]*QueueSchedulingContext{
"queueA": {Allocated: cpu(100), AdjustedFairShare: 0.5},
},
expected: 0.0,
},
"empty": {
availableResources: cpu(100),
queueCtxs: map[string]*QueueSchedulingContext{},
expected: 0.0,
},
}
for name, tc := range tests {
t.Run(name, func(t *testing.T) {
fairnessCostProvider, err := fairness.NewDominantResourceFairness(tc.availableResources, configuration.SchedulingConfig{DominantResourceFairnessResourcesToConsider: []string{"cpu"}})
require.NoError(t, err)
sctx := NewSchedulingContext("pool", fairnessCostProvider, nil, tc.availableResources)
sctx.QueueSchedulingContexts = tc.queueCtxs
assert.InDelta(t, tc.expected, sctx.FairnessError(), 0.00001)
})
}
}

func testNSmallCpuJobSchedulingContext(queue, priorityClassName string, n int) []*JobSchedulingContext {
rv := make([]*JobSchedulingContext, n)
for i := 0; i < n; i++ {
Expand All @@ -226,3 +277,9 @@ func testSmallCpuJobSchedulingContext(queue, priorityClassName string) *JobSched
GangInfo: EmptyGangInfo(job),
}
}

func cpu(n int) schedulerobjects.ResourceList {
return schedulerobjects.ResourceList{
Resources: map[string]resource.Quantity{"cpu": resource.MustParse(fmt.Sprintf("%d", n))},
}
}
Loading

0 comments on commit 574ecbe

Please sign in to comment.