Add min gang cardinality #2984

Open · wants to merge 1 commit into base: master
2 changes: 1 addition & 1 deletion .github/workflows/autoupdate.yml
@@ -15,7 +15,7 @@ jobs:
- uses: docker://chinthakagodawita/autoupdate-action:v1
env:
GITHUB_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
PR_LABELS: "auto-update"
PR_LABELS: "auto-update"
MERGE_MSG: "Branch was auto-updated."
RETRY_COUNT: "5"
RETRY_SLEEP: "300"
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -2,10 +2,10 @@ name: CI

on:
push:
branches:
- master
tags:
- v*
branches-ignore:
- gh-pages
pull_request:
branches-ignore:
- gh-pages
47 changes: 0 additions & 47 deletions .github/workflows/not-airflow-operator.yml

This file was deleted.

42 changes: 0 additions & 42 deletions .github/workflows/not-python-client.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -214,7 +214,7 @@ jobs:

echo -e "### Git status" >> $GITHUB_STEP_SUMMARY
if [[ "$changed" -gt 0 ]]; then
echo -e "Generated proto files are out of date. Please run 'make proto' and commit the changes." >> $GITHUB_STEP_SUMMARY
echo -e "Generated proto files are out of date. Please run 'mage proto' and commit the changes." >> $GITHUB_STEP_SUMMARY

git status -s -uno >> $GITHUB_STEP_SUMMARY

2 changes: 1 addition & 1 deletion client/python/CONTRIBUTING.md
@@ -26,7 +26,7 @@ workflow for contributing. First time contributors can follow the guide below to
Unlike most python projects, the Armada python client contains a large quantity of generated code. This code must be
generated in order to compile and develop against the client.

From the root of the repository, run `make python`. This will generate python code needed to build
From the root of the repository, run `mage buildPython`. This will generate python code needed to build
and use the client. This command needs to be re-run anytime an API change is committed (e.g. a change to a `*.proto`
file).

2 changes: 1 addition & 1 deletion client/python/README.md
@@ -26,5 +26,5 @@ Before beginning, ensure you have:
- Network access to fetch docker images and go dependencies.

To generate all needed code, and install the python client:
1) From the root of the repository, run `make python`
1) From the root of the repository, run `mage buildPython`
2) Install the client using `pip install client/python`. It's strongly recommended you do this inside a virtualenv.
4 changes: 2 additions & 2 deletions client/python/docs/README.md
@@ -9,13 +9,13 @@ Usage

Easy way:
- Ensure all protobuf files needed for the client are generated by running
`make python` from the repository root.
`mage buildPython` from the repository root.
- `tox -e docs` will create a valid virtual environment and use it to generate
documentation. The generated files will be placed under `build/jekyll/*.md`.

Manual way:
- Ensure all protobuf files needed for the client are generated by running
`make python` from the repository root.
`mage buildPython` from the repository root.
- Create a virtual environment containing all the deps listed in `tox.ini`
under `[testenv:docs]`.
- Run `poetry install -v` from inside `client/python` to install the client
4 changes: 2 additions & 2 deletions cmd/scheduler/cmd/prune_database.go
@@ -1,13 +1,13 @@
package cmd

import (
"context"
"time"

"github.com/pkg/errors"
"github.com/spf13/cobra"
"k8s.io/apimachinery/pkg/util/clock"

"github.com/armadaproject/armada/internal/common/armadacontext"
"github.com/armadaproject/armada/internal/common/database"
schedulerdb "github.com/armadaproject/armada/internal/scheduler/database"
)
@@ -57,7 +57,7 @@ func pruneDatabase(cmd *cobra.Command, _ []string) error {
return errors.WithMessagef(err, "Failed to connect to database")
}

ctx, cancel := context.WithTimeout(context.Background(), timeout)
ctx, cancel := armadacontext.WithTimeout(armadacontext.Background(), timeout)
defer cancel()
return schedulerdb.PruneDb(ctx, db, batchSize, expireAfter, clock.RealClock{})
}
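
The prune command hunk above swaps the standard library's `context.WithTimeout` for Armada's `armadacontext` wrapper. A minimal sketch of the resulting pattern, assuming `armadacontext.WithTimeout` mirrors the standard `context.WithTimeout` shape and returns a `*armadacontext.Context` plus a cancel function (the `defer cancel()` in the diff suggests as much); the helper name `pruneWithTimeout` is hypothetical:

```go
package cmd

import (
	"time"

	"github.com/armadaproject/armada/internal/common/armadacontext"
)

// pruneWithTimeout is a hypothetical helper showing the updated pattern:
// build an Armada context with a deadline instead of a plain context.Context,
// then hand it to whatever performs the work (schedulerdb.PruneDb in the real code).
func pruneWithTimeout(timeout time.Duration, prune func(ctx *armadacontext.Context) error) error {
	ctx, cancel := armadacontext.WithTimeout(armadacontext.Background(), timeout)
	defer cancel()
	return prune(ctx)
}
```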
2 changes: 1 addition & 1 deletion docs/developer/manual-localdev.md
@@ -28,7 +28,7 @@ mage BootstrapTools
# Compile .pb.go files from .proto files
# (only necessary after changing a .proto file).
mage proto
make dotnet
mage dotnet

# Build the Docker images containing all Armada components.
# Only the main "bundle" is needed for quickly testing Armada.
30 changes: 27 additions & 3 deletions docs/python_airflow_operator.md
@@ -239,9 +239,27 @@ Reports the result of the job and returns.



#### serialize()
Get a serialized version of this object.


* **Returns**

A dict of keyword arguments used when instantiating
this object.


* **Return type**

dict


#### template_fields: Sequence[str] = ('job_request_items', )

### _class_ armada.operators.armada_deferrable.ArmadaJobCompleteTrigger(job_id, job_service_channel_args, armada_queue, job_set_id, airflow_task_name)
### _class_ armada.operators.armada_deferrable.ArmadaJobCompleteTrigger(job_id, job_service_channel_args, armada_queue, job_set_id, airflow_task_name, poll_interval=30)
Bases: `BaseTrigger`

An airflow trigger that monitors the job state of an armada job.
@@ -269,6 +287,9 @@ Triggers when the job is complete.
belongs.


* **poll_interval** (*int*) – How often to poll jobservice to get status.



* **Returns**

@@ -281,7 +302,7 @@ Runs the trigger. Meant to be called by an airflow triggerer process.


#### serialize()
Returns the information needed to reconstruct this Trigger.
Return the information needed to reconstruct this Trigger.


* **Returns**
@@ -664,7 +685,7 @@ A terminated event is SUCCEEDED, FAILED or CANCELLED



### _async_ armada.operators.utils.search_for_job_complete_async(armada_queue, job_set_id, airflow_task_name, job_id, job_service_client, log, time_out_for_failure=7200)
### _async_ armada.operators.utils.search_for_job_complete_async(armada_queue, job_set_id, airflow_task_name, job_id, job_service_client, log, poll_interval, time_out_for_failure=7200)
Poll JobService cache asynchronously until you get a terminated event.

A terminated event is SUCCEEDED, FAILED or CANCELLED
@@ -689,6 +710,9 @@ A terminated event is SUCCEEDED, FAILED or CANCELLED
It is optional only for testing


* **poll_interval** (*int*) – How often to poll jobservice to get status.


* **time_out_for_failure** (*int*) – The amount of time a job
can be in job_id_not_found
before we decide it was an invalid job
12 changes: 6 additions & 6 deletions internal/armada/server/lease.go
@@ -344,7 +344,7 @@ func (q *AggregatedQueueServer) getJobs(ctx *armadacontext.Context, req *api.Str
lastSeen,
)
if err != nil {
logging.WithStacktrace(ctx.Log, err).Warnf(
logging.WithStacktrace(ctx, err).Warnf(
"skipping node %s from executor %s", nodeInfo.GetName(), req.GetClusterId(),
)
continue
@@ -566,7 +566,7 @@ func (q *AggregatedQueueServer) getJobs(ctx *armadacontext.Context, req *api.Str
if q.SchedulingContextRepository != nil {
sctx.ClearJobSpecs()
if err := q.SchedulingContextRepository.AddSchedulingContext(sctx); err != nil {
logging.WithStacktrace(ctx.Log, err).Error("failed to store scheduling context")
logging.WithStacktrace(ctx, err).Error("failed to store scheduling context")
}
}

@@ -641,7 +641,7 @@ func (q *AggregatedQueueServer) getJobs(ctx *armadacontext.Context, req *api.Str
jobIdsToDelete := util.Map(jobsToDelete, func(job *api.Job) string { return job.Id })
log.Infof("deleting preempted jobs: %v", jobIdsToDelete)
if deletionResult, err := q.jobRepository.DeleteJobs(jobsToDelete); err != nil {
logging.WithStacktrace(ctx.Log, err).Error("failed to delete preempted jobs from Redis")
logging.WithStacktrace(ctx, err).Error("failed to delete preempted jobs from Redis")
} else {
deleteErrorByJobId := armadamaps.MapKeys(deletionResult, func(job *api.Job) string { return job.Id })
for jobId := range preemptedApiJobsById {
@@ -704,7 +704,7 @@ func (q *AggregatedQueueServer) getJobs(ctx *armadacontext.Context, req *api.Str
}
}
if err := q.usageRepository.UpdateClusterQueueResourceUsage(req.ClusterId, currentExecutorReport); err != nil {
logging.WithStacktrace(ctx.Log, err).Errorf("failed to update cluster usage")
logging.WithStacktrace(ctx, err).Errorf("failed to update cluster usage")
}

allocatedByQueueAndPriorityClassForPool = q.aggregateAllocationAcrossExecutor(reportsByExecutor, req.Pool)
@@ -728,7 +728,7 @@ func (q *AggregatedQueueServer) getJobs(ctx *armadacontext.Context, req *api.Str
}
node, err := nodeDb.GetNode(nodeId)
if err != nil {
logging.WithStacktrace(ctx.Log, err).Warnf("failed to set node id selector on job %s: node with id %s not found", apiJob.Id, nodeId)
logging.WithStacktrace(ctx, err).Warnf("failed to set node id selector on job %s: node with id %s not found", apiJob.Id, nodeId)
continue
}
v := node.Labels[q.schedulingConfig.Preemption.NodeIdLabel]
@@ -764,7 +764,7 @@ func (q *AggregatedQueueServer) getJobs(ctx *armadacontext.Context, req *api.Str
}
node, err := nodeDb.GetNode(nodeId)
if err != nil {
logging.WithStacktrace(ctx.Log, err).Warnf("failed to set node name on job %s: node with id %s not found", apiJob.Id, nodeId)
logging.WithStacktrace(ctx, err).Warnf("failed to set node name on job %s: node with id %s not found", apiJob.Id, nodeId)
continue
}
podSpec.NodeName = node.Name
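
The lease.go hunks above all make the same mechanical change: `logging.WithStacktrace` now receives the `armadacontext.Context` itself rather than its `ctx.Log` field. A minimal sketch of the new call shape, assuming the `logging` package lives under `internal/common/logging` (its import path is not visible in this diff) and with `warnSkippedNode` as a hypothetical wrapper:

```go
package server

import (
	"github.com/armadaproject/armada/internal/common/armadacontext"
	// Assumed import path for the logging helpers; it is not shown in this diff.
	"github.com/armadaproject/armada/internal/common/logging"
)

// warnSkippedNode is a hypothetical wrapper around the call shape used in the hunks above:
// the context itself is passed to WithStacktrace, and the returned logger's
// Warnf/Error/Errorf methods are then used exactly as before.
func warnSkippedNode(ctx *armadacontext.Context, err error, nodeName, clusterId string) {
	logging.WithStacktrace(ctx, err).Warnf(
		"skipping node %s from executor %s", nodeName, clusterId,
	)
}
```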
8 changes: 4 additions & 4 deletions internal/armada/server/submit_from_log.go
@@ -125,12 +125,12 @@ func (srv *SubmitFromLog) Run(ctx *armadacontext.Context) error {
sequence, err := eventutil.UnmarshalEventSequence(ctxWithLogger, msg.Payload())
if err != nil {
srv.ack(ctx, msg)
logging.WithStacktrace(ctxWithLogger.Log, err).Warnf("processing message failed; ignoring")
logging.WithStacktrace(ctxWithLogger, err).Warnf("processing message failed; ignoring")
numErrored++
break
}

ctxWithLogger.Log.WithField("numEvents", len(sequence.Events)).Info("processing sequence")
ctxWithLogger.WithField("numEvents", len(sequence.Events)).Info("processing sequence")
// TODO: Improve retry logic.
srv.ProcessSequence(ctxWithLogger, sequence)
srv.ack(ctx, msg)
@@ -155,11 +155,11 @@ func (srv *SubmitFromLog) ProcessSequence(ctx *armadacontext.Context, sequence *
for i < len(sequence.Events) && time.Since(lastProgress) < timeout {
j, err := srv.ProcessSubSequence(ctx, i, sequence)
if err != nil {
logging.WithStacktrace(ctx.Log, err).WithFields(logrus.Fields{"lowerIndex": i, "upperIndex": j}).Warnf("processing subsequence failed; ignoring")
logging.WithStacktrace(ctx, err).WithFields(logrus.Fields{"lowerIndex": i, "upperIndex": j}).Warnf("processing subsequence failed; ignoring")
}

if j == i {
ctx.Log.WithFields(logrus.Fields{"lowerIndex": i, "upperIndex": j}).Info("made no progress")
ctx.WithFields(logrus.Fields{"lowerIndex": i, "upperIndex": j}).Info("made no progress")

// We should only get here if a transient error occurs.
// Sleep for a bit before retrying.
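
Similarly, submit_from_log.go now calls the structured-logging helpers on the context directly (`ctx.WithField(...)`, `ctx.WithFields(...)`) instead of going through `ctx.Log`. A small sketch of that usage; `logProgress` and its parameters are hypothetical:

```go
package server

import (
	"github.com/sirupsen/logrus"

	"github.com/armadaproject/armada/internal/common/armadacontext"
)

// logProgress is a hypothetical helper illustrating the updated logging style:
// WithField/WithFields are invoked on the armadacontext.Context itself, matching
// the calls shown in the hunks above.
func logProgress(ctx *armadacontext.Context, numEvents, lowerIndex, upperIndex int) {
	ctx.WithField("numEvents", numEvents).Info("processing sequence")
	ctx.WithFields(logrus.Fields{"lowerIndex": lowerIndex, "upperIndex": upperIndex}).Info("made no progress")
}
```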