From ae73b646096153358df4f5a17c3d228215accc1d Mon Sep 17 00:00:00 2001 From: Sebastian Widmer Date: Thu, 25 Aug 2022 10:50:39 +0200 Subject: [PATCH] Add timeout information to scheduling metrics --- controllers/metrics.go | 52 ++++++++++++++++++ controllers/pod_controller.go | 65 +++++++++++++---------- controllers/schedulercanary_controller.go | 41 -------------- 3 files changed, 89 insertions(+), 69 deletions(-) create mode 100644 controllers/metrics.go diff --git a/controllers/metrics.go b/controllers/metrics.go new file mode 100644 index 0000000..a2296fa --- /dev/null +++ b/controllers/metrics.go @@ -0,0 +1,52 @@ +package controllers + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +const ( + metricTimedOutLabel = "timed_out" + metricCompletedLabel = "completed" +) + +var podTimeUnscheduled = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "scheduler_canary_pod_unscheduled_seconds", + Help: "Time spent in pending state", +}, []string{"namespace", "name", "reason"}) + +var podTimeUntilAcknowledged = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "scheduler_canary_pod_until_acknowledged_seconds", + Help: "Time spent in an unacknowledged state", +}, []string{"namespace", "name", "reason"}) + +var podTimeUntilWaiting = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "scheduler_canary_pod_until_waiting_seconds", + Help: "Time spent before pulling images and mounting volumes", +}, []string{"namespace", "name", "reason"}) + +var podTimeCompleted = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "scheduler_canary_pods_completed_seconds", + Help: "Time spent from creation to completion or from creation to the specified .maxPodCompletionTimeout timeout", +}, []string{"namespace", "name", "reason"}) + +func init() { + metrics.Registry.MustRegister( + podTimeUnscheduled, + podTimeUntilAcknowledged, + podTimeUntilWaiting, + + podTimeCompleted, + ) +} + +// initMetrics 
ensures the metrics are present in the output as soon as the instance is created. +func initMetrics(namespace, name string) { + for _, l := range [...]string{metricCompletedLabel, metricTimedOutLabel} { + podTimeUnscheduled.WithLabelValues(namespace, name, l) + podTimeUntilAcknowledged.WithLabelValues(namespace, name, l) + podTimeUntilWaiting.WithLabelValues(namespace, name, l) + + podTimeCompleted.WithLabelValues(namespace, name, l) + } +} diff --git a/controllers/pod_controller.go b/controllers/pod_controller.go index 837a3da..2f4039f 100644 --- a/controllers/pod_controller.go +++ b/controllers/pod_controller.go @@ -22,8 +22,8 @@ import ( "fmt" "time" - "github.com/appuio/scheduler-canary-controller/controllers/podstate" "github.com/go-logr/logr" + "github.com/prometheus/client_golang/prometheus" corev1 "k8s.io/api/core/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -33,6 +33,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" + + "github.com/appuio/scheduler-canary-controller/controllers/podstate" ) const ( @@ -72,23 +74,17 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R instance := pod.GetLabels()[instanceLabel] - // ensure we check timeouts very earlier so pods still get deleted even if we mess up later timeout, err := timeoutFromPod(pod) if err != nil { - l.Error(err, "Pod has no timeout annotation, deleting immediately") - } - if podHasReachedTimeout(*pod, timeout) { - l.Info("Pod has reached timeout, deleting") - podsTimeouted.WithLabelValues(pod.Namespace, instance).Inc() - return ctrl.Result{}, r.Client.Delete(ctx, pod) + l.Error(err, "Pod has no valid timeout annotation, deleting immediately") } + timedOut := podHasReachedTimeout(*pod, timeout) state := podstate.State(*pod) - l.Info("Pod is in state", "state", state) - if state == podstate.PodCompleted { - err := calculateTimes(l, instance, 
pod) - if err != nil { - return ctrl.Result{}, err + l.Info("Pod is in state", "state", state, "timedOut", timedOut) + if timedOut || state == podstate.PodCompleted { + if err := recordSchedulingMetrics(l, instance, pod, timedOut); err != nil { + l.Error(err, "Failed to record metrics") } return ctrl.Result{}, r.Client.Delete(ctx, pod) } @@ -128,44 +124,57 @@ func (r *PodReconciler) strategicMergePatch(ctx context.Context, obj client.Obje return r.Client.Patch(ctx, obj, client.RawPatch(types.StrategicMergePatchType, jp)) } -func calculateTimes(l logr.Logger, instance string, pod *corev1.Pod) error { +func recordSchedulingMetrics(l logr.Logger, instance string, pod *corev1.Pod, timedOut bool) error { tr, err := getTrackedStates(pod) if err != nil { - return err + return fmt.Errorf("failed to get state tracking timestamps: %w", err) } createdTime, ok := tr[podstate.PodCreated] if !ok { - l.Info("WARNING: Pod created time not found, skipping calculation") - return nil + return fmt.Errorf("pod created time not found") + } + + metricLabels := prometheus.Labels{"namespace": pod.Namespace, "name": instance} + + cm := podTimeCompleted.MustCurryWith(metricLabels) + if timedOut { + cm.WithLabelValues(metricTimedOutLabel).Observe(time.Since(createdTime).Seconds()) + } else { + cm.WithLabelValues(metricCompletedLabel).Observe(time.Since(createdTime).Seconds()) } acknowledgedTime, hasAcknowledgedTime := tr[podstate.PodAcknowledged] scheduledTime, hasScheduledTime := tr[podstate.PodScheduled] waitingTime, hasWaitingTime := tr[podstate.PodWaiting] - usm := podTimeUnscheduled.WithLabelValues(pod.Namespace, instance) + usm := podTimeUnscheduled.MustCurryWith(metricLabels) if hasScheduledTime { - usm.Observe(scheduledTime.Sub(createdTime).Seconds()) + usm.WithLabelValues(metricCompletedLabel).Observe(scheduledTime.Sub(createdTime).Seconds()) } else if hasAcknowledgedTime { - usm.Observe(acknowledgedTime.Sub(createdTime).Seconds()) + 
usm.WithLabelValues(metricCompletedLabel).Observe(acknowledgedTime.Sub(createdTime).Seconds()) } else if hasWaitingTime { - usm.Observe(waitingTime.Sub(createdTime).Seconds()) + usm.WithLabelValues(metricCompletedLabel).Observe(waitingTime.Sub(createdTime).Seconds()) + } else if timedOut { + usm.WithLabelValues(metricTimedOutLabel).Observe(time.Since(createdTime).Seconds()) } - uam := podTimeUntilAcknowledged.WithLabelValues(pod.Namespace, instance) + uam := podTimeUntilAcknowledged.MustCurryWith(metricLabels) if hasAcknowledgedTime { - uam.Observe(acknowledgedTime.Sub(createdTime).Seconds()) + uam.WithLabelValues(metricCompletedLabel).Observe(acknowledgedTime.Sub(createdTime).Seconds()) } else if hasWaitingTime { - uam.Observe(waitingTime.Sub(createdTime).Seconds()) + uam.WithLabelValues(metricCompletedLabel).Observe(waitingTime.Sub(createdTime).Seconds()) + } else if timedOut { + uam.WithLabelValues(metricTimedOutLabel).Observe(time.Since(createdTime).Seconds()) } + ptuw := podTimeUntilWaiting.MustCurryWith(metricLabels) if hasWaitingTime { - podTimeUntilWaiting. - WithLabelValues(pod.Namespace, instance). 
- Observe(waitingTime.Sub(createdTime).Seconds()) + ptuw.WithLabelValues(metricCompletedLabel).Observe(waitingTime.Sub(createdTime).Seconds()) + } else if timedOut { + ptuw.WithLabelValues(metricTimedOutLabel).Observe(time.Since(createdTime).Seconds()) } else { - l.Info("WARNING: No pod waiting time found, skipping calculation") + return fmt.Errorf("pod has no waiting time") } return nil diff --git a/controllers/schedulercanary_controller.go b/controllers/schedulercanary_controller.go index 8afe761..d405161 100644 --- a/controllers/schedulercanary_controller.go +++ b/controllers/schedulercanary_controller.go @@ -20,49 +20,17 @@ import ( "context" "time" - "github.com/prometheus/client_golang/prometheus" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/metrics" monitoringv1beta1 "github.com/appuio/scheduler-canary-controller/api/v1beta1" "github.com/appuio/scheduler-canary-controller/controllers/podstate" ) -var podTimeUnscheduled = prometheus.NewSummaryVec(prometheus.SummaryOpts{ - Name: "scheduler_canary_pod_unscheduled_seconds", - Help: "Time spent in pending state", -}, []string{"namespace", "name"}) - -var podTimeUntilAcknowledged = prometheus.NewSummaryVec(prometheus.SummaryOpts{ - Name: "scheduler_canary_pod_until_acknowledged_seconds", - Help: "Time spent in an unacknowledged state", -}, []string{"namespace", "name"}) - -var podTimeUntilWaiting = prometheus.NewSummaryVec(prometheus.SummaryOpts{ - Name: "scheduler_canary_pod_until_waiting_seconds", - Help: "Time spent before pulling images mounting volumes", -}, []string{"namespace", "name"}) - -var podsTimeouted = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "scheduler_canary_pods_timeouted", - Help: "Pods that reached the specified 
.maxPodCompletionTimeout timeout", -}, []string{"namespace", "name"}) - -func init() { - metrics.Registry.MustRegister( - podTimeUnscheduled, - podTimeUntilAcknowledged, - podTimeUntilWaiting, - - podsTimeouted, - ) -} - // SchedulerCanaryReconciler reconciles a SchedulerCanary object type SchedulerCanaryReconciler struct { client.Client @@ -155,12 +123,3 @@ func (r *SchedulerCanaryReconciler) createCanaryPod(ctx context.Context, instanc l.Info("Reconciled") return nil } - -// initMetrics ensures the metrics are present in the output as soon as the instance is created. -func initMetrics(namespace, name string) { - podTimeUnscheduled.WithLabelValues(namespace, name) - podTimeUntilAcknowledged.WithLabelValues(namespace, name) - podTimeUntilWaiting.WithLabelValues(namespace, name) - - podsTimeouted.WithLabelValues(namespace, name) -}