Skip to content

Commit

Permalink
Add timeout information to scheduling metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
bastjan committed Aug 25, 2022
1 parent d132aa9 commit ae73b64
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 69 deletions.
52 changes: 52 additions & 0 deletions controllers/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package controllers

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

const (
metricTimedOutLabel = "timed_out"
metricCompletedLabel = "completed"
)

var podTimeUnscheduled = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Name: "scheduler_canary_pod_unscheduled_seconds",
Help: "Time spent in pending state",
}, []string{"namespace", "name", "reason"})

var podTimeUntilAcknowledged = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Name: "scheduler_canary_pod_until_acknowledged_seconds",
Help: "Time spent in an unacknowledged state",
}, []string{"namespace", "name", "reason"})

var podTimeUntilWaiting = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Name: "scheduler_canary_pod_until_waiting_seconds",
Help: "Time spent before pulling images mounting volumes",
}, []string{"namespace", "name", "reason"})

var podTimeCompleted = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Name: "scheduler_canary_pods_completed_seconds",
Help: "Time spent from creation to completion or from creation to the the specified .maxPodCompletionTimeout timeout",
}, []string{"namespace", "name", "reason"})

func init() {
metrics.Registry.MustRegister(
podTimeUnscheduled,
podTimeUntilAcknowledged,
podTimeUntilWaiting,

podTimeCompleted,
)
}

// initMetrics ensures the metrics are present in the output as soon as the instance is created.
func initMetrics(namespace, name string) {
for _, l := range [...]string{metricCompletedLabel, metricTimedOutLabel} {
podTimeUnscheduled.WithLabelValues(namespace, name, l)
podTimeUntilAcknowledged.WithLabelValues(namespace, name, l)
podTimeUntilWaiting.WithLabelValues(namespace, name, l)

podTimeCompleted.WithLabelValues(namespace, name, l)
}
}
65 changes: 37 additions & 28 deletions controllers/pod_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ import (
"fmt"
"time"

"github.com/appuio/scheduler-canary-controller/controllers/podstate"
"github.com/go-logr/logr"
"github.com/prometheus/client_golang/prometheus"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
Expand All @@ -33,6 +33,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"

"github.com/appuio/scheduler-canary-controller/controllers/podstate"
)

const (
Expand Down Expand Up @@ -72,23 +74,17 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R

instance := pod.GetLabels()[instanceLabel]

// ensure we check timeouts very earlier so pods still get deleted even if we mess up later
timeout, err := timeoutFromPod(pod)
if err != nil {
l.Error(err, "Pod has no timeout annotation, deleting immediately")
}
if podHasReachedTimeout(*pod, timeout) {
l.Info("Pod has reached timeout, deleting")
podsTimeouted.WithLabelValues(pod.Namespace, instance).Inc()
return ctrl.Result{}, r.Client.Delete(ctx, pod)
l.Error(err, "Pod has no valid timeout annotation, deleting immediately")
}

timedOut := podHasReachedTimeout(*pod, timeout)
state := podstate.State(*pod)
l.Info("Pod is in state", "state", state)
if state == podstate.PodCompleted {
err := calculateTimes(l, instance, pod)
if err != nil {
return ctrl.Result{}, err
l.Info("Pod is in state", "state", state, "timedOut", timedOut)
if timedOut || state == podstate.PodCompleted {
if err := recordSchedulingMetrics(l, instance, pod, timedOut); err != nil {
l.Error(err, "Failed to record metrics")
}
return ctrl.Result{}, r.Client.Delete(ctx, pod)
}
Expand Down Expand Up @@ -128,44 +124,57 @@ func (r *PodReconciler) strategicMergePatch(ctx context.Context, obj client.Obje
return r.Client.Patch(ctx, obj, client.RawPatch(types.StrategicMergePatchType, jp))
}

func calculateTimes(l logr.Logger, instance string, pod *corev1.Pod) error {
func recordSchedulingMetrics(l logr.Logger, instance string, pod *corev1.Pod, timedOut bool) error {
tr, err := getTrackedStates(pod)
if err != nil {
return err
return fmt.Errorf("failed to get state tracking timestamps: %w", err)
}

createdTime, ok := tr[podstate.PodCreated]
if !ok {
l.Info("WARNING: Pod created time not found, skipping calculation")
return nil
return fmt.Errorf("pod created time not found")
}

metricLabels := prometheus.Labels{"namespace": pod.Namespace, "name": instance}

cm := podTimeCompleted.MustCurryWith(metricLabels)
if timedOut {
cm.WithLabelValues(metricTimedOutLabel).Observe(time.Since(createdTime).Seconds())
} else {
cm.WithLabelValues(metricCompletedLabel).Observe(time.Since(createdTime).Seconds())
}

acknowledgedTime, hasAcknowledgedTime := tr[podstate.PodAcknowledged]
scheduledTime, hasScheduledTime := tr[podstate.PodScheduled]
waitingTime, hasWaitingTime := tr[podstate.PodWaiting]

usm := podTimeUnscheduled.WithLabelValues(pod.Namespace, instance)
usm := podTimeUnscheduled.MustCurryWith(metricLabels)
if hasScheduledTime {
usm.Observe(scheduledTime.Sub(createdTime).Seconds())
usm.WithLabelValues(metricCompletedLabel).Observe(scheduledTime.Sub(createdTime).Seconds())
} else if hasAcknowledgedTime {
usm.Observe(acknowledgedTime.Sub(createdTime).Seconds())
usm.WithLabelValues(metricCompletedLabel).Observe(acknowledgedTime.Sub(createdTime).Seconds())
} else if hasWaitingTime {
usm.Observe(waitingTime.Sub(createdTime).Seconds())
usm.WithLabelValues(metricCompletedLabel).Observe(waitingTime.Sub(createdTime).Seconds())
} else if timedOut {
usm.WithLabelValues(metricTimedOutLabel).Observe(time.Since(createdTime).Seconds())
}

uam := podTimeUntilAcknowledged.WithLabelValues(pod.Namespace, instance)
uam := podTimeUntilAcknowledged.MustCurryWith(metricLabels)
if hasAcknowledgedTime {
uam.Observe(acknowledgedTime.Sub(createdTime).Seconds())
uam.WithLabelValues(metricCompletedLabel).Observe(acknowledgedTime.Sub(createdTime).Seconds())
} else if hasWaitingTime {
uam.Observe(waitingTime.Sub(createdTime).Seconds())
uam.WithLabelValues(metricCompletedLabel).Observe(waitingTime.Sub(createdTime).Seconds())
} else if timedOut {
uam.WithLabelValues(metricTimedOutLabel).Observe(time.Since(createdTime).Seconds())
}

ptuw := podTimeUntilWaiting.MustCurryWith(metricLabels)
if hasWaitingTime {
podTimeUntilWaiting.
WithLabelValues(pod.Namespace, instance).
Observe(waitingTime.Sub(createdTime).Seconds())
ptuw.WithLabelValues(metricCompletedLabel).Observe(waitingTime.Sub(createdTime).Seconds())
} else if timedOut {
ptuw.WithLabelValues(metricTimedOutLabel).Observe(time.Since(createdTime).Seconds())
} else {
l.Info("WARNING: No pod waiting time found, skipping calculation")
return fmt.Errorf("pod has no waiting time")
}

return nil
Expand Down
41 changes: 0 additions & 41 deletions controllers/schedulercanary_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,49 +20,17 @@ import (
"context"
"time"

"github.com/prometheus/client_golang/prometheus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/metrics"

monitoringv1beta1 "github.com/appuio/scheduler-canary-controller/api/v1beta1"
"github.com/appuio/scheduler-canary-controller/controllers/podstate"
)

var podTimeUnscheduled = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Name: "scheduler_canary_pod_unscheduled_seconds",
Help: "Time spent in pending state",
}, []string{"namespace", "name"})

var podTimeUntilAcknowledged = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Name: "scheduler_canary_pod_until_acknowledged_seconds",
Help: "Time spent in an unacknowledged state",
}, []string{"namespace", "name"})

var podTimeUntilWaiting = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Name: "scheduler_canary_pod_until_waiting_seconds",
Help: "Time spent before pulling images mounting volumes",
}, []string{"namespace", "name"})

var podsTimeouted = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "scheduler_canary_pods_timeouted",
Help: "Pods that reached the specified .maxPodCompletionTimeout timeout",
}, []string{"namespace", "name"})

func init() {
metrics.Registry.MustRegister(
podTimeUnscheduled,
podTimeUntilAcknowledged,
podTimeUntilWaiting,

podsTimeouted,
)
}

// SchedulerCanaryReconciler reconciles a SchedulerCanary object
type SchedulerCanaryReconciler struct {
client.Client
Expand Down Expand Up @@ -155,12 +123,3 @@ func (r *SchedulerCanaryReconciler) createCanaryPod(ctx context.Context, instanc
l.Info("Reconciled")
return nil
}

// initMetrics ensures the metrics are present in the output as soon as the instance is created.
func initMetrics(namespace, name string) {
podTimeUnscheduled.WithLabelValues(namespace, name)
podTimeUntilAcknowledged.WithLabelValues(namespace, name)
podTimeUntilWaiting.WithLabelValues(namespace, name)

podsTimeouted.WithLabelValues(namespace, name)
}

0 comments on commit ae73b64

Please sign in to comment.