From 86efb449fa21e6063cc83d89dd6b39f58f6a7295 Mon Sep 17 00:00:00 2001 From: Chris Bandy Date: Mon, 2 Dec 2024 15:32:39 -0600 Subject: [PATCH] Add functions for attaching errors to spans The "spancheck" linter reminds us to call "Span.RecordError" when returning an error. Two functions help with that: "tracing.Check" and "tracing.Escape". --- .golangci.yaml | 3 + .../crunchybridgecluster_controller.go | 16 ++-- .../pgupgrade/pgupgrade_controller.go | 1 + .../controller/postgrescluster/controller.go | 21 ++--- .../controller/postgrescluster/instance.go | 38 ++++---- .../standalone_pgadmin/controller.go | 4 +- internal/naming/dns.go | 3 +- internal/tracing/errors.go | 34 +++++++ internal/tracing/errors_test.go | 94 +++++++++++++++++++ 9 files changed, 172 insertions(+), 42 deletions(-) create mode 100644 internal/tracing/errors.go create mode 100644 internal/tracing/errors_test.go diff --git a/.golangci.yaml b/.golangci.yaml index e2dd0c9fb..59bf0ad53 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -100,8 +100,11 @@ linters-settings: no-unaliased: true spancheck: + checks: [end, record-error] extra-start-span-signatures: - 'github.com/crunchydata/postgres-operator/internal/tracing.Start:opentelemetry' + ignore-check-signatures: + - 'tracing.Escape' issues: exclude-generated: strict diff --git a/internal/bridge/crunchybridgecluster/crunchybridgecluster_controller.go b/internal/bridge/crunchybridgecluster/crunchybridgecluster_controller.go index 49a0cc555..df283318c 100644 --- a/internal/bridge/crunchybridgecluster/crunchybridgecluster_controller.go +++ b/internal/bridge/crunchybridgecluster/crunchybridgecluster_controller.go @@ -133,7 +133,7 @@ func (r *CrunchyBridgeClusterReconciler) Reconcile(ctx context.Context, req ctrl // NotFound cannot be fixed by requeuing so ignore it. During background // deletion, we receive delete events from crunchybridgecluster's dependents after // crunchybridgecluster is deleted. - return ctrl.Result{}, client.IgnoreNotFound(err) + return ctrl.Result{}, tracing.Escape(span, client.IgnoreNotFound(err)) } // Get and validate connection secret for requests @@ -152,12 +152,12 @@ func (r *CrunchyBridgeClusterReconciler) Reconcile(ctx context.Context, req ctrl // is not being deleted. 
if result, err := r.handleDelete(ctx, crunchybridgecluster, key); err != nil { log.Error(err, "deleting") - return ctrl.Result{}, err + return ctrl.Result{}, tracing.Escape(span, err) } else if result != nil { if log := log.V(1); log.Enabled() { log.Info("deleting", "result", fmt.Sprintf("%+v", *result)) } - return *result, err + return *result, tracing.Escape(span, err) } // Wonder if there's a better way to handle adding/checking/removing statuses @@ -190,7 +190,7 @@ func (r *CrunchyBridgeClusterReconciler) Reconcile(ctx context.Context, req ctrl // Check if a cluster with the same name already exists controllerResult, err := r.handleDuplicateClusterName(ctx, key, team, crunchybridgecluster) if err != nil || controllerResult != nil { - return *controllerResult, err + return *controllerResult, tracing.Escape(span, err) } // if we've gotten here then no cluster exists with that name and we're missing the ID, ergo, create cluster @@ -204,26 +204,26 @@ func (r *CrunchyBridgeClusterReconciler) Reconcile(ctx context.Context, req ctrl // Get Cluster err = r.handleGetCluster(ctx, key, crunchybridgecluster) if err != nil { - return ctrl.Result{}, err + return ctrl.Result{}, tracing.Escape(span, err) } // Get Cluster Status err = r.handleGetClusterStatus(ctx, key, crunchybridgecluster) if err != nil { - return ctrl.Result{}, err + return ctrl.Result{}, tracing.Escape(span, err) } // Get Cluster Upgrade err = r.handleGetClusterUpgrade(ctx, key, crunchybridgecluster) if err != nil { - return ctrl.Result{}, err + return ctrl.Result{}, tracing.Escape(span, err) } // Reconcile roles and their secrets err = r.reconcilePostgresRoles(ctx, key, crunchybridgecluster) if err != nil { log.Error(err, "issue reconciling postgres user roles/secrets") - return ctrl.Result{}, err + return ctrl.Result{}, tracing.Escape(span, err) } // For now, we skip updating until the upgrade status is cleared. diff --git a/internal/controller/pgupgrade/pgupgrade_controller.go b/internal/controller/pgupgrade/pgupgrade_controller.go index c66494c6b..349a01ee8 100644 --- a/internal/controller/pgupgrade/pgupgrade_controller.go +++ b/internal/controller/pgupgrade/pgupgrade_controller.go @@ -98,6 +98,7 @@ func (r *PGUpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( ctx, span := tracing.Start(ctx, "reconcile-pgupgrade") log := logging.FromContext(ctx) defer span.End() + defer func(s tracing.Span) { _ = tracing.Escape(s, err) }(span) // Retrieve the upgrade from the client cache, if it exists. A deferred // function below will send any changes to its Status field. diff --git a/internal/controller/postgrescluster/controller.go b/internal/controller/postgrescluster/controller.go index 512738621..933b78181 100644 --- a/internal/controller/postgrescluster/controller.go +++ b/internal/controller/postgrescluster/controller.go @@ -80,9 +80,8 @@ func (r *Reconciler) Reconcile( // cluster is deleted. if err = client.IgnoreNotFound(err); err != nil { log.Error(err, "unable to fetch PostgresCluster") - span.RecordError(err) } - return runtime.ErrorWithBackoff(err) + return runtime.ErrorWithBackoff(tracing.Escape(span, err)) } // Set any defaults that may not have been stored in the API. No DeepCopy @@ -107,9 +106,8 @@ func (r *Reconciler) Reconcile( // Check for and handle deletion of cluster. Return early if it is being // deleted or there was an error. 
if result, err := r.handleDelete(ctx, cluster); err != nil { - span.RecordError(err) log.Error(err, "deleting") - return runtime.ErrorWithBackoff(err) + return runtime.ErrorWithBackoff(tracing.Escape(span, err)) } else if result != nil { if log := log.V(1); log.Enabled() { @@ -130,7 +128,7 @@ func (r *Reconciler) Reconcile( // specifically allow reconciliation if the cluster is shutdown to // facilitate upgrades, otherwise return if !initialize.FromPointer(cluster.Spec.Shutdown) { - return runtime.ErrorWithBackoff(err) + return runtime.ErrorWithBackoff(tracing.Escape(span, err)) } } // Issue Warning Event if postgres version is EOL according to PostgreSQL: @@ -154,7 +152,7 @@ func (r *Reconciler) Reconcile( path := field.NewPath("spec", "standby") err := field.Invalid(path, cluster.Name, "Standby requires a host or repoName to be enabled") r.Recorder.Event(cluster, corev1.EventTypeWarning, "InvalidStandbyConfiguration", err.Error()) - return runtime.ErrorWithBackoff(err) + return runtime.ErrorWithBackoff(tracing.Escape(span, err)) } var ( @@ -208,7 +206,7 @@ func (r *Reconciler) Reconcile( ObservedGeneration: cluster.GetGeneration(), }) - return runtime.ErrorWithBackoff(patchClusterStatus()) + return runtime.ErrorWithBackoff(tracing.Escape(span, patchClusterStatus())) } else { meta.RemoveStatusCondition(&cluster.Status.Conditions, v1beta1.PostgresClusterProgressing) } @@ -228,7 +226,7 @@ func (r *Reconciler) Reconcile( ObservedGeneration: cluster.GetGeneration(), }) - return runtime.ErrorWithBackoff(patchClusterStatus()) + return runtime.ErrorWithBackoff(tracing.Escape(span, patchClusterStatus())) } else { meta.RemoveStatusCondition(&cluster.Status.Conditions, v1beta1.PostgresClusterProgressing) } @@ -259,7 +257,8 @@ func (r *Reconciler) Reconcile( // return is no longer needed, and reconciliation can proceed normally. returnEarly, err := r.reconcileDirMoveJobs(ctx, cluster) if err != nil || returnEarly { - return runtime.ErrorWithBackoff(errors.Join(err, patchClusterStatus())) + return runtime.ErrorWithBackoff(tracing.Escape(span, + errors.Join(err, patchClusterStatus()))) } } if err == nil { @@ -309,7 +308,7 @@ func (r *Reconciler) Reconcile( // can proceed normally. returnEarly, err := r.reconcileDataSource(ctx, cluster, instances, clusterVolumes, rootCA, backupsSpecFound) if err != nil || returnEarly { - return runtime.ErrorWithBackoff(errors.Join(err, patchClusterStatus())) + return runtime.ErrorWithBackoff(tracing.Escape(span, errors.Join(err, patchClusterStatus()))) } } if err == nil { @@ -401,7 +400,7 @@ func (r *Reconciler) Reconcile( log.V(1).Info("reconciled cluster") - return result, errors.Join(err, patchClusterStatus()) + return result, tracing.Escape(span, errors.Join(err, patchClusterStatus())) } // deleteControlled safely deletes object when it is controlled by cluster. 
diff --git a/internal/controller/postgrescluster/instance.go b/internal/controller/postgrescluster/instance.go index 4dfa37559..97cc2cdce 100644 --- a/internal/controller/postgrescluster/instance.go +++ b/internal/controller/postgrescluster/instance.go @@ -807,8 +807,7 @@ func (r *Reconciler) rolloutInstance( err = errors.New("unable to switchover") } - span.RecordError(err) - return err + return tracing.Escape(span, err) } // When the cluster has only one instance for failover, perform a series of @@ -840,8 +839,7 @@ func (r *Reconciler) rolloutInstance( logging.FromContext(ctx).V(1).Info("attempted checkpoint", "duration", elapsed, "stdout", stdout, "stderr", stderr) - span.RecordError(err) - return elapsed, err + return elapsed, tracing.Escape(span, err) } duration, err := checkpoint(ctx) @@ -950,8 +948,7 @@ func (r *Reconciler) rolloutInstances( } } - span.RecordError(err) - return err + return tracing.Escape(span, err) } // scaleDownInstances removes extra instances from a cluster until it matches @@ -1081,20 +1078,23 @@ func (r *Reconciler) scaleUpInstances( // While there are fewer instances than specified, generate another empty one // and append it. for len(instances) < int(*set.Replicas) { - _, span := tracing.Start(ctx, "generate-instance-name") - next := naming.GenerateInstance(cluster, set) - // if there are any available instance names (as determined by observing any PVCs for the - // instance set that are not currently associated with an instance, e.g. in the event the - // instance STS was deleted), then reuse them instead of generating a new name - if len(availableInstanceNames) > 0 { - next.Name = availableInstanceNames[0] - availableInstanceNames = availableInstanceNames[1:] - } else { - for instanceNames.Has(next.Name) { - next = naming.GenerateInstance(cluster, set) + next := func() metav1.ObjectMeta { + _, span := tracing.Start(ctx, "generate-instance-name") + defer span.End() + n := naming.GenerateInstance(cluster, set) + // if there are any available instance names (as determined by observing any PVCs for the + // instance set that are not currently associated with an instance, e.g. in the event the + // instance STS was deleted), then reuse them instead of generating a new name + if len(availableInstanceNames) > 0 { + n.Name = availableInstanceNames[0] + availableInstanceNames = availableInstanceNames[1:] + } else { + for instanceNames.Has(n.Name) { + n = naming.GenerateInstance(cluster, set) + } } - } - span.End() + return n + }() instanceNames.Insert(next.Name) instances = append(instances, &appsv1.StatefulSet{ObjectMeta: next}) diff --git a/internal/controller/standalone_pgadmin/controller.go b/internal/controller/standalone_pgadmin/controller.go index 481231684..7e1005900 100644 --- a/internal/controller/standalone_pgadmin/controller.go +++ b/internal/controller/standalone_pgadmin/controller.go @@ -93,7 +93,7 @@ func (r *PGAdminReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // NotFound cannot be fixed by requeuing so ignore it. During background // deletion, we receive delete events from pgadmin's dependents after // pgadmin is deleted. - return ctrl.Result{}, client.IgnoreNotFound(err) + return ctrl.Result{}, tracing.Escape(span, client.IgnoreNotFound(err)) } // Write any changes to the pgadmin status on the way out. 
@@ -148,7 +148,7 @@ func (r *PGAdminReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct log.V(1).Info("Reconciled pgAdmin") } - return ctrl.Result{}, err + return ctrl.Result{}, tracing.Escape(span, err) } // The owner reference created by controllerutil.SetControllerReference blocks diff --git a/internal/naming/dns.go b/internal/naming/dns.go index 3d770bd2a..3925bfe98 100644 --- a/internal/naming/dns.go +++ b/internal/naming/dns.go @@ -79,11 +79,10 @@ func KubernetesClusterDomain(ctx context.Context) string { api := "kubernetes.default.svc" cname, err := net.DefaultResolver.LookupCNAME(ctx, api) - if err == nil { + if tracing.Check(span, err) { return strings.TrimPrefix(cname, api+".") } - span.RecordError(err) // The kubeadm default is "cluster.local" and is adequate when not running // in an actual Kubernetes cluster. return "cluster.local." diff --git a/internal/tracing/errors.go b/internal/tracing/errors.go new file mode 100644 index 000000000..d0e00cf56 --- /dev/null +++ b/internal/tracing/errors.go @@ -0,0 +1,34 @@ +// Copyright 2021 - 2024 Crunchy Data Solutions, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +package tracing + +import ( + semconv "go.opentelemetry.io/otel/semconv/v1.27.0" + "go.opentelemetry.io/otel/trace" +) + +// Check returns true when err is nil. Otherwise, it adds err as an exception +// event on s and returns false. If you intend to return err, consider using +// [Escape] instead. +// +// See: https://opentelemetry.io/docs/specs/semconv/exceptions/exceptions-spans +func Check(s Span, err error) bool { + if err == nil { + return true + } + if s.IsRecording() { + s.RecordError(err) + } + return false +} + +// Escape adds non-nil err as an escaped exception event on s and returns err. +// See: https://opentelemetry.io/docs/specs/semconv/exceptions/exceptions-spans +func Escape(s Span, err error) error { + if err != nil && s.IsRecording() { + s.RecordError(err, trace.WithAttributes(semconv.ExceptionEscaped(true))) + } + return err +} diff --git a/internal/tracing/errors_test.go b/internal/tracing/errors_test.go new file mode 100644 index 000000000..4f8f6d1be --- /dev/null +++ b/internal/tracing/errors_test.go @@ -0,0 +1,94 @@ +// Copyright 2021 - 2024 Crunchy Data Solutions, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package tracing + +import ( + "context" + "errors" + "testing" + + "go.opentelemetry.io/otel/sdk/trace" + "go.opentelemetry.io/otel/sdk/trace/tracetest" + semconv "go.opentelemetry.io/otel/semconv/v1.27.0" + "gotest.tools/v3/assert" +) + +func TestCheck(t *testing.T) { + recorder := tracetest.NewSpanRecorder() + tracer := trace.NewTracerProvider( + trace.WithSpanProcessor(recorder), + ).Tracer("") + + { + _, span := tracer.Start(context.Background(), "") + assert.Assert(t, Check(span, nil)) + span.End() + + spans := recorder.Ended() + assert.Equal(t, len(spans), 1) + assert.Equal(t, len(spans[0].Events()), 0, "expected no events") + } + + { + _, span := tracer.Start(context.Background(), "") + assert.Assert(t, !Check(span, errors.New("msg"))) + span.End() + + spans := recorder.Ended() + assert.Equal(t, len(spans), 2) + assert.Equal(t, len(spans[1].Events()), 1, "expected one event") + + event := spans[1].Events()[0] + assert.Equal(t, event.Name, semconv.ExceptionEventName) + + attrs := event.Attributes + assert.Equal(t, len(attrs), 2) + assert.Equal(t, string(attrs[0].Key), "exception.type") + assert.Equal(t, string(attrs[1].Key), "exception.message") + assert.Equal(t, attrs[0].Value.AsInterface(), "*errors.errorString") + assert.Equal(t, attrs[1].Value.AsInterface(), "msg") + } +} + +func TestEscape(t *testing.T) { + recorder := tracetest.NewSpanRecorder() + tracer := trace.NewTracerProvider( + trace.WithSpanProcessor(recorder), + ).Tracer("") + + { + _, span := tracer.Start(context.Background(), "") + assert.NilError(t, Escape(span, nil)) + span.End() + + spans := recorder.Ended() + assert.Equal(t, len(spans), 1) + assert.Equal(t, len(spans[0].Events()), 0, "expected no events") + } + + { + _, span := tracer.Start(context.Background(), "") + expected := errors.New("somesuch") + assert.Assert(t, errors.Is(Escape(span, expected), expected), + "expected to unwrap the original error") + span.End() + + spans := recorder.Ended() + assert.Equal(t, len(spans), 2) + assert.Equal(t, len(spans[1].Events()), 1, "expected one event") + + event := spans[1].Events()[0] + assert.Equal(t, event.Name, semconv.ExceptionEventName) + + attrs := event.Attributes + assert.Equal(t, len(attrs), 3) + assert.Equal(t, string(attrs[0].Key), "exception.escaped") + assert.Equal(t, string(attrs[1].Key), "exception.type") + assert.Equal(t, string(attrs[2].Key), "exception.message") + assert.Equal(t, attrs[0].Value.AsInterface(), true) + assert.Equal(t, attrs[1].Value.AsInterface(), "*errors.errorString") + assert.Equal(t, attrs[2].Value.AsInterface(), "somesuch") + } +}
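
For context, a minimal usage sketch of the two new helpers follows. It is not part of the patch; the functions doLookup and doWork are placeholders standing in for real operations, and the example assumes the tracing.Start and tracing.Span definitions referenced elsewhere in this patch.

    package example

    import (
    	"context"
    	"errors"

    	"github.com/crunchydata/postgres-operator/internal/tracing"
    )

    // lookupThing shows tracing.Check: the error is recorded on the span as an
    // exception event, then handled locally with a fallback value.
    func lookupThing(ctx context.Context) string {
    	ctx, span := tracing.Start(ctx, "lookup-thing")
    	defer span.End()

    	if value, err := doLookup(ctx); tracing.Check(span, err) {
    		return value
    	}
    	return "fallback"
    }

    // doThing shows tracing.Escape: the error is recorded on the span with
    // exception.escaped=true and returned to the caller unchanged.
    func doThing(ctx context.Context) error {
    	ctx, span := tracing.Start(ctx, "do-thing")
    	defer span.End()

    	return tracing.Escape(span, doWork(ctx))
    }

    // Placeholders for real operations.
    func doLookup(context.Context) (string, error) { return "", errors.New("not found") }
    func doWork(context.Context) error             { return nil }

As the pgupgrade controller change above shows, Escape also works in a deferred call against a named error return, so a single line can cover every return path of a function.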