From d01fa60625ab0fb89ff7fc4caea1330a44bc7afb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9e?= Date: Fri, 20 Dec 2024 15:48:09 +0000 Subject: [PATCH] Migrate histogram metrics to `{f,u}64_histogram!` (#6356) --- .../maint_renee_migrate_metrics_histograms.md | 5 ++ apollo-router/src/cache/storage.rs | 54 +++++++------ apollo-router/src/graphql/request.rs | 16 ++-- apollo-router/src/plugins/cache/metrics.rs | 20 +++-- .../src/plugins/coprocessor/execution.rs | 15 +--- apollo-router/src/plugins/coprocessor/mod.rs | 37 ++++----- .../src/plugins/coprocessor/supergraph.rs | 15 +--- apollo-router/src/plugins/telemetry/utils.rs | 6 +- .../query_planner/caching_query_planner.rs | 6 +- .../src/services/subgraph_service.rs | 9 ++- apollo-router/src/uplink/mod.rs | 79 +++++++++---------- .../instrumentation/standard-instruments.mdx | 2 +- 12 files changed, 138 insertions(+), 126 deletions(-) create mode 100644 .changesets/maint_renee_migrate_metrics_histograms.md diff --git a/.changesets/maint_renee_migrate_metrics_histograms.md b/.changesets/maint_renee_migrate_metrics_histograms.md new file mode 100644 index 0000000000..b78b99d872 --- /dev/null +++ b/.changesets/maint_renee_migrate_metrics_histograms.md @@ -0,0 +1,5 @@ +### Migrate histogram metrics to `{f,u}64_histogram!` ([PR #6356](https://github.com/apollographql/router/pull/6356)) + +Updates histogram metrics using the legacy `tracing::info!(histogram.*)` syntax to the new metrics macros. + +By [@goto-bus-stop](https://github.com/goto-bus-stop) in https://github.com/apollographql/router/pull/6356 \ No newline at end of file diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index 4408d24c5c..d7e3079441 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -170,6 +170,14 @@ where match res { Some(v) => { + let duration = instant_memory.elapsed(); + f64_histogram!( + "apollo_router_cache_hit_time", + "Time to get a value from the cache in seconds", + duration.as_secs_f64(), + kind = self.caller, + storage = CacheStorageName::Memory.to_string() + ); u64_counter!( "apollo_router_cache_hit_count", "Number of cache hits", @@ -177,20 +185,16 @@ where kind = self.caller, storage = CacheStorageName::Memory.to_string() ); - let duration = instant_memory.elapsed().as_secs_f64(); - tracing::info!( - histogram.apollo_router_cache_hit_time = duration, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Memory), - ); Some(v) } None => { - let duration = instant_memory.elapsed().as_secs_f64(); - tracing::info!( - histogram.apollo_router_cache_miss_time = duration, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Memory), + let duration = instant_memory.elapsed(); + f64_histogram!( + "apollo_router_cache_miss_time", + "Time to check the cache for an uncached value in seconds", + duration.as_secs_f64(), + kind = self.caller, + storage = CacheStorageName::Memory.to_string() ); u64_counter!( "apollo_router_cache_miss_count", @@ -218,6 +222,14 @@ where Some(v) => { self.insert_in_memory(key.clone(), v.0.clone()).await; + let duration = instant_redis.elapsed(); + f64_histogram!( + "apollo_router_cache_hit_time", + "Time to get a value from the cache in seconds", + duration.as_secs_f64(), + kind = self.caller, + storage = CacheStorageName::Redis.to_string() + ); u64_counter!( "apollo_router_cache_hit_count", "Number of cache hits", @@ -225,15 +237,17 @@ where kind = self.caller, storage = CacheStorageName::Redis.to_string() ); - let duration = instant_redis.elapsed().as_secs_f64(); - tracing::info!( - histogram.apollo_router_cache_hit_time = duration, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Redis), - ); Some(v.0) } None => { + let duration = instant_redis.elapsed(); + f64_histogram!( + "apollo_router_cache_miss_time", + "Time to check the cache for an uncached value in seconds", + duration.as_secs_f64(), + kind = self.caller, + storage = CacheStorageName::Redis.to_string() + ); u64_counter!( "apollo_router_cache_miss_count", "Number of cache misses", @@ -241,12 +255,6 @@ where kind = self.caller, storage = CacheStorageName::Redis.to_string() ); - let duration = instant_redis.elapsed().as_secs_f64(); - tracing::info!( - histogram.apollo_router_cache_miss_time = duration, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Redis), - ); None } } diff --git a/apollo-router/src/graphql/request.rs b/apollo-router/src/graphql/request.rs index fcf80f4615..e22323488a 100644 --- a/apollo-router/src/graphql/request.rs +++ b/apollo-router/src/graphql/request.rs @@ -197,9 +197,11 @@ impl Request { let mut result = Request::allocate_result_array(value); if value.is_array() { - tracing::info!( - histogram.apollo.router.operations.batching.size = result.len() as f64, - mode = %BatchingMode::BatchHttpLink // Only supported mode right now + u64_histogram!( + "apollo.router.operations.batching.size", + "Number of queries contained within each query batch", + result.len() as u64, + mode = BatchingMode::BatchHttpLink.to_string() // Only supported mode right now ); u64_counter!( @@ -226,9 +228,11 @@ impl Request { let mut result = Request::allocate_result_array(value); if value.is_array() { - tracing::info!( - histogram.apollo.router.operations.batching.size = result.len() as f64, - mode = "batch_http_link" // Only supported mode right now + u64_histogram!( + "apollo.router.operations.batching.size", + "Number of queries contained within each query batch", + result.len() as u64, + mode = BatchingMode::BatchHttpLink.to_string() // Only supported mode right now ); u64_counter!( diff --git a/apollo-router/src/plugins/cache/metrics.rs b/apollo-router/src/plugins/cache/metrics.rs index 86e802c093..caff0ee738 100644 --- a/apollo-router/src/plugins/cache/metrics.rs +++ b/apollo-router/src/plugins/cache/metrics.rs @@ -244,15 +244,21 @@ impl CacheCounter { for (typename, (cache_hit, total_entities)) in seen.into_iter() { if separate_metrics_per_type { - ::tracing::info!( - histogram.apollo.router.operations.entity.cache_hit = (cache_hit as f64 / total_entities as f64) * 100f64, - entity_type = %typename, - subgraph = %subgraph_name, + f64_histogram!( + "apollo.router.operations.entity.cache_hit", + "Hit rate percentage of cached entities", + (cache_hit as f64 / total_entities as f64) * 100f64, + // Can't just `Arc::clone` these because they're `Arc`, + // while opentelemetry supports `Arc` + entity_type = typename.to_string(), + subgraph = subgraph_name.to_string() ); } else { - ::tracing::info!( - histogram.apollo.router.operations.entity.cache_hit = (cache_hit as f64 / total_entities as f64) * 100f64, - subgraph = %subgraph_name, + f64_histogram!( + "apollo.router.operations.entity.cache_hit", + "Hit rate percentage of cached entities", + (cache_hit as f64 / total_entities as f64) * 100f64, + subgraph = subgraph_name.to_string() ); } } diff --git a/apollo-router/src/plugins/coprocessor/execution.rs b/apollo-router/src/plugins/coprocessor/execution.rs index c5342db5bd..ba1b12e70d 100644 --- a/apollo-router/src/plugins/coprocessor/execution.rs +++ b/apollo-router/src/plugins/coprocessor/execution.rs @@ -10,7 +10,6 @@ use tower::BoxError; use tower::ServiceBuilder; use tower_service::Service; -use super::externalize_header_map; use super::*; use crate::graphql; use crate::layers::async_checkpoint::OneShotAsyncCheckpointLayer; @@ -235,12 +234,9 @@ where let guard = request.context.enter_active_request(); let start = Instant::now(); let co_processor_result = payload.call(http_client, &coprocessor_url).await; - let duration = start.elapsed().as_secs_f64(); + let duration = start.elapsed(); drop(guard); - tracing::info!( - histogram.apollo.router.operations.coprocessor.duration = duration, - coprocessor.stage = %PipelineStep::ExecutionRequest, - ); + record_coprocessor_duration(PipelineStep::ExecutionRequest, duration); tracing::debug!(?co_processor_result, "co-processor returned"); let co_processor_output = co_processor_result?; @@ -381,12 +377,9 @@ where let guard = response.context.enter_active_request(); let start = Instant::now(); let co_processor_result = payload.call(http_client.clone(), &coprocessor_url).await; - let duration = start.elapsed().as_secs_f64(); + let duration = start.elapsed(); drop(guard); - tracing::info!( - histogram.apollo.router.operations.coprocessor.duration = duration, - coprocessor.stage = %PipelineStep::ExecutionResponse, - ); + record_coprocessor_duration(PipelineStep::ExecutionResponse, duration); tracing::debug!(?co_processor_result, "co-processor returned"); let co_processor_output = co_processor_result?; diff --git a/apollo-router/src/plugins/coprocessor/mod.rs b/apollo-router/src/plugins/coprocessor/mod.rs index cbc5b88213..d2cf150edf 100644 --- a/apollo-router/src/plugins/coprocessor/mod.rs +++ b/apollo-router/src/plugins/coprocessor/mod.rs @@ -343,6 +343,15 @@ fn default_timeout() -> Duration { DEFAULT_EXTERNALIZATION_TIMEOUT } +fn record_coprocessor_duration(stage: PipelineStep, duration: Duration) { + f64_histogram!( + "apollo.router.operations.coprocessor.duration", + "Time spent waiting for the coprocessor to answer, in seconds", + duration.as_secs_f64(), + coprocessor.stage = stage.to_string() + ); +} + #[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize, JsonSchema)] #[serde(default)] pub(super) struct RouterStage { @@ -671,12 +680,9 @@ where let guard = request.context.enter_active_request(); let start = Instant::now(); let co_processor_result = payload.call(http_client, &coprocessor_url).await; - let duration = start.elapsed().as_secs_f64(); + let duration = start.elapsed(); drop(guard); - tracing::info!( - histogram.apollo.router.operations.coprocessor.duration = duration, - coprocessor.stage = %PipelineStep::RouterRequest, - ); + record_coprocessor_duration(PipelineStep::RouterRequest, duration); tracing::debug!(?co_processor_result, "co-processor returned"); let mut co_processor_output = co_processor_result?; @@ -846,12 +852,9 @@ where let guard = response.context.enter_active_request(); let start = Instant::now(); let co_processor_result = payload.call(http_client.clone(), &coprocessor_url).await; - let duration = start.elapsed().as_secs_f64(); + let duration = start.elapsed(); drop(guard); - tracing::info!( - histogram.apollo.router.operations.coprocessor.duration = duration, - coprocessor.stage = %PipelineStep::RouterResponse, - ); + record_coprocessor_duration(PipelineStep::RouterResponse, duration); tracing::debug!(?co_processor_result, "co-processor returned"); let co_processor_output = co_processor_result?; @@ -1034,12 +1037,9 @@ where let guard = request.context.enter_active_request(); let start = Instant::now(); let co_processor_result = payload.call(http_client, &coprocessor_url).await; - let duration = start.elapsed().as_secs_f64(); + let duration = start.elapsed(); drop(guard); - tracing::info!( - histogram.apollo.router.operations.coprocessor.duration = duration, - coprocessor.stage = %PipelineStep::SubgraphRequest, - ); + record_coprocessor_duration(PipelineStep::SubgraphRequest, duration); tracing::debug!(?co_processor_result, "co-processor returned"); let co_processor_output = co_processor_result?; @@ -1193,12 +1193,9 @@ where let guard = response.context.enter_active_request(); let start = Instant::now(); let co_processor_result = payload.call(http_client, &coprocessor_url).await; - let duration = start.elapsed().as_secs_f64(); + let duration = start.elapsed(); drop(guard); - tracing::info!( - histogram.apollo.router.operations.coprocessor.duration = duration, - coprocessor.stage = %PipelineStep::SubgraphResponse, - ); + record_coprocessor_duration(PipelineStep::SubgraphResponse, duration); tracing::debug!(?co_processor_result, "co-processor returned"); let co_processor_output = co_processor_result?; diff --git a/apollo-router/src/plugins/coprocessor/supergraph.rs b/apollo-router/src/plugins/coprocessor/supergraph.rs index d5aff80b7e..63e9addb59 100644 --- a/apollo-router/src/plugins/coprocessor/supergraph.rs +++ b/apollo-router/src/plugins/coprocessor/supergraph.rs @@ -10,7 +10,6 @@ use tower::BoxError; use tower::ServiceBuilder; use tower_service::Service; -use super::externalize_header_map; use super::*; use crate::graphql; use crate::layers::async_checkpoint::OneShotAsyncCheckpointLayer; @@ -243,12 +242,9 @@ where let guard = request.context.enter_active_request(); let start = Instant::now(); let co_processor_result = payload.call(http_client, &coprocessor_url).await; - let duration = start.elapsed().as_secs_f64(); + let duration = start.elapsed(); drop(guard); - tracing::info!( - histogram.apollo.router.operations.coprocessor.duration = duration, - coprocessor.stage = %PipelineStep::SupergraphRequest, - ); + record_coprocessor_duration(PipelineStep::SupergraphRequest, duration); tracing::debug!(?co_processor_result, "co-processor returned"); let co_processor_output = co_processor_result?; @@ -397,12 +393,9 @@ where let guard = response.context.enter_active_request(); let start = Instant::now(); let co_processor_result = payload.call(http_client.clone(), &coprocessor_url).await; - let duration = start.elapsed().as_secs_f64(); + let duration = start.elapsed(); drop(guard); - tracing::info!( - histogram.apollo.router.operations.coprocessor.duration = duration, - coprocessor.stage = %PipelineStep::SupergraphResponse, - ); + record_coprocessor_duration(PipelineStep::SupergraphResponse, duration); tracing::debug!(?co_processor_result, "co-processor returned"); let co_processor_output = co_processor_result?; diff --git a/apollo-router/src/plugins/telemetry/utils.rs b/apollo-router/src/plugins/telemetry/utils.rs index 2081748dcb..471033de96 100644 --- a/apollo-router/src/plugins/telemetry/utils.rs +++ b/apollo-router/src/plugins/telemetry/utils.rs @@ -4,7 +4,11 @@ use std::time::Instant; /// Timer implementing Drop to automatically compute the duration between the moment it has been created until it's dropped ///```ignore /// Timer::new(|duration| { -/// tracing::info!(histogram.apollo_router_test = duration.as_secs_f64()); +/// f64_histogram!( +/// "apollo_router_test", +/// "Time spent testing the timer", +/// duration.as_secs_f64() +/// ); /// }) /// ``` pub(crate) struct Timer diff --git a/apollo-router/src/query_planner/caching_query_planner.rs b/apollo-router/src/query_planner/caching_query_planner.rs index e5ea4941e4..a4a36d6e9f 100644 --- a/apollo-router/src/query_planner/caching_query_planner.rs +++ b/apollo-router/src/query_planner/caching_query_planner.rs @@ -138,8 +138,10 @@ where experimental_pql_prewarm: &PersistedQueriesPrewarmQueryPlanCache, ) { let _timer = Timer::new(|duration| { - ::tracing::info!( - histogram.apollo.router.query.planning.warmup.duration = duration.as_secs_f64() + f64_histogram!( + "apollo.router.query_planning.warmup.duration", + "Time spent warming up the query planner queries in seconds", + duration.as_secs_f64() ); }); diff --git a/apollo-router/src/services/subgraph_service.rs b/apollo-router/src/services/subgraph_service.rs index 8b6954b5ce..78574f9502 100644 --- a/apollo-router/src/services/subgraph_service.rs +++ b/apollo-router/src/services/subgraph_service.rs @@ -871,9 +871,12 @@ pub(crate) async fn process_batch( let client = client_factory.create(&service); // Update our batching metrics (just before we fetch) - tracing::info!(histogram.apollo.router.operations.batching.size = listener_count as f64, - mode = %BatchingMode::BatchHttpLink, // Only supported mode right now - subgraph = &service + u64_histogram!( + "apollo.router.operations.batching.size", + "Number of queries contained within each query batch", + listener_count as u64, + mode = BatchingMode::BatchHttpLink.to_string(), // Only supported mode right now + subgraph = service.clone() ); u64_counter!( diff --git a/apollo-router/src/uplink/mod.rs b/apollo-router/src/uplink/mod.rs index 6bc9508efe..2c13b02cb7 100644 --- a/apollo-router/src/uplink/mod.rs +++ b/apollo-router/src/uplink/mod.rs @@ -345,13 +345,14 @@ where match http_request::(client, url.as_str(), request_body).await { Ok(response) => match response.data.map(Into::into) { None => { - tracing::info!( - histogram.apollo_router_uplink_fetch_duration_seconds = - now.elapsed().as_secs_f64(), - query, + f64_histogram!( + "apollo_router_uplink_fetch_duration_seconds", + "Duration of Apollo Uplink fetches.", + now.elapsed().as_secs_f64(), + query = query, url = url.to_string(), - "kind" = "uplink_error", - error = "empty response from uplink", + kind = "uplink_error", + error = "empty response from uplink" ); } Some(UplinkResponse::New { @@ -359,12 +360,13 @@ where id, delay, }) => { - tracing::info!( - histogram.apollo_router_uplink_fetch_duration_seconds = - now.elapsed().as_secs_f64(), - query, + f64_histogram!( + "apollo_router_uplink_fetch_duration_seconds", + "Duration of Apollo Uplink fetches.", + now.elapsed().as_secs_f64(), + query = query, url = url.to_string(), - "kind" = "new" + kind = "new" ); match transform_new_response(response).await { Ok(res) => { @@ -375,22 +377,19 @@ where }) } Err(err) => { - tracing::debug!( - "failed to process results of Uplink response from {}: {}. Other endpoints will be tried", - url, - err - ); + tracing::debug!("failed to process results of Uplink response from {url}: {err}. Other endpoints will be tried"); continue; } } } Some(UplinkResponse::Unchanged { id, delay }) => { - tracing::info!( - histogram.apollo_router_uplink_fetch_duration_seconds = - now.elapsed().as_secs_f64(), - query, + f64_histogram!( + "apollo_router_uplink_fetch_duration_seconds", + "Duration of Apollo Uplink fetches.", + now.elapsed().as_secs_f64(), + query = query, url = url.to_string(), - "kind" = "unchanged" + kind = "unchanged" ); return Ok(UplinkResponse::Unchanged { id, delay }); } @@ -399,14 +398,15 @@ where code, retry_later, }) => { - tracing::info!( - histogram.apollo_router_uplink_fetch_duration_seconds = - now.elapsed().as_secs_f64(), - query, + f64_histogram!( + "apollo_router_uplink_fetch_duration_seconds", + "Duration of Apollo Uplink fetches.", + now.elapsed().as_secs_f64(), + query = query, url = url.to_string(), - "kind" = "uplink_error", - error = message, - code + kind = "uplink_error", + error = message.clone(), + code = code.clone() ); return Ok(UplinkResponse::Error { message, @@ -415,21 +415,18 @@ where }); } }, - Err(e) => { - tracing::info!( - histogram.apollo_router_uplink_fetch_duration_seconds = - now.elapsed().as_secs_f64(), - query, + Err(err) => { + f64_histogram!( + "apollo_router_uplink_fetch_duration_seconds", + "Duration of Apollo Uplink fetches.", + now.elapsed().as_secs_f64(), + query = query, url = url.to_string(), - "kind" = "http_error", - error = e.to_string(), - code = e.status().unwrap_or_default().as_str() - ); - tracing::debug!( - "failed to fetch from Uplink endpoint {}: {}. Other endpoints will be tried", - url, - e + kind = "http_error", + error = err.to_string(), + code = err.status().unwrap_or_default().to_string() ); + tracing::debug!("failed to fetch from Uplink endpoint {url}: {err}. Other endpoints will be tried"); } }; } diff --git a/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx b/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx index 499e6cc043..ec9fc56327 100644 --- a/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx +++ b/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx @@ -44,7 +44,7 @@ All cache metrics listed above have the following attributes: ### Coprocessor - `apollo_router_operations_coprocessor_total` - Total operations with coprocessors enabled. -- `apollo_router_operations_coprocessor.duration` - Time spent waiting for the coprocessor to answer, in seconds. +- `apollo.router.operations.coprocessor.duration` - Time spent waiting for the coprocessor to answer, in seconds. The coprocessor operations metric has the following attributes: