From 3db282fc2d559f54f618940e05d7bef7f94cc61f Mon Sep 17 00:00:00 2001 From: David Goffredo Date: Fri, 15 Jul 2022 11:48:08 -0400 Subject: [PATCH] span sampling rules (#231) * undo non-test related removals from #218 * fix bug * add additional "sample" test * clang-format * glob.{h,cpp} * class SpanSampler, without using or testing it * small (premature?) optimization * TODO: AMEND * checkpoint: added span rules to Tracer, but not elsewhere * checkpoint: add span rules to SpanBuffer, but don't use them * checkpoint: written but untested * test rule parsing * obnoxious compiler * test rule matching * test span sampling * clang-format * test environment variables * repo-level documentation * attempt at fixing doc links * describe glob in the repo-level docs * unit tests catch bugs! * a little const correctness * code comment typo * temporary files are temporary * review: trim some docs and add an example * review: describe globs more concretely * review: make a future diff smaller --- BUILD.bazel | 2 + doc/configuration.md | 25 +- doc/sampling.md | 50 ++++ include/datadog/opentracing.h | 74 +++++- src/glob.cpp | 59 +++++ src/glob.h | 30 +++ src/pending_trace.cpp | 33 ++- src/pending_trace.h | 7 +- src/sample.cpp | 146 +++++++++++ src/sample.h | 83 +++++++ src/sampling_mechanism.h | 7 +- src/span_buffer.cpp | 14 +- src/span_buffer.h | 18 +- src/tracer.cpp | 19 +- src/tracer_factory.cpp | 3 + src/tracer_options.cpp | 57 +++++ test/CMakeLists.txt | 1 + test/glob_test.cpp | 52 ++++ test/mocks.h | 7 +- test/propagation_test.cpp | 7 +- test/sample_test.cpp | 443 +++++++++++++++++++++++++++++++++- test/span_buffer_test.cpp | 3 +- test/tracer_factory_test.cpp | 151 ++++++++++++ 23 files changed, 1253 insertions(+), 38 deletions(-) create mode 100644 src/glob.cpp create mode 100644 src/glob.h create mode 100644 test/glob_test.cpp diff --git a/BUILD.bazel b/BUILD.bazel index bfd8f338..f29fc556 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -6,6 +6,8 @@ cc_library( "src/clock.h", "src/encoder.cpp", "src/encoder.h", + "src/glob.cpp", + "src/glob.h", "src/limiter.cpp", "src/limiter.h", "src/logger.cpp", diff --git a/doc/configuration.md b/doc/configuration.md index 942bb02f..d6dca1c7 100644 --- a/doc/configuration.md +++ b/doc/configuration.md @@ -112,7 +112,7 @@ specified as a JSON array of objects. For more information about the configuration of trace sampling, see [sampling.md][6]. -- **TracerOptions member**: `std::string sampling_rules` +- **TracerOptions member**: `std::string sampling_rules` _(JSON)_ - **JSON property**: `"sampling_rules"` _(array of objects)_ - **Environment variable**: `DD_TRACE_SAMPLING_RULES` _(JSON)_ - **Default value**: `[]` @@ -253,6 +253,28 @@ made, is propagated between services along the trace in the form of the order to prevent rejection by peers or other HTTP header policies. This configuration option is that limit, in bytes. +### Span Sampling Rules +Span sampling rules allow spans to be sent to Datadog that otherwise would be +dropped due to trace sampling. + +For more information about the configuration of span sampling, see the [Span +Sampling][11] section of [sampling.md][6]. + +- **TracerOptions member**: `std::string span_sampling_rules` _(JSON)_ +- **JSON property**: `"span_sampling_rules"` _(array of objects)_ +- **Environment variable**: `DD_SPAN_SAMPLING_RULES` _(JSON)_ +- **Default value**: `[]` + +### Span Sampling Rules File +Span sampling rules (see above) can be specified in their own file. The value +of the `DD_SPAN_SAMPLING_RULES_FILE` environment variable is the path to a file +whose contents are the span sampling rules JSON array. + +- **Environment variable**: `DD_SPAN_SAMPLING_RULES_FILE` + +Note that `DD_SPAN_SAMPLING_RULES_FILE` is ignored when +`DD_SPAN_SAMPLING_RULES` is also in the environment. + - **TracerOptions member**: `uint64_t tags_header_size` - **JSON property**: `tags_header_size` _(number)_ - **Environment variable**: `DD_TRACE_TAGS_PROPAGATION_MAX_LENGTH` @@ -267,3 +289,4 @@ configuration option is that limit, in bytes. [7]: https://github.com/openzipkin/b3-propagation [8]: https://pubs.opengroup.org/onlinepubs/9699919799/ [9]: https://docs.datadoghq.com/getting_started/tagging/unified_service_tagging +[11]: sampling.md#span-sampling diff --git a/doc/sampling.md b/doc/sampling.md index 9e44352a..67d3af72 100644 --- a/doc/sampling.md +++ b/doc/sampling.md @@ -120,5 +120,55 @@ This configuration option has the same meaning as the `DD_TRACE_RATE_LIMIT` environment variable. Note that the environment variable overrides the `TracerOptions` field if both are specified. +Span Sampling +------------- +Span sampling is used to select spans to keep even when the enclosing +trace is dropped. + +Similar to _trace_ sampling rules, _span_ sampling rules are configured as a +JSON array of object, where each object may contain the following properties: +``` +[{ + "service": , + "name": , + "sample_rate": , + "max_per_second": +}, ...] +``` + +The `service` and `name` are glob patterns, where "glob" here means: +- `*` matches any substring, including the empty string, +- `?` matches exactly one of any character, and +- any other character matches exactly one of itself. + +Span sampling rules are examined only when the enclosing trace is to be +dropped. + +The first span sampling rule that matches a span is used to make a span +sampling decision for that span. If the decision is "keep," then the span is +sent to Datadog despite the enclosing trace having been dropped. + +Span sampling rules can be configured [directly][3] or [in a file][4]. + +For example, consider the following span sampling rules: +```shell +export DD_SPAN_SAMPLING_RULES='[ + {"service": "router", "name": "rack.request", "max_per_second": 2000}, + {"service": "classic-mysql", "name": "mysql2.*"}, + {"service": "authn?", "sample_rate": 0.5} +]' +``` +These rules state: + +- When a trace is dropped, keep spans whose service name is `router` and whose + operation name is `rack.request`, but keep at most 2000 such spans per + second. +- When a trace is dropped, keep spans whose service name is `classic-mysql` and + whose operation name begins with `mysql2.`. +- When a trace is dropped, keep 50% of spans whose service name is `authn` + followed by another character, e.g. `authny`, `authnj`. + [1]: https://docs.datadoghq.com/tracing/trace_ingestion/mechanisms/?tab=environmentvariables#in-the-agent [2]: https://docs.datadoghq.com/tracing/setup_overview/proxy_setup/?tab=nginx +[3]: configuration.md#span-sampling-rules +[4]: configuration.md#span-sampling-rules-file diff --git a/include/datadog/opentracing.h b/include/datadog/opentracing.h index 3283f2f5..08d3661d 100644 --- a/include/datadog/opentracing.h +++ b/include/datadog/opentracing.h @@ -76,19 +76,19 @@ struct TracerOptions { double sample_rate = std::nan(""); // This option is deprecated, and may be removed in future releases. bool priority_sampling = true; - // Rules sampling is applied when initiating traces to determine the sampling - // rate. Configuration is specified as a JSON array of objects. Each object - // must have a "sample_rate", while the "name" and "service" fields are - // optional. The "sample_rate" value must be between 0.0 and 1.0 (inclusive). - // Rules are checked in order, so a more specific rule should be specified - // before a less specific rule. Note that if the `sample_rate` field of this - // `TracerOptions` has a non-NaN value, then there is an implicit rule at the - // end of the list that matches any trace unmatched by other rules, and - // applies a sampling rate of `sample_rate`. If no rule matches a trace, - // then "priority sampling" is applied instead, where the sample rate is - // determined by the Datadog trace agent. If any rules are invalid, they are - // ignored. This option is also configurable as the environment variable - // DD_TRACE_SAMPLING_RULES. + // Rule-based trace sampling is applied when initiating traces to determine + // the sampling rate. Configuration is specified as a JSON array of objects. + // Each object must have a "sample_rate", while the "name" and "service" + // fields are optional. The "sample_rate" value must be between 0.0 and 1.0 + // (inclusive). Rules are checked in order, so a more specific rule should + // be specified before a less specific rule. Note that if the `sample_rate` + // field of this `TracerOptions` has a non-NaN value, then there is an + // implicit rule at the end of the list that matches any trace unmatched by + // other rules, and applies a sampling rate of `sample_rate`. If no rule + // matches a trace, then "priority sampling" is applied instead, where the + // sample rate is determined by the Datadog trace agent. If any rules are + // invalid, they are ignored. This option is also configurable as the + // environment variable DD_TRACE_SAMPLING_RULES. std::string sampling_rules = "[]"; // Max amount of time to wait between sending traces to agent, in ms. Agent discards traces older // than 10s, so that is the upper bound. @@ -156,6 +156,54 @@ struct TracerOptions { // serialized tags allowed. Trace-wide tags whose serialized length exceeds // this limit are not propagated. uint64_t tags_header_size = 512; + // Rule-based span sampling, which is distinct from rule-based trace + // sampling, is used to determine which spans to keep, if any, when trace + // sampling decides to drop the trace. + // When the trace is to be dropped, each span is matched against the + // `span_sampling_rules`. For each span, the first rule to match, if any, + // applies to the span and a span-specific sampling decision is made. If the + // decision for the span is to keep, then the span is sent to Datadog even + // though the enclosing trace is not. + // `span_sampling_rules` is a JSON array of objects, where each object has + // the following shape: + // + // { + // "service": , + // "name": , + // "sample_rate": , + // "max_per_second": + // } + // + // The properties mean the following: + // + // - "service" is a glob pattern that must match a span's service name in + // order for the rule to match. If "service" is not specified, then its + // default value is "*". Glob patterns are described below. + // - "name" is a glob pattern that must match a span's operation name in + // order for the rule to match. If "name" is not specified, then its default + // value is "*". Glob patterns are described below. + // - "sample_rate" is the probability that a span matching the rule will be + // kept. If "sample_rate" is not specified, then its default value is 1.0. + // - "max_per_second" is the maximum number of spans that will be kept on + // account of this rule each second. Spans that would cause the limit to + // be exceeded are dropped. If "max_per_second" is not specified, then + // there is no limit. + // + // Glob patterns are a simplified form of regular expressions. Certain + // characters in a glob pattern have special meaning: + // + // - "*" matches any substring, including the empty string. + // - "?" matches exactly one instance of any character. + // - Other characters match exactly one instance of themselves. + // + // For example: + // + // - The glob pattern "foobar" is matched by "foobar" only. + // - The glob pattern "foo*" is matched by "foobar", "foo", and "fooop", but + // not by "fond". + // - The glob pattern "a?b*e*" is matched by "amble" and "albedo", but not by + // "albino". + std::string span_sampling_rules = "[]"; }; // TraceEncoder exposes the data required to encode and submit traces to the diff --git a/src/glob.cpp b/src/glob.cpp new file mode 100644 index 00000000..46bb61d4 --- /dev/null +++ b/src/glob.cpp @@ -0,0 +1,59 @@ +#include "glob.h" + +#include + +namespace datadog { +namespace opentracing { + +bool glob_match(ot::string_view pattern, ot::string_view subject) { + // This is a backtracking implementation of the glob matching algorithm. + // The glob pattern language supports `*` and `?`, but no escape sequences. + // + // Based off of a Go example in accessed + // February 3, 2022. + + using Index = std::size_t; + Index p = 0; // [p]attern index + Index s = 0; // [s]ubject index + Index next_p = 0; // next [p]attern index + Index next_s = 0; // next [s]ubject index + + while (p < pattern.size() || s < subject.size()) { + if (p < pattern.size()) { + const char pattern_char = pattern[p]; + switch (pattern_char) { + case '*': + // Try to match at `s`. If that doesn't work out, restart at + // `s + 1` next. + next_p = p; + next_s = s + 1; + ++p; + continue; + case '?': + if (s < subject.size()) { + ++p; + ++s; + continue; + } + break; + default: + if (s < subject.size() && subject[s] == pattern_char) { + ++p; + ++s; + continue; + } + } + } + // Mismatch. Maybe restart. + if (0 < next_s && next_s <= subject.size()) { + p = next_p; + s = next_s; + continue; + } + return false; + } + return true; +} + +} // namespace opentracing +} // namespace datadog diff --git a/src/glob.h b/src/glob.h new file mode 100644 index 00000000..46ff5cbd --- /dev/null +++ b/src/glob.h @@ -0,0 +1,30 @@ +#ifndef DD_OPENTRACING_GLOB_H +#define DD_OPENTRACING_GLOB_H + +// This component provides a string matching function, `glob_match`, that +// returns whether a specified string matches a specified pattern, where the +// pattern language is the following: +// +// - "*" matches any contiguous substring, including the empty string. +// - "?" matches exactly one instance of any character. +// - Other characters match exactly one instance of themselves. +// +// The patterns are here called "glob patterns," though they are different from +// the patterns used in Unix shells. + +#include + +namespace ot = opentracing; + +namespace datadog { +namespace opentracing { + +// Return whether the specified `subject` matches the specified glob `pattern`, +// i.e. whether `subject` is a member of the set of strings represented by the +// glob `pattern`. +bool glob_match(ot::string_view pattern, ot::string_view subject); + +} // namespace opentracing +} // namespace datadog + +#endif diff --git a/src/pending_trace.cpp b/src/pending_trace.cpp index a7197554..6aa52941 100644 --- a/src/pending_trace.cpp +++ b/src/pending_trace.cpp @@ -17,6 +17,9 @@ const std::string event_sample_rate_metric = "_dd1.sr.eausr"; const std::string rules_sampler_applied_rate = "_dd.rule_psr"; const std::string rules_sampler_limiter_rate = "_dd.limit_psr"; const std::string priority_sampler_applied_rate = "_dd.agent_psr"; +const std::string span_sampling_mechanism = "_dd.span_sampling.mechanism"; +const std::string span_sampling_rule_rate = "_dd.span_sampling.rule_rate"; +const std::string span_sampling_limit = "_dd.span_sampling.max_per_second"; // Return whether the specified `span` is without a parent among the specified // `all_spans_in_trace`. @@ -71,6 +74,25 @@ void finish_root_span(PendingTrace& trace, SpanData& span) { finish_span(trace, span); } +// Determine whether the specified `span` matches a rule in the specified +// `span_sampler` and the sampling decision of that rule is to keep the `span`. +// If so, then add appropriate tags to `span`. +void apply_span_sampling(SpanSampler& span_sampler, SpanData& span) { + SpanSampler::Rule* const rule = span_sampler.match(span); + if (!rule || !rule->sample(span)) { + return; + } + + // The span matched a span rule, and the rule decided to keep the span. + // Add span-sampling-specific tags to the span. + span.metrics[span_sampling_mechanism] = int(SamplingMechanism::SpanRule); + span.metrics[span_sampling_rule_rate] = rule->config().sample_rate; + const double limit = rule->config().max_per_second; + if (!std::isnan(limit)) { + span.metrics[span_sampling_limit] = limit; + } +} + } // namespace PendingTrace::PendingTrace(std::shared_ptr logger, uint64_t trace_id) @@ -87,7 +109,7 @@ PendingTrace::PendingTrace(std::shared_ptr logger, uint64_t trace_ all_spans(), sampling_priority(std::move(sampling_priority)) {} -void PendingTrace::finish() { +void PendingTrace::finish(SpanSampler* span_sampler) { // Apply changes to spans, in particular treating the root / local-root // span as special. for (const auto& span : *finished_spans) { @@ -97,6 +119,15 @@ void PendingTrace::finish() { finish_span(*this, *span); } } + + // If we have span sampling rules and are dropping the trace, see if any + // span sampling tags need to be added. + if (span_sampler && !span_sampler->rules().empty() && sampling_priority && + int(*sampling_priority) <= 0) { + for (const auto& span : *finished_spans) { + apply_span_sampling(*span_sampler, *span); + } + } } void PendingTrace::applySamplingDecisionToTraceTags() { diff --git a/src/pending_trace.h b/src/pending_trace.h index 87754654..dccf9eea 100644 --- a/src/pending_trace.h +++ b/src/pending_trace.h @@ -25,7 +25,12 @@ struct PendingTrace { PendingTrace(std::shared_ptr logger, uint64_t trace_id, std::unique_ptr sampling_priority); - void finish(); + // Modify span tags in order to prepare this trace for serialization. Use + // the optionally specified `span_sampler` to identify spans to keep should + // this trace be dropped. If `span_sampler` is `nullptr`, then no span + // sampling is performed. + void finish(SpanSampler *span_sampler = nullptr); + // If this tracer did not inherit a sampling decision from an upstream // service, but instead made a sampling decision, then record that decision // in the "_dd.p.dm" member of `trace_tags`. diff --git a/src/sample.cpp b/src/sample.cpp index caee98dc..01ab5d13 100644 --- a/src/sample.cpp +++ b/src/sample.cpp @@ -1,7 +1,15 @@ #include "sample.h" +#include +#include +#include #include +#include "clock.h" +#include "glob.h" +#include "logger.h" +#include "span.h" + namespace datadog { namespace opentracing { @@ -135,5 +143,143 @@ RuleResult RulesSampler::match(const std::string& service, const std::string& na void RulesSampler::updatePrioritySampler(json config) { priority_sampler_.configure(config); } +SpanSampler::Rule::Config::Config() + : service_pattern("*"), + operation_name_pattern("*"), + sample_rate(1.0), + max_per_second(std::nan("")), + text() {} + +SpanSampler::Rule::Rule(const SpanSampler::Rule::Config& config, TimeProvider clock) + : config_(config) { + if (!std::isnan(config.max_per_second)) { + limiter_ = std::make_unique(clock, config.max_per_second); + } +} + +bool SpanSampler::Rule::match(const SpanData& span) const { + const auto is_match = [](const std::string& pattern, const std::string& subject) { + // Since "*" is the default pattern, optimize for that case. + return pattern == "*" || glob_match(pattern, subject); + }; + + return is_match(config_.service_pattern, span.service) && + is_match(config_.operation_name_pattern, span.name); +} + +bool SpanSampler::Rule::sample(const SpanData& span) { return roll(span) && allow(); } + +bool SpanSampler::Rule::roll(const SpanData& span) const { + const uint64_t max_hash = maxIdFromSampleRate(config_.sample_rate); + // Use the span ID (not the trace ID), so that rolls can differ among spans + // within the same trace (given the same sample rate). + const uint64_t hashed_id = span.span_id * constant_rate_hash_factor; + return hashed_id < max_hash; +} + +bool SpanSampler::Rule::allow() { + if (!limiter_) { + return true; + } + + return limiter_->allow().allowed; +} + +const SpanSampler::Rule::Config& SpanSampler::Rule::config() const { return config_; } + +void SpanSampler::configure(ot::string_view raw_json, const Logger& logger, TimeProvider clock) { + rules_.clear(); + + // `raw_json` is expected to be a JSON array of objects, where each object + // configures a `SpanSampler::Rule`. + try { + const auto log_invalid_json = [&](const std::string& description, const json& object) { + logger.Log(LogLevel::error, description + ": " + object.dump()); + }; + + const json config_json = json::parse(raw_json); + + for (const auto& item : config_json.items()) { + const auto rule_json = item.value(); + + if (!rule_json.is_object()) { + log_invalid_json("span sampler: unexpected element type in rules array", rule_json); + continue; + } + + // Default values are enforced by the `Rule::Config` constructor. + Rule::Config config; + + if (rule_json.contains("service")) { + if (!rule_json.at("service").is_string()) { + log_invalid_json("span sampler: invalid type for 'service' (expected string)", + rule_json); + continue; + } + config.service_pattern = rule_json.at("service").get(); + } + + if (rule_json.contains("name")) { + if (!rule_json.at("name").is_string()) { + log_invalid_json("span sampler: invalid type for 'name' (expected string)", rule_json); + continue; + } + config.operation_name_pattern = rule_json.at("name").get(); + } + + if (rule_json.contains("sample_rate")) { + if (!rule_json.at("sample_rate").is_number()) { + log_invalid_json("span sampler: invalid type for 'sample_rate' (expected number)", + rule_json); + continue; + } + const double sample_rate = rule_json.at("sample_rate").get(); + if (!(sample_rate >= 0.0 && sample_rate <= 1.0)) { + log_invalid_json( + "span sampler: invalid value for 'sample_rate' (expected value between 0.0 and 1.0)", + rule_json); + continue; + } + config.sample_rate = sample_rate; + } + + if (rule_json.contains("max_per_second")) { + if (!rule_json.at("max_per_second").is_number()) { + log_invalid_json("span sampler: invalid type for 'max_per_second' (expected number)", + rule_json); + continue; + } + const double max_per_second = rule_json.at("max_per_second").get(); + if (max_per_second <= 0) { + log_invalid_json( + "span sampler: invalid value for 'max_per_second' (expected positive value)", + rule_json); + continue; + } + config.max_per_second = max_per_second; + } + + config.text = rule_json.dump(); + rules_.emplace_back(config, clock); + } + } catch (const json::parse_error& error) { + std::string message; + message += "span sampler: unable to parse JSON config: "; + message += error.what(); + logger.Log(LogLevel::error, message); + } +} + +SpanSampler::Rule* SpanSampler::match(const SpanData& span) { + const auto found = std::find_if(rules_.begin(), rules_.end(), + [&](const Rule& rule) { return rule.match(span); }); + if (found == rules_.end()) { + return nullptr; + } + return &*found; +} + +const std::vector& SpanSampler::rules() const { return rules_; } + } // namespace opentracing } // namespace datadog diff --git a/src/sample.h b/src/sample.h index 5bfeed35..0439c4c6 100644 --- a/src/sample.h +++ b/src/sample.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -86,6 +87,88 @@ class RulesSampler { PrioritySampler priority_sampler_; }; +class Logger; +struct SpanData; + +// `SpanSampler` is consulted for each span, but only after another sampler has +// decided that the _trace_ will be dropped (i.e. sampling priority <= 0). +// Span sampling might select individual spans to be kept anyway, based on +// separately configured rules (`DD_SPAN_SAMPLING_RULES`). +// +// Configure `SpanSampling` by calling the `configure` member function. Then, +// see if a span matches one of the configured rules by calling the `match` +// member function. If the span matched a rule, then a pointer the rule is +// returned. The rule's `sample` member function then determines whether the +// span is kept on account of the rule. +class SpanSampler { + public: + // `Rule` contains the configuration parsed from a span sampling rule, as + // well as the associated rate limiter if so configured. + class Rule { + public: + // `Config` contains the parsed configuration for a span sampling rule, as + // well as the original text of the JSON configuration particular to the + // rule. + struct Config { + std::string service_pattern; // glob pattern + std::string operation_name_pattern; // glob pattern + double sample_rate; // never NaN + double max_per_second; // NaN if there is no max + std::string text; // as the rule appeared in the JSON array + + Config(); + }; + + private: + Config config_; + std::unique_ptr limiter_; + + public: + // Create a rule having the specified 'config'. If `config` contains a + // non-NaN `max_per_second`, then configure the rule's limiter accordingly + // with the specified `clock`. + explicit Rule(const Config& config, TimeProvider clock); + + // Return whether the specified `span` matches this rule. + bool match(const SpanData& span) const; + + // Without checking whether the specified `span` matches this rule, return + // whether `span` is kept by the rule. + bool sample(const SpanData& span); + + // Return this rule's configuration. + const Config& config() const; + + private: + // Without checking whether the specified `span` matches this rule, and + // without consulting the limiter, return whether `span` is kept on account + // of this rule. + bool roll(const SpanData& span) const; + + // Return whether another span is permitted past this rule's rate limiter. + // If there is no rate limiter associated with this rule, then this + // function always returns `true`. + bool allow(); + }; + + private: + std::vector rules_; + + public: + // Overwrite this sampler's rules with those parsed from the specified + // `raw_json` configuration text. Use the specified `clock` for rate + // limiting. If an error occurs, skip the offending rule and emit a + // diagnostic using the specified `logger`. + void configure(ot::string_view raw_json, const Logger& logger, TimeProvider clock); + + // Return a pointer to the first rule that the specified `span` matches, or + // return `nullptr` if `span` does not match any configured rule. + Rule* match(const SpanData& span); + + // Return this sampler's rules. + const std::vector& rules() const; +}; + } // namespace opentracing } // namespace datadog diff --git a/src/sampling_mechanism.h b/src/sampling_mechanism.h index f0cfc293..eec501b5 100644 --- a/src/sampling_mechanism.h +++ b/src/sampling_mechanism.h @@ -59,7 +59,12 @@ enum class SamplingMechanism { // Reserved for future use. AppSec = 5, // Reserved for future use. - RemoteRateUserDefined = 6 + RemoteRateUserDefined = 6, + // Reserved for future use. + RemoteRateEmergency = 7, + // Individual span kept by a matching span sampling rule when the enclosing + // trace was dropped. + SpanRule = 8, }; // `OptionalSamplingMechanism` is either a `SamplingMechanism` or "empty," diff --git a/src/span_buffer.cpp b/src/span_buffer.cpp index bd72ef5e..073e71b8 100644 --- a/src/span_buffer.cpp +++ b/src/span_buffer.cpp @@ -9,8 +9,13 @@ namespace datadog { namespace opentracing { SpanBuffer::SpanBuffer(std::shared_ptr logger, std::shared_ptr writer, - std::shared_ptr sampler, SpanBufferOptions options) - : logger_(logger), writer_(writer), sampler_(sampler), options_(options) {} + std::shared_ptr trace_sampler, + std::shared_ptr span_sampler, SpanBufferOptions options) + : logger_(logger), + writer_(writer), + trace_sampler_(trace_sampler), + span_sampler_(span_sampler), + options_(options) {} void SpanBuffer::registerSpan(const SpanContext& context) { std::lock_guard lock_guard{mutex_}; @@ -52,7 +57,7 @@ void SpanBuffer::finishSpan(std::unique_ptr span) { trace.finished_spans->push_back(std::move(span)); if (trace.finished_spans->size() == trace.all_spans.size()) { generateSamplingPriorityImpl(trace.finished_spans->back().get()); - trace.finish(); + trace.finish(span_sampler_.get()); unbufferAndWriteTrace(trace_id); } } @@ -171,7 +176,8 @@ OptionalSamplingPriority SpanBuffer::generateSamplingPriorityImpl(const SpanData // Consult the sampler for a decision, save the decision, and then return the // saved decision. - auto sampler_result = sampler_->sample(span->env(), span->service, span->name, span->trace_id); + auto sampler_result = + trace_sampler_->sample(span->env(), span->service, span->name, span->trace_id); setSamplerResult(span->trace_id, sampler_result); setSamplingPriorityFromSampler(span->trace_id, sampler_result); return getSamplingPriorityImpl(span->trace_id); diff --git a/src/span_buffer.h b/src/span_buffer.h index 3f0be003..bb942e08 100644 --- a/src/span_buffer.h +++ b/src/span_buffer.h @@ -18,6 +18,7 @@ namespace opentracing { class Writer; class SpanContext; +class SpanSampler; struct SpanBufferOptions { bool enabled = true; @@ -32,8 +33,20 @@ struct SpanBufferOptions { // traces to a Writer. class SpanBuffer { public: + // Create a span buffer that: + // + // - uses the specified `logger` to log diagnostics, + // - uses the specified `writer` to output completed trace segments, + // - uses the specified `trace_sampler` to make decisions about whether to keep traces, + // - uses the specified `span_sampler` to make decisions about whether to keep spans when a trace + // is dropped, + // - is configured using the specified `options`. + // + // If `span_sampler` is `nullptr`, then span sampling is disabled (but + // `trace_sampler` is still consulted for trace sampling decisions). SpanBuffer(std::shared_ptr logger, std::shared_ptr writer, - std::shared_ptr sampler, SpanBufferOptions options); + std::shared_ptr trace_sampler, + std::shared_ptr span_sampler, SpanBufferOptions options); virtual ~SpanBuffer() = default; void registerSpan(const SpanContext& context); @@ -109,7 +122,8 @@ class SpanBuffer { std::shared_ptr logger_; std::shared_ptr writer_; mutable std::mutex mutex_; - std::shared_ptr sampler_; + std::shared_ptr trace_sampler_; + std::shared_ptr span_sampler_; protected: // Exists to make it easy for a subclass (ie, our testing mock) to override on-trace-finish diff --git a/src/tracer.cpp b/src/tracer.cpp index f2527c8e..dfd81d6e 100644 --- a/src/tracer.cpp +++ b/src/tracer.cpp @@ -11,8 +11,8 @@ #include #endif +#include #include -#include #include #include @@ -137,7 +137,7 @@ uint64_t traceTagsPropagationMaxLength(const TracerOptions &options, const Logge void Tracer::configureRulesSampler(std::shared_ptr sampler) noexcept { try { auto log_invalid_json = [&](const std::string &description, json &object) { - logger_->Log(LogLevel::info, description + ": " + object.get()); + logger_->Log(LogLevel::error, description + ": " + object.dump()); }; json config = json::parse(opts_.sampling_rules); for (auto &item : config.items()) { @@ -160,6 +160,7 @@ void Tracer::configureRulesSampler(std::shared_ptr sampler) noexce log_invalid_json( "rules sampler: invalid value for sample rate (expected value between 0.0 and 1.0)", rule); + continue; } // "service" and "name" are optional bool has_service = rule.contains("service") && rule.at("service").is_string(); @@ -223,16 +224,19 @@ Tracer::Tracer(TracerOptions options, std::shared_ptr buffer, TimePr legacy_obfuscation_(legacyObfuscationEnabled()) {} Tracer::Tracer(TracerOptions options, std::shared_ptr writer, - std::shared_ptr sampler, std::shared_ptr logger) + std::shared_ptr trace_sampler, std::shared_ptr logger) : logger_(logger), opts_(options), get_time_(getRealTime), get_id_(getId), legacy_obfuscation_(legacyObfuscationEnabled()) { - configureRulesSampler(sampler); + assert(logger_); + configureRulesSampler(trace_sampler); + auto span_sampler = std::make_shared(); + span_sampler->configure(opts_.span_sampling_rules, *logger_, get_time_); startupLog(options); buffer_ = std::make_shared( - logger_, writer, sampler, + logger_, writer, trace_sampler, span_sampler, SpanBufferOptions{isEnabled(), reportingHostname(options), analyticsRate(options), options.service, traceTagsPropagationMaxLength(options, *logger_)}); } @@ -285,8 +289,9 @@ std::unique_ptr Tracer::StartSpanWithOptions(ot::string_view operation span->SetTag(tag.first, tag.second); } return span; -} catch (const std::bad_alloc &) { - // At least don't crash. +} catch (const std::exception &error) { + logger_->Log(LogLevel::error, + std::string("Unexpected error in StartSpanWithOptions: ") + error.what()); return nullptr; } diff --git a/src/tracer_factory.cpp b/src/tracer_factory.cpp index 5c899e0d..09f0122c 100644 --- a/src/tracer_factory.cpp +++ b/src/tracer_factory.cpp @@ -93,6 +93,9 @@ ot::expected optionsFromConfig(const char *configuration, if (config.find("tags_header_size") != config.end()) { config.at("tags_header_size").get_to(options.tags_header_size); } + if (config.find("span_sampling_rules") != config.end()) { + options.span_sampling_rules = config.at("span_sampling_rules").dump(); + } } catch (const nlohmann::detail::type_error &) { error_message = "configuration has an argument with an incorrect type"; return ot::make_unexpected(std::make_error_code(std::errc::invalid_argument)); diff --git a/src/tracer_options.cpp b/src/tracer_options.cpp index da44940e..6ce0c615 100644 --- a/src/tracer_options.cpp +++ b/src/tracer_options.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -11,6 +12,7 @@ #include #include "bool.h" +#include "logger.h" namespace ot = opentracing; @@ -114,6 +116,59 @@ ot::expected parseDouble(const std::string &text, double mi return ot::make_unexpected("not within the range of a double: " + text); } +// Update `options.span_sampling_rules` if there are relevant environment +// variables defined. If an error occurs, log a diagnostic via +// `options.log_func`. +void applySpanSamplingRulesFromEnvironment(TracerOptions &options) { + // Prefer DD_SPAN_SAMPLING_RULES, if present. + // Next, prefer DD_SPAN_SAMPLING_RULES_FILE. + // If both are specified, log an error and use DD_SPAN_SAMPLING_RULES. + // If neither are specified, use `options.span_sampling_rules`. + + const auto logger_ptr = makeLogger(options); + const auto &logger = *logger_ptr; + + const char *const span_rules = std::getenv("DD_SPAN_SAMPLING_RULES"); + const char *const span_rules_file = std::getenv("DD_SPAN_SAMPLING_RULES_FILE"); + if (span_rules) { + options.span_sampling_rules = span_rules; + if (span_rules_file) { + logger.Log(LogLevel::error, + "Both DD_SPAN_SAMPLING_RULES and DD_SPAN_SAMPLING_RULES_FILE have values in the " + "environment. DD_SPAN_SAMPLING_RULES will be used, and " + "DD_SPAN_SAMPLING_RULES_FILE will be ignored."); + } + return; + } + + if (span_rules_file) { + const auto log_file_error = [&](const char *operation) { + std::string message; + message += "Unable to "; + message += operation; + message += " file \""; + message += span_rules_file; + message += "\" specified as value of environment variable DD_SPAN_SAMPLING_RULES_FILE."; + logger.Log(LogLevel::error, message); + }; + + std::ifstream file(span_rules_file); + if (!file) { + log_file_error("open"); + return; + } + + std::stringstream span_rules; + span_rules << file.rdbuf(); + if (!file) { + log_file_error("read"); + return; + } + + options.span_sampling_rules = span_rules.str(); + } +} + } // namespace ot::expected> asPropagationStyle( @@ -273,6 +328,8 @@ ot::expected applyTracerOptionsFromEnvironment( opts.sample_rate = maybe_value.value(); } + applySpanSamplingRulesFromEnvironment(opts); + return opts; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 42592164..67d0e22b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -24,3 +24,4 @@ _datadog_test(tracer_options_test tracer_options_test.cpp) _datadog_test(tracer_test tracer_test.cpp) _datadog_test(limiter_test limiter_test.cpp) _datadog_test(logger_test logger_test.cpp) +_datadog_test(glob_test glob_test.cpp) diff --git a/test/glob_test.cpp b/test/glob_test.cpp new file mode 100644 index 00000000..973a9322 --- /dev/null +++ b/test/glob_test.cpp @@ -0,0 +1,52 @@ +// This test covers the glob-style string pattern matching function, +// `glob_match`, defined in `glob.h`. + +#include "../src/glob.h" + +#include + +using namespace datadog::opentracing; + +TEST_CASE("glob") { + struct TestCase { + ot::string_view pattern; + ot::string_view subject; + bool expected; + }; + + auto test_case = GENERATE(values( + {// clang-format off + // from the reference implementation + // https://github.com/DataDog/tag-matching-sampling-rules/blob/master/glob.mjs + {"foo", "foo", true}, + {"foo.*", "foo.you", true}, + {"foo.*", "snafoo.", false}, + {"hi*there", "hithere", true}, + {"*stuff", "lots of stuff", true}, + {"*stuff", "stuff to think about", false}, + {"*a*a*a*a*a*a", "aaaaaaaaaaaaaaaaaaaaaaaaaax", false}, + {"*a*a*a*a*a*a", "aaaaaaaarrrrrrraaaraaarararaarararaarararaaa", true}, + + // from deliberation with Zach Groves + {"aok*", "aok**", true}, + + // question marks + {"mysql??", "mysql01", true}, + {"mysql??", "mysql1x", true}, + {"n?-ingress-*", "ny-ingress-backup", true}, + {"n?-ingress-*", "nj-ingress-leader", true}, + {"n?-ingress-*", "nj-ingress", false}, + + // edge cases + {"", "", true}, + {"", "a", false}, + {"*", "", true}, + {"?", "", false} + })); + // clang-format on + + CAPTURE(test_case.pattern); + CAPTURE(test_case.subject); + CAPTURE(test_case.expected); + REQUIRE(glob_match(test_case.pattern, test_case.subject) == test_case.expected); +} diff --git a/test/mocks.h b/test/mocks.h index fef2735f..ba3d5537 100644 --- a/test/mocks.h +++ b/test/mocks.h @@ -219,14 +219,15 @@ struct MockWriter : public Writer { struct MockBuffer : public SpanBuffer { MockBuffer() : SpanBuffer(std::make_shared(), nullptr, std::make_shared(), - SpanBufferOptions{}){}; + nullptr, SpanBufferOptions{}){}; explicit MockBuffer(std::shared_ptr sampler) - : SpanBuffer(std::make_shared(), nullptr, sampler, SpanBufferOptions{}){}; + : SpanBuffer(std::make_shared(), nullptr, sampler, nullptr, + SpanBufferOptions{}){}; // This constructor overload is provided for tests where the service name is // relevant. MockBuffer(std::shared_ptr sampler, std::string service, uint64_t tags_header_size = 512) - : SpanBuffer(std::make_shared(), nullptr, sampler, + : SpanBuffer(std::make_shared(), nullptr, sampler, nullptr, SpanBufferOptions{true, "localhost", std::nan(""), service, tags_header_size}) { } diff --git a/test/propagation_test.cpp b/test/propagation_test.cpp index 9affd582..ebc51497 100644 --- a/test/propagation_test.cpp +++ b/test/propagation_test.cpp @@ -419,7 +419,8 @@ TEST_CASE("sampling behaviour") { auto logger = std::make_shared(); auto sampler = std::make_shared(); auto writer = std::make_shared(sampler); - auto buffer = std::make_shared(logger, writer, sampler, SpanBufferOptions{}); + auto buffer = + std::make_shared(logger, writer, sampler, nullptr, SpanBufferOptions{}); TracerOptions tracer_options{"", 0, "service_name", "web"}; std::shared_ptr tracer{new Tracer{tracer_options, buffer, getRealTime, getId}}; ot::Tracer::InitGlobal(tracer); @@ -633,7 +634,8 @@ TEST_CASE("force tracing behaviour") { auto logger = std::make_shared(); auto sampler = std::make_shared(); auto writer = std::make_shared(sampler); - auto buffer = std::make_shared(logger, writer, sampler, SpanBufferOptions{}); + auto buffer = + std::make_shared(logger, writer, sampler, nullptr, SpanBufferOptions{}); TracerOptions tracer_options{"", 0, "service_name", "web"}; std::shared_ptr tracer{new Tracer{tracer_options, buffer, getRealTime, getId}}; ot::Tracer::InitGlobal(tracer); @@ -788,7 +790,6 @@ TEST_CASE("propagated Datadog tags (x-datadog-tags)") { std::cerr << "Exception thrown in test: " << error.what() << '\n'; } } - SECTION("including a 'decision maker' for us, if we make the sampling decision") { const std::string serialized_tags = "_dd.p.hello=world"; // Our sampler will make a decision. diff --git a/test/sample_test.cpp b/test/sample_test.cpp index 114f1956..53c93c03 100644 --- a/test/sample_test.cpp +++ b/test/sample_test.cpp @@ -1,13 +1,16 @@ #include "../src/sample.h" +#include #include #include +#include #include "../src/agent_writer.h" #include "../src/span.h" #include "../src/tracer.h" #include "mocks.h" using namespace datadog::opentracing; +using json = nlohmann::json; TEST_CASE("priority sampler unit test") { PrioritySampler sampler; @@ -305,8 +308,446 @@ TEST_CASE("rules sampler") { const auto tag_found = span_data.meta.find("_dd.p.dm"); REQUIRE(tag_found != span_data.meta.end()); const std::string& decision_maker = tag_found->second; - const std::string expected = "-" + std::to_string(int(SamplingMechanism::Rule)); REQUIRE(decision_maker == expected); } } + +TEST_CASE("SpanSampler rule parsing") { + MockLogger logger; + const auto dummy_clock = []() { return TimePoint(); }; + SpanSampler sampler; + + SECTION("empty array means no rules") { + sampler.configure("[]", logger, dummy_clock); + REQUIRE(sampler.rules().size() == 0); + REQUIRE(logger.records.size() == 0); + } + + SECTION("default values for rule properties") { + sampler.configure("[{}]", logger, dummy_clock); + REQUIRE(sampler.rules().size() == 1); + REQUIRE(logger.records.size() == 0); + + const SpanSampler::Rule::Config& config = sampler.rules().front().config(); + CAPTURE(config.max_per_second); + REQUIRE(std::isnan(config.max_per_second)); + REQUIRE(config.operation_name_pattern == "*"); + REQUIRE(config.sample_rate == 1.0); + REQUIRE(config.service_pattern == "*"); + REQUIRE(config.text == "{}"); + } + + SECTION("valid values for rule properties") { + const std::string rule_json = R"json({ + "service": "foosvc", + "name": "handle.thing", + "sample_rate": 0.1, + "max_per_second": 1000 + })json"; + sampler.configure("[" + rule_json + "]", logger, dummy_clock); + REQUIRE(sampler.rules().size() == 1); + REQUIRE(logger.records.size() == 0); + + const SpanSampler::Rule::Config& config = sampler.rules().front().config(); + REQUIRE(config.max_per_second == 1000); + REQUIRE(config.operation_name_pattern == "handle.thing"); + REQUIRE(config.sample_rate == 0.1); + REQUIRE(config.service_pattern == "foosvc"); + REQUIRE(json::parse(config.text) == json::parse(rule_json)); + } + + SECTION("invalid JSON yields no rules and logs error") { + auto bad_json = GENERATE(as{}, "this is not json", "[{'neither': 'is this'}]", + "[{}, {4}, {}]"); + + CAPTURE(bad_json); + sampler.configure(bad_json, logger, dummy_clock); + REQUIRE(sampler.rules().size() == 0); + REQUIRE(logger.records.size() == 1); + const auto& log_record = logger.records.front(); + REQUIRE(log_record.level == LogLevel::error); + CAPTURE(log_record.message); + REQUIRE(log_record.message.find("JSON") != std::string::npos); + } + + SECTION("invalid rules are skipped and log error") { + struct TestCase { + ot::string_view rules_json; + unsigned expected_rule_count; + ot::string_view expected_error_excerpt; + }; + + auto test_case = GENERATE(values( + {// "sample_rate" has the wrong type + {R"json([{"sample_rate": "foo"}, {}, {}, {}])json", 3, "sample_rate"}, + // "sample_rate" is out of range + {R"json([{"sample_rate": 1.2}, {}, {}, {}])json", 3, "sample_rate"}, + // "max_per_second" has the wrong type + {R"json([{}, {"max_per_second": null}, {}, {}])json", 3, "max_per_second"}, + // "max_per_second" is out of range + {R"json([{}, {"max_per_second": 0}, {}, {}])json", 3, "max_per_second"}, + // "service" has the wrong type + {R"json([{}, {}, {"service": 10}, {}])json", 3, "service"}, + // "name" has the wrong type + {R"json([{}, {}, {}, {"name": false}])json", 3, "name"}})); + + CAPTURE(test_case.rules_json); + sampler.configure(test_case.rules_json, logger, dummy_clock); + + REQUIRE(sampler.rules().size() == test_case.expected_rule_count); + + REQUIRE(logger.records.size() == 1); + const auto& log_record = logger.records.front(); + CAPTURE(log_record.message); + REQUIRE(log_record.level == LogLevel::error); + + CAPTURE(test_case.expected_error_excerpt); + REQUIRE(log_record.message.find(test_case.expected_error_excerpt) != std::string::npos); + } +} + +namespace { + +// `join({"x", "y", "z"}, " -> ") == "x -> y -> z"` +std::string join(const std::vector& pieces, ot::string_view separator) { + std::string result; + auto iter = pieces.begin(); + const auto end = pieces.end(); + if (iter != end) { + result += *iter; + for (++iter; iter != end; ++iter) { + result.append(separator.data(), separator.size()); + result += *iter; + } + } + return result; +} + +} // namespace + +TEST_CASE("SpanSampler matching") { + MockLogger logger; + const auto dummy_clock = []() { return TimePoint(); }; + SpanSampler sampler; + + const std::vector json_rules = { + R"json({"service": "mysql", "name": "exec.*", "sample_rate": 1.0})json", + R"json({"service": "mysql*", "sample_rate": 0.1})json", + R"json({"name": "super.auth", "sample_rate": 1.0})json", + R"json({"name": "super.auth??", "sample_rate": 1.0})json", + }; + + sampler.configure("[" + join(json_rules, ", ") + "]", logger, dummy_clock); + REQUIRE(logger.records.size() == 0); + REQUIRE(sampler.rules().size() == json_rules.size()); + + SECTION("span can match multiple rules, but the first matching rule is chosen") { + SpanData span; + span.service = "mysql"; + span.name = "exec.query"; + // `span` matches both `json_rules[0]` and `json_rules[1]`, but the earlier + // rule will be chosen (`json_rules[0]`). + + // First check that two rules could match. + const unsigned match_count = + std::count_if(sampler.rules().begin(), sampler.rules().end(), + [&](const SpanSampler::Rule& rule) { return rule.match(span); }); + REQUIRE(match_count == 2); + + // Then check that the first one is chosen. + SpanSampler::Rule* rule = sampler.match(span); + REQUIRE(rule == &sampler.rules()[0]); + } + + SECTION("no match") { + SpanData span; + span.service = "table"; + span.name = "check.please"; + REQUIRE(sampler.match(span) == nullptr); + } + + SECTION("match by service name") { + SpanData span; + span.service = "mysql123"; + span.name = "cache.lookup"; + // matches `json_rules[1]` + REQUIRE(sampler.match(span) == &sampler.rules()[1]); + } + + SECTION("match by operation name") { + SpanData span; + span.service = "langley"; + span.name = "super.auth"; + // matches `json_rules[2]` + REQUIRE(sampler.match(span) == &sampler.rules()[2]); + } + + SECTION("match by service name and operation name") { + SpanData span; + span.service = "mysql"; + span.name = "exec.query"; + // matches `json_rules[0]` (as before) + SpanSampler::Rule* rule = sampler.match(span); + REQUIRE(rule == &sampler.rules()[0]); + } + + SECTION("match involving question marks") { + SpanData span; + span.service = "roswell"; + span.name = "super.auth51"; + // matches `json_rules[3]` (not `json_rules[2]`) + REQUIRE(sampler.match(span) == &sampler.rules()[3]); + } +} + +TEST_CASE("SpanSampler sampling") { + // Starting calendar time 2022-07-01 00:00:00 local time + std::tm start{}; + start.tm_year = 122; + start.tm_mon = 7; + start.tm_mday = 1; + TimePoint now{std::chrono::system_clock::from_time_t(std::mktime(&start)), + std::chrono::steady_clock::time_point{}}; + // Note: Use `advanceTime(now, ...)` to advance the clock. + const auto clock = [&now]() { return now; }; + std::uint64_t next_id = 1; + const auto make_id = [&next_id]() { return next_id++; }; + const auto trace_sampler = std::make_shared(); + const auto writer = std::make_shared(trace_sampler); + const auto span_sampler = std::make_shared(); + const auto logger = std::make_shared(); + const auto span_buffer = std::make_shared(logger, writer, trace_sampler, + span_sampler, SpanBufferOptions{}); + TracerOptions tracer_options; + tracer_options.service = "foosvc"; + const auto tracer = + std::make_shared(tracer_options, span_buffer, clock, make_id, logger); + const auto has_span_sampling_tag = [](const auto& span_ptr) { + const auto& numeric_tags = span_ptr->metrics; + return numeric_tags.count("_dd.span_sampling.mechanism") || + numeric_tags.count("_dd.span_sampling.rule_rate") || + numeric_tags.count("_dd.span_sampling.max_per_second"); + }; + + SECTION("no span_sampling tags when there are no span sampling rules") { + // Make sure that the trace sampler is dropping the trace, otherwise we + // wouldn't expect the span sampling rules to matter. + trace_sampler->rule_rate = 0; + trace_sampler->sampling_mechanism = SamplingMechanism::Manual; + trace_sampler->sampling_priority = + std::make_unique(SamplingPriority::UserDrop); + + // We expect an empty array of rules to mean that span sampling won't + // happen. + span_sampler->configure("[]", *logger, clock); + { + const auto root = tracer->StartSpan("root"); + REQUIRE(root); + advanceTime(now, std::chrono::milliseconds(8)); + const auto child1 = tracer->StartSpan("child1", {ot::ChildOf(&root->context())}); + REQUIRE(child1); + advanceTime(now, std::chrono::milliseconds(5)); + const auto child2 = tracer->StartSpan("child2", {ot::ChildOf(&root->context())}); + REQUIRE(child2); + advanceTime(now, std::chrono::milliseconds(31)); + const auto grandchild = tracer->StartSpan("grandchild", {ot::ChildOf(&child1->context())}); + REQUIRE(grandchild); + advanceTime(now, std::chrono::milliseconds(2)); + } + + REQUIRE(writer->traces.size() == 1); + const auto& trace = writer->traces.front(); + REQUIRE(trace.size() == 4); + + REQUIRE(std::none_of(trace.begin(), trace.end(), has_span_sampling_tag)); + } + + SECTION("no span_sampling tags when the trace is kept") { + // When the trace is kept, span sampling rules aren't consulted (even if + // they would match and keep spans). + trace_sampler->rule_rate = 1; + trace_sampler->sampling_mechanism = SamplingMechanism::Manual; + trace_sampler->sampling_priority = + std::make_unique(SamplingPriority::UserKeep); + + const auto rules_json = R"json([ + {"service": "foosvc", "name": "grandchild"}, + {"name": "child*"}, + {"service": "foosvc", "max_per_second": 1000} + ])json"; + span_sampler->configure(rules_json, *logger, clock); + { + const auto root = tracer->StartSpan("root"); + REQUIRE(root); + advanceTime(now, std::chrono::milliseconds(8)); + const auto child1 = tracer->StartSpan("child1", {ot::ChildOf(&root->context())}); + REQUIRE(child1); + advanceTime(now, std::chrono::milliseconds(5)); + const auto child2 = tracer->StartSpan("child2", {ot::ChildOf(&root->context())}); + REQUIRE(child2); + advanceTime(now, std::chrono::milliseconds(31)); + const auto grandchild = tracer->StartSpan("grandchild", {ot::ChildOf(&child1->context())}); + REQUIRE(grandchild); + advanceTime(now, std::chrono::milliseconds(2)); + } + + REQUIRE(writer->traces.size() == 1); + const auto& trace = writer->traces.front(); + REQUIRE(trace.size() == 4); + + REQUIRE(std::none_of(trace.begin(), trace.end(), has_span_sampling_tag)); + } + + SECTION("expected span_sampling tags when the trace is dropped") { + // Make sure that the trace sampler is dropping the trace, otherwise we + // wouldn't expect the span sampling rules to matter. + trace_sampler->rule_rate = 0; + trace_sampler->sampling_mechanism = SamplingMechanism::Manual; + trace_sampler->sampling_priority = + std::make_unique(SamplingPriority::UserDrop); + + const auto rules_json = R"json([ + {"service": "foosvc", "name": "grandchild", "max_per_second": 999}, + {"name": "child*"}, + {"service": "foosvc", "max_per_second": 1000} + ])json"; + span_sampler->configure(rules_json, *logger, clock); + { + const auto root = tracer->StartSpan("root"); + REQUIRE(root); + advanceTime(now, std::chrono::milliseconds(8)); + const auto child1 = tracer->StartSpan("child1", {ot::ChildOf(&root->context())}); + REQUIRE(child1); + advanceTime(now, std::chrono::milliseconds(5)); + const auto child2 = tracer->StartSpan("child2", {ot::ChildOf(&root->context())}); + REQUIRE(child2); + advanceTime(now, std::chrono::milliseconds(31)); + const auto grandchild = tracer->StartSpan("grandchild", {ot::ChildOf(&child1->context())}); + REQUIRE(grandchild); + advanceTime(now, std::chrono::milliseconds(2)); + } + + REQUIRE(writer->traces.size() == 1); + const auto& trace = writer->traces.front(); + REQUIRE(trace.size() == 4); + + for (const auto& span_ptr : trace) { + const auto& span = *span_ptr; + const auto& numeric_tags = span.metrics; + if (span.name == "root") { + // `root` matches the rule: {"service": "foosvc", "max_per_second": 1000} + REQUIRE(numeric_tags.count("_dd.span_sampling.mechanism") == 1); + REQUIRE(numeric_tags.at("_dd.span_sampling.mechanism") == + int(SamplingMechanism::SpanRule)); + REQUIRE(numeric_tags.count("_dd.span_sampling.rule_rate") == 1); + REQUIRE(numeric_tags.at("_dd.span_sampling.rule_rate") == 1.0); + REQUIRE(numeric_tags.count("_dd.span_sampling.max_per_second") == 1); + REQUIRE(numeric_tags.at("_dd.span_sampling.max_per_second") == 1000); + } else if (span.name == "child1" || span.name == "child2") { + // `child1` and `child2` match the rule: {"name": "child*"} + REQUIRE(numeric_tags.count("_dd.span_sampling.mechanism") == 1); + REQUIRE(numeric_tags.at("_dd.span_sampling.mechanism") == + int(SamplingMechanism::SpanRule)); + REQUIRE(numeric_tags.count("_dd.span_sampling.rule_rate") == 1); + REQUIRE(numeric_tags.at("_dd.span_sampling.rule_rate") == 1.0); + REQUIRE(numeric_tags.count("_dd.span_sampling.max_per_second") == 0); + } else { + REQUIRE(span.name == "grandchild"); + // `grandchild` matches the rule: {"service": "foosvc", "name": "grandchild"} + REQUIRE(numeric_tags.count("_dd.span_sampling.mechanism") == 1); + REQUIRE(numeric_tags.at("_dd.span_sampling.mechanism") == + int(SamplingMechanism::SpanRule)); + REQUIRE(numeric_tags.count("_dd.span_sampling.rule_rate") == 1); + REQUIRE(numeric_tags.at("_dd.span_sampling.rule_rate") == 1.0); + REQUIRE(numeric_tags.count("_dd.span_sampling.max_per_second") == 1); + REQUIRE(numeric_tags.at("_dd.span_sampling.max_per_second") == 999); + } + } + } + + SECTION("probabilistic sampling for span rules") { + // Make sure that the trace sampler is dropping the trace, otherwise we + // wouldn't expect the span sampling rules to matter. + trace_sampler->rule_rate = 0; + trace_sampler->sampling_mechanism = SamplingMechanism::Manual; + trace_sampler->sampling_priority = + std::make_unique(SamplingPriority::UserDrop); + + const auto rules_json = R"json([ + {"name": "mysql.*", "sample_rate": 0.5} + ])json"; + span_sampler->configure(rules_json, *logger, clock); + const int child_count = 10000; + { + const auto root = tracer->StartSpan("root"); + REQUIRE(root); + advanceTime(now, std::chrono::milliseconds(8)); + // Generate a lot of spans that match the rule, so that our 50% + // probability can be measured. + for (int i = 0; i < child_count; ++i) { + const auto child = tracer->StartSpan("mysql.query", {ot::ChildOf(&root->context())}); + REQUIRE(child); + advanceTime(now, std::chrono::milliseconds(100)); + } + } + + REQUIRE(writer->traces.size() == 1); + const auto& trace = writer->traces.front(); + REQUIRE(trace.size() == child_count + 1); + + // `root` did not match any rule. + const auto root_iter = std::find_if( + trace.begin(), trace.end(), [](const auto& span_ptr) { return span_ptr->name == "root"; }); + REQUIRE(root_iter != trace.end()); + REQUIRE(!has_span_sampling_tag(*root_iter)); + + const int kept_children_count = + std::count_if(trace.begin(), trace.end(), has_span_sampling_tag); + // 50% of `child_count` would be 5000. Let's say within 5% of that -> 5000 +/- 10. + REQUIRE(kept_children_count >= 5000 - 10); + REQUIRE(kept_children_count <= 5000 + 10); + } + + SECTION("rate limiting for span rules") { + // Make sure that the trace sampler is dropping the trace, otherwise we + // wouldn't expect the span sampling rules to matter. + trace_sampler->rule_rate = 0; + trace_sampler->sampling_mechanism = SamplingMechanism::Manual; + trace_sampler->sampling_priority = + std::make_unique(SamplingPriority::UserDrop); + + const auto rules_json = R"json([ + {"name": "mysql.*", "max_per_second": 10} + ])json"; + span_sampler->configure(rules_json, *logger, clock); + + // Each trace's 20 children will hit the limiter after 10. + // Then a second goes by, so the limiter recharges. + // We'd expect 10 kept per trace, or 1000 total. + const int num_traces = 100; + const int children_per_trace = 20; + const int milliseconds_between_traces = 1000; + { + for (int i = 0; i < num_traces; ++i) { + advanceTime(now, std::chrono::milliseconds(milliseconds_between_traces)); + const auto root = tracer->StartSpan("root"); + REQUIRE(root); + for (int j = 0; j < children_per_trace; ++j) { + const auto child = tracer->StartSpan("mysql.query", {ot::ChildOf(&root->context())}); + REQUIRE(child); + } + } + } + + REQUIRE(writer->traces.size() == num_traces); + + int kept_spans_count = 0; + for (const auto& trace : writer->traces) { + kept_spans_count += std::count_if(trace.begin(), trace.end(), has_span_sampling_tag); + } + + // 10 is the configured `max_per_second`. + REQUIRE(kept_spans_count == num_traces * 10); + } +} diff --git a/test/span_buffer_test.cpp b/test/span_buffer_test.cpp index 61f5bf26..18a63138 100644 --- a/test/span_buffer_test.cpp +++ b/test/span_buffer_test.cpp @@ -10,7 +10,8 @@ TEST_CASE("span buffer") { auto logger = std::make_shared(); auto sampler = std::make_shared(); auto writer = std::make_shared(sampler); - auto buffer = std::make_shared(logger, writer, sampler, SpanBufferOptions{}); + auto buffer = + std::make_shared(logger, writer, sampler, nullptr, SpanBufferOptions{}); auto context_from_span = [](const TestSpanData& span) -> SpanContext { auto logger = std::make_shared(); diff --git a/test/tracer_factory_test.cpp b/test/tracer_factory_test.cpp index 611e7c2a..9ebddc8a 100644 --- a/test/tracer_factory_test.cpp +++ b/test/tracer_factory_test.cpp @@ -3,12 +3,80 @@ #include #include +#include +#include +#include #include "../src/tracer.h" #include "mocks.h" using namespace datadog::opentracing; +namespace { + +// `EnvGuard` sets an environment variable to a specified value for the scope +// of the `EnvGuard`, restoring the environment variable's previous value +// afterward. +class EnvGuard { + std::string name_; + bool had_value_; + std::string old_value_; + + public: + explicit EnvGuard(std::string name, const char *value) : name_(name) { + if (const char *const old_value = std::getenv(name.c_str())) { + had_value_ = true; + old_value_ = old_value; + } else { + had_value_ = false; + } + + const bool overwrite = true; + ::setenv(name.c_str(), value, overwrite); + } + + ~EnvGuard() { + if (had_value_) { + const bool overwrite = true; + ::setenv(name_.c_str(), old_value_.c_str(), overwrite); + } else { + ::unsetenv(name_.c_str()); + } + } +}; + +// `TmpFile` creates a temporary file having the specified content, and deletes +// the file at the end of the `TmpFile` scope. A path to the temporary file is +// accesible via the `name` member function. +class TmpFile { + std::string name_; + + public: + explicit TmpFile(ot::string_view content) { + char name_buffer[L_tmpnam]; + if (std::tmpnam(name_buffer) == nullptr) { + throw std::runtime_error("Unable to create a temporary file name."); + } + name_ = name_buffer; + + std::ofstream file(name_buffer); + if (!file) { + throw std::runtime_error("unable to open temporary file for writing: " + name_); + } + + file.write(content.data(), content.size()); + if (!file) { + throw std::runtime_error("unable to write to temporary file named: " + name_); + } + } + + ~TmpFile() { std::remove(name_.c_str()); } + + const std::string &name() const { return name_; } +}; + +} // namespace + TEST_CASE("tracer factory") { TracerFactory factory; @@ -529,5 +597,88 @@ TEST_CASE("tracer factory") { REQUIRE(!tracer->opts.analytics_enabled); REQUIRE(std::isnan(tracer->opts.analytics_rate)); } + SECTION("DD_SPAN_SAMPLING_RULES") { + const auto value = "[{}]"; + EnvGuard guard("DD_SPAN_SAMPLING_RULES", value); + + SECTION("overrides default") { + const auto input = R"json({ + "service": "my-service" + })json"; + std::string error; + const auto result = factory.MakeTracer(input, error); + + REQUIRE(error == ""); + REQUIRE(result); + REQUIRE(*result); + auto &tracer = static_cast(**result); + REQUIRE(tracer.opts.span_sampling_rules == value); + } + + SECTION("overrides config") { + const auto input = R"json({ + "service": "my-service", + "span_sampling_rules": [{}, {}] + })json"; + std::string error; + const auto result = factory.MakeTracer(input, error); + + REQUIRE(error == ""); + REQUIRE(result); + REQUIRE(*result); + auto &tracer = static_cast(**result); + REQUIRE(tracer.opts.span_sampling_rules == value); + } + } + SECTION("DD_SPAN_SAMPLING_RULES_FILE") { + const std::string value = "[{}]"; + TmpFile file(value); + EnvGuard guard("DD_SPAN_SAMPLING_RULES_FILE", file.name().c_str()); + + SECTION("overrides default") { + const auto input = R"json({ + "service": "my-service" + })json"; + std::string error; + const auto result = factory.MakeTracer(input, error); + + REQUIRE(error == ""); + REQUIRE(result); + REQUIRE(*result); + auto &tracer = static_cast(**result); + REQUIRE(tracer.opts.span_sampling_rules == value); + } + + SECTION("overrides config") { + const auto input = R"json({ + "service": "my-service", + "span_sampling_rules": [{}, {}] + })json"; + std::string error; + const auto result = factory.MakeTracer(input, error); + + REQUIRE(error == ""); + REQUIRE(result); + REQUIRE(*result); + auto &tracer = static_cast(**result); + REQUIRE(tracer.opts.span_sampling_rules == value); + } + + SECTION("does not override DD_SPAN_SAMPLING_RULES") { + const auto override_value = "[{}, {}, {}]"; + EnvGuard guard("DD_SPAN_SAMPLING_RULES", override_value); + const auto input = R"json({ + "service": "my-service" + })json"; + std::string error; + const auto result = factory.MakeTracer(input, error); + + REQUIRE(error == ""); + REQUIRE(result); + REQUIRE(*result); + auto &tracer = static_cast(**result); + REQUIRE(tracer.opts.span_sampling_rules == override_value); + } + } } }