[SPM] Differentiate null from no error data (jaegertracing#4985)
## Which problem is this PR solving?
- Resolves jaegertracing#4229

## Description of the changes
- Ensures SPM displays a 0% error rate if there are no error metrics
_and_ call rates exist.
- If call rates don't exist, the error rate will also be null.
- This ensures SPM can differentiate "no data" from "no errors" (see the sketch below).
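
The resulting behavior is easiest to see as a short sketch. This is a hypothetical, simplified Go outline of the decision logic, not the merged code; the helper names and the `MetricFamily` type are placeholders for the actual Prometheus queries in the `reader.go` diff further down:

```go
// Sketch only: queryErrorRates, queryCallRates, zeroed, and MetricFamily are
// illustrative stand-ins for the real implementation in reader.go below.
func getErrorRates() (*MetricFamily, error) {
	errorMetrics, err := queryErrorRates() // sum(rate(errors)) / sum(rate(calls))
	if err != nil {
		return nil, err
	}
	if len(errorMetrics.Metrics) > 0 {
		return errorMetrics, nil // genuine error data exists
	}

	callMetrics, err := queryCallRates()
	if err != nil {
		return nil, err
	}
	if len(callMetrics.Metrics) == 0 {
		return errorMetrics, nil // no traffic at all, so the UI shows "No data"
	}

	// Traffic exists but produced no error series: copy the call-rate series
	// and zero each data point so the UI renders an explicit 0% error rate.
	return zeroed(callMetrics), nil
}
```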

## How was this change tested?
- Added unit tests to cover the happy and error cases.
- Tested locally to confirm that "No data" is shown in the Error graph when
there is no data, and that a 0% rate is displayed once call rates become
available.

<img width="1710" alt="Screenshot 2023-12-03 at 8 01 36 pm"
src="https://github.com/jaegertracing/jaeger/assets/26584478/e38bdefc-2e2e-4a9c-a873-2ad1857f2098">

<img width="1696" alt="Screenshot 2023-12-03 at 8 00 45 pm"
src="https://github.com/jaegertracing/jaeger/assets/26584478/3e10d5fb-03e4-4ff3-b260-0dd8045eafbe">

## Checklist
- [x] I have read
https://github.com/jaegertracing/jaeger/blob/master/CONTRIBUTING_GUIDELINES.md
- [x] I have signed all commits
- [x] I have added unit tests for the new functionality
- [x] I have run lint and test steps successfully
  - for `jaeger`: `make lint test`
  - for `jaeger-ui`: `yarn lint` and `yarn test`

---------

Signed-off-by: Albert Teoh <albert@packsmith.io>
Co-authored-by: Albert Teoh <albert@packsmith.io>
albertteoh and Albert Teoh authored Dec 4, 2023
1 parent e170dba commit d45aba6
Showing 5 changed files with 245 additions and 5 deletions.
2 changes: 1 addition & 1 deletion docker-compose/monitor/Makefile
@@ -2,7 +2,7 @@
build: export DOCKER_TAG = dev
build: clean-jaeger
cd ../../ && \
-	make build-all-in-one && \
+	make build-all-in-one-linux && \
make docker-images-jaeger-backend

# starts up the system required for SPM using the latest otel image and a development jaeger image.
4 changes: 2 additions & 2 deletions docker-compose/monitor/README.md
@@ -62,13 +62,13 @@ make build
## Bring up the dev environment

```bash
-make run-dev
+make dev
```

## Backwards compatibility testing with spanmetrics processor

```bash
-make run-dev-processor
+make dev-processor
```

For each "run" make target, you should expect to see the following in the Monitor tab after a few minutes:
33 changes: 32 additions & 1 deletion plugin/metrics/prometheus/metricsstore/reader.go
@@ -211,7 +211,38 @@ func (m MetricsReader) GetErrorRates(ctx context.Context, requestParams *metrics
)
},
}
-	return m.executeQuery(ctx, metricsParams)
+	errorMetrics, err := m.executeQuery(ctx, metricsParams)
if err != nil {
return nil, fmt.Errorf("failed getting error metrics: %w", err)
}
// Non-zero error rates are available.
if len(errorMetrics.Metrics) > 0 {
return errorMetrics, nil
}

// Check for the presence of call rate metrics to differentiate the absence of error rate from
// the absence of call rate metrics altogether.
callMetrics, err := m.GetCallRates(ctx, &metricsstore.CallRateQueryParameters{BaseQueryParameters: requestParams.BaseQueryParameters})
if err != nil {
return nil, fmt.Errorf("failed getting call metrics: %w", err)
}
// No call rate metrics are available, and by association, means no error rate metrics are available.
if len(callMetrics.Metrics) == 0 {
return errorMetrics, nil
}

// Non-zero call rate metrics are available, which implies that there are just no errors, so we report a zero error rate.
zeroErrorMetrics := make([]*metrics.Metric, 0, len(callMetrics.Metrics))
for _, cm := range callMetrics.Metrics {
zm := *cm
for i := 0; i < len(zm.MetricPoints); i++ {
zm.MetricPoints[i].Value = &metrics.MetricPoint_GaugeValue{GaugeValue: &metrics.GaugeValue{Value: &metrics.GaugeValue_DoubleValue{DoubleValue: 0.0}}}
}
zeroErrorMetrics = append(zeroErrorMetrics, &zm)
}

errorMetrics.Metrics = zeroErrorMetrics
return errorMetrics, nil
}

// GetMinStepDuration gets the minimum step duration (the smallest possible duration between two data points in a time series) supported.
204 changes: 203 additions & 1 deletion plugin/metrics/prometheus/metricsstore/reader_test.go
@@ -487,6 +487,208 @@ func TestGetErrorRates(t *testing.T) {
}
}

func TestGetErrorRatesZero(t *testing.T) {
params := metricsstore.ErrorRateQueryParameters{
BaseQueryParameters: buildTestBaseQueryParametersFrom(metricsTestCase{
serviceNames: []string{"emailservice"},
spanKinds: []string{"SPAN_KIND_SERVER"},
}),
}
tracer, exp, closer := tracerProvider(t)
defer closer()

const (
queryErrorRate = `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` +
`sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
queryCallRate = `sum(rate(calls{service_name =~ "emailservice", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
)
wantPromQLQueries := []string{queryErrorRate, queryCallRate}
responses := []string{"testdata/empty_response.json", "testdata/service_datapoint_response.json"}
var callCount int

mockPrometheus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
body, _ := io.ReadAll(r.Body)
defer r.Body.Close()

u, err := url.Parse("http://" + r.Host + r.RequestURI + "?" + string(body))
require.NoError(t, err)

q := u.Query()
promQuery := q.Get("query")
assert.Equal(t, wantPromQLQueries[callCount], promQuery)
sendResponse(t, w, responses[callCount])
callCount++
}))

logger := zap.NewNop()
address := mockPrometheus.Listener.Addr().String()

cfg := defaultConfig
cfg.ServerURL = "http://" + address
cfg.ConnectTimeout = defaultTimeout

reader, err := NewMetricsReader(cfg, logger, tracer)
require.NoError(t, err)

defer mockPrometheus.Close()

m, err := reader.GetErrorRates(context.Background(), &params)
require.NoError(t, err)

require.Len(t, m.Metrics, 1)
mps := m.Metrics[0].MetricPoints

require.Len(t, mps, 1)

// Assert that we essentially zeroed the call rate data point.
// That is, the timestamp is the same as the call rate's data point, but the value is 0.
actualVal := mps[0].Value.(*metrics.MetricPoint_GaugeValue).GaugeValue.Value.(*metrics.GaugeValue_DoubleValue).DoubleValue
assert.Zero(t, actualVal)
assert.Equal(t, int64(1620351786), mps[0].Timestamp.GetSeconds())
assert.Len(t, exp.GetSpans(), 2, "expected an error rate query and a call rate query to be made")
}

func TestGetErrorRatesNull(t *testing.T) {
params := metricsstore.ErrorRateQueryParameters{
BaseQueryParameters: buildTestBaseQueryParametersFrom(metricsTestCase{
serviceNames: []string{"emailservice"},
spanKinds: []string{"SPAN_KIND_SERVER"},
}),
}
tracer, exp, closer := tracerProvider(t)
defer closer()

const (
queryErrorRate = `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` +
`sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
queryCallRate = `sum(rate(calls{service_name =~ "emailservice", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
)
wantPromQLQueries := []string{queryErrorRate, queryCallRate}
responses := []string{"testdata/empty_response.json", "testdata/empty_response.json"}
var callCount int

mockPrometheus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
body, _ := io.ReadAll(r.Body)
defer r.Body.Close()

u, err := url.Parse("http://" + r.Host + r.RequestURI + "?" + string(body))
require.NoError(t, err)

q := u.Query()
promQuery := q.Get("query")
assert.Equal(t, wantPromQLQueries[callCount], promQuery)
sendResponse(t, w, responses[callCount])
callCount++
}))

logger := zap.NewNop()
address := mockPrometheus.Listener.Addr().String()

cfg := defaultConfig
cfg.ServerURL = "http://" + address
cfg.ConnectTimeout = defaultTimeout

reader, err := NewMetricsReader(cfg, logger, tracer)
require.NoError(t, err)

defer mockPrometheus.Close()

m, err := reader.GetErrorRates(context.Background(), &params)
require.NoError(t, err)
assert.Empty(t, m.Metrics, "expect no error data available")
assert.Len(t, exp.GetSpans(), 2, "expected an error rate query and a call rate query to be made")
}

func TestGetErrorRatesErrors(t *testing.T) {
for _, tc := range []struct {
name string
failErrorRateQuery bool
failCallRateQuery bool
wantErr string
}{
{
name: "error rate query failure",
failErrorRateQuery: true,
wantErr: "failed getting error metrics: failed executing metrics query: server_error: server error: 500",
},
{
name: "call rate query failure",
failCallRateQuery: true,
wantErr: "failed getting call metrics: failed executing metrics query: server_error: server error: 500",
},
} {
t.Run(tc.name, func(t *testing.T) {
params := metricsstore.ErrorRateQueryParameters{
BaseQueryParameters: buildTestBaseQueryParametersFrom(metricsTestCase{
serviceNames: []string{"emailservice"},
spanKinds: []string{"SPAN_KIND_SERVER"},
}),
}
tracer, _, closer := tracerProvider(t)
defer closer()

const (
queryErrorRate = `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` +
`sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
queryCallRate = `sum(rate(calls{service_name =~ "emailservice", ` +
`span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
)
wantPromQLQueries := []string{queryErrorRate, queryCallRate}
responses := []string{"testdata/empty_response.json", "testdata/service_datapoint_response.json"}
var callCount int

mockPrometheus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
body, _ := io.ReadAll(r.Body)
defer r.Body.Close()

u, err := url.Parse("http://" + r.Host + r.RequestURI + "?" + string(body))
require.NoError(t, err)

q := u.Query()
promQuery := q.Get("query")
assert.Equal(t, wantPromQLQueries[callCount], promQuery)

switch promQuery {
case queryErrorRate:
if tc.failErrorRateQuery {
w.WriteHeader(http.StatusInternalServerError)
} else {
sendResponse(t, w, responses[callCount])
}
case queryCallRate:
if tc.failCallRateQuery {
w.WriteHeader(http.StatusInternalServerError)
} else {
sendResponse(t, w, responses[callCount])
}
}
callCount++
}))

logger := zap.NewNop()
address := mockPrometheus.Listener.Addr().String()

cfg := defaultConfig
cfg.ServerURL = "http://" + address
cfg.ConnectTimeout = defaultTimeout

reader, err := NewMetricsReader(cfg, logger, tracer)
require.NoError(t, err)

defer mockPrometheus.Close()

_, err = reader.GetErrorRates(context.Background(), &params)
require.Error(t, err)
assert.EqualError(t, err, tc.wantErr)
})
}
}

func TestInvalidLatencyUnit(t *testing.T) {
defer func() {
if r := recover(); r == nil {
@@ -515,7 +717,7 @@ func TestWarningResponse(t *testing.T) {
m, err := reader.GetErrorRates(context.Background(), &params)
require.NoError(t, err)
assert.NotNil(t, m)
-	assert.Len(t, exp.GetSpans(), 1, "HTTP request was traced and span reported")
+	assert.Len(t, exp.GetSpans(), 2, "expected an error rate query and a call rate query to be made")
}

type fakePromServer struct {
7 changes: 7 additions & 0 deletions plugin/metrics/prometheus/metricsstore/testdata/empty_response.json
@@ -0,0 +1,7 @@
{
"status": "success",
"data": {
"resultType": "matrix",
"result": []
}
}
