From e1312ff3194a373487df81334dd166f405ea12d6 Mon Sep 17 00:00:00 2001 From: Samantha Date: Mon, 20 Nov 2023 14:12:21 -0500 Subject: [PATCH] RA: Fix legacy override utilization metrics (#7124) - Emit override utilization only when resource counts are under threshold. - Override utilization accounts for anticipated issuance. - Correct the limit metric label for `CertificatesPerName` and `CertificatesPerFQDNSet/Fast`. Part of #5545 --- ra/ra.go | 77 +++++++++++++++++++++++++--------------- ra/ra_test.go | 25 +++++++------ ratelimit/rate-limits.go | 6 ++-- 3 files changed, 65 insertions(+), 43 deletions(-) diff --git a/ra/ra.go b/ra/ra.go index ab19f842de5..611a32fd121 100644 --- a/ra/ra.go +++ b/ra/ra.go @@ -394,16 +394,15 @@ func (ra *RegistrationAuthorityImpl) checkRegistrationIPLimit(ctx context.Contex } threshold, overrideKey := limit.GetThreshold(ip.String(), noRegistrationID) + if count.Count >= threshold { + return berrors.RegistrationsPerIPError(0, "too many registrations for this IP") + } if overrideKey != "" { // We do not support overrides for the NewRegistrationsPerIPRange limit. - utilization := float64(count.Count) / float64(threshold) + utilization := float64(count.Count+1) / float64(threshold) ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.RegistrationsPerIP, overrideKey).Set(utilization) } - if count.Count >= threshold { - return berrors.RegistrationsPerIPError(0, "too many registrations for this IP") - } - return nil } @@ -600,14 +599,14 @@ func (ra *RegistrationAuthorityImpl) checkPendingAuthorizationLimit(ctx context. if err != nil { return err } - if overrideKey != "" { - utilization := float64(countPB.Count) / float64(threshold) - ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.PendingAuthorizationsPerAccount, overrideKey).Set(utilization) - } if countPB.Count >= threshold { ra.log.Infof("Rate limit exceeded, PendingAuthorizationsByRegID, regID: %d", regID) return berrors.RateLimitError(0, "too many currently pending authorizations: %d", countPB.Count) } + if overrideKey != "" { + utilization := float64(countPB.Count) / float64(threshold) + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.PendingAuthorizationsPerAccount, overrideKey).Set(utilization) + } return nil } @@ -653,14 +652,14 @@ func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimit(ctx context. // here. noKey := "" threshold, overrideKey := limit.GetThreshold(noKey, regID) - if overrideKey != "" { - utilization := float64(count.Count) / float64(threshold) - ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.InvalidAuthorizationsPerAccount, overrideKey).Set(utilization) - } if count.Count >= threshold { ra.log.Infof("Rate limit exceeded, InvalidAuthorizationsByRegID, regID: %d", regID) return berrors.FailedValidationError(0, "too many failed authorizations recently") } + if overrideKey != "" { + utilization := float64(count.Count) / float64(threshold) + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.InvalidAuthorizationsPerAccount, overrideKey).Set(utilization) + } return nil } @@ -684,14 +683,13 @@ func (ra *RegistrationAuthorityImpl) checkNewOrdersPerAccountLimit(ctx context.C // There is no meaningful override key to use for this rate limit noKey := "" threshold, overrideKey := limit.GetThreshold(noKey, acctID) - if overrideKey != "" { - utilization := float64(count.Count) / float64(threshold) - ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.NewOrdersPerAccount, overrideKey).Set(utilization) - } - if count.Count >= threshold { return berrors.RateLimitError(0, "too many new orders recently") } + if overrideKey != "" { + utilization := float64(count.Count+1) / float64(threshold) + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.NewOrdersPerAccount, overrideKey).Set(utilization) + } return nil } @@ -1423,18 +1421,34 @@ func (ra *RegistrationAuthorityImpl) enforceNameCounts(ctx context.Context, name } var badNames []string + var metricsData []struct { + overrideKey string + utilization float64 + } + // Find the names that have counts at or over the threshold. Range // over the names slice input to ensure the order of badNames will // return the badNames in the same order they were input. for _, name := range names { threshold, overrideKey := limit.GetThreshold(name, regID) - if overrideKey != "" { - utilization := float64(response.Counts[name]) / float64(threshold) - ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.CertificatesPerName, overrideKey).Set(utilization) - } if response.Counts[name] >= threshold { badNames = append(badNames, name) } + if overrideKey != "" { + // Name is under threshold due to an override. + utilization := float64(response.Counts[name]+1) / float64(threshold) + metricsData = append(metricsData, struct { + overrideKey string + utilization float64 + }{overrideKey, utilization}) + } + } + + if len(badNames) == 0 { + // All names were under the threshold, emit override utilization metrics. + for _, data := range metricsData { + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.CertificatesPerName, data.overrideKey).Set(data.utilization) + } } return badNames, response.Earliest.AsTime(), nil } @@ -1498,18 +1512,19 @@ func (ra *RegistrationAuthorityImpl) checkCertificatesPerFQDNSetLimit(ctx contex return fmt.Errorf("checking duplicate certificate limit for %q: %s", names, err) } - if overrideKey != "" { - utilization := float64(len(prevIssuances.TimestampsNS)) / float64(threshold) - ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.CertificatesPerFQDNSet, overrideKey).Set(utilization) - } - - if int64(len(prevIssuances.TimestampsNS)) < threshold { + issuanceCount := int64(len(prevIssuances.TimestampsNS)) + if issuanceCount < threshold { // Issuance in window is below the threshold, no need to limit. + if overrideKey != "" { + utilization := float64(issuanceCount+1) / float64(threshold) + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.CertificatesPerFQDNSet, overrideKey).Set(utilization) + } return nil } else { // Evaluate the rate limit using a leaky bucket algorithm. The bucket // has a capacity of threshold and is refilled at a rate of 1 token per - // limit.Window/threshold from the time of each issuance timestamp. + // limit.Window/threshold from the time of each issuance timestamp. The + // timestamps start from the most recent issuance and go back in time. now := ra.clk.Now() nsPerToken := limit.Window.Nanoseconds() / threshold for i, timestamp := range prevIssuances.TimestampsNS { @@ -1518,6 +1533,10 @@ func (ra *RegistrationAuthorityImpl) checkCertificatesPerFQDNSetLimit(ctx contex // We know `i+1` tokens were generated since `tokenGeneratedSince`, // and only `i` certificates were issued, so there's room to allow // for an additional issuance. + if overrideKey != "" { + utilization := float64(issuanceCount) / float64(threshold) + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.CertificatesPerFQDNSet, overrideKey).Set(utilization) + } return nil } } diff --git a/ra/ra_test.go b/ra/ra_test.go index d95dd332250..1fdaab8bc89 100644 --- a/ra/ra_test.go +++ b/ra/ra_test.go @@ -719,12 +719,13 @@ func TestRegistrationsPerIPOverrideUsage(t *testing.T) { } // No error expected, the count of existing registrations for "4.5.6.7" - // should be 1 below the threshold. + // should be 1 below the override threshold. err := ra.checkRegistrationIPLimit(ctx, rlp, regIP, mockCounterAlwaysTwo) test.AssertNotError(t, err, "Unexpected error checking RegistrationsPerIPRange limit") - // Expecting ~66% of the override for "4.5.6.7" to be utilized. - test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.RegistrationsPerIP, "override_key": regIP.String()}, 0.6666666666666666) + // Accounting for the anticipated issuance, we expect "4.5.6.7" to be at + // 100% of their override threshold. + test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.RegistrationsPerIP, "override_key": regIP.String()}, 1) mockCounterAlwaysThree := func(context.Context, *sapb.CountRegistrationsByIPRequest, ...grpc.CallOption) (*sapb.Count, error) { return &sapb.Count{Count: 3}, nil @@ -1223,30 +1224,32 @@ func TestCheckCertificatesPerNameLimit(t *testing.T) { // Two base domains, one above threshold but with an override. mockSA.nameCounts.Counts["example.com"] = 0 mockSA.nameCounts.Counts["bigissuer.com"] = 50 + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.CertificatesPerName, "bigissuer.com").Set(.5) err = ra.checkCertificatesPerNameLimit(ctx, []string{"www.example.com", "subdomain.bigissuer.com"}, rlp, 99) test.AssertNotError(t, err, "incorrectly rate limited bigissuer") - // "bigissuer.com" has an override of 100 and they've issued 50. We do - // not count issuance that has yet to occur, so we expect to see 50% - // utilization. - test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.CertificatesPerName, "override_key": "bigissuer.com"}, .5) + // "bigissuer.com" has an override of 100 and they've issued 50. Accounting + // for the anticipated issuance, we expect to see 51% utilization. + test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.CertificatesPerName, "override_key": "bigissuer.com"}, .51) // Two base domains, one above its override mockSA.nameCounts.Counts["example.com"] = 10 mockSA.nameCounts.Counts["bigissuer.com"] = 100 + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.CertificatesPerName, "bigissuer.com").Set(1) err = ra.checkCertificatesPerNameLimit(ctx, []string{"www.example.com", "subdomain.bigissuer.com"}, rlp, 99) test.AssertError(t, err, "incorrectly failed to rate limit bigissuer") test.AssertErrorIs(t, err, berrors.RateLimit) - // "bigissuer.com" has an override of 100 and they've issued 100. So we - // expect to see 100% utilization. + // "bigissuer.com" has an override of 100 and they've issued 100. They're + // already at 100% utilization, so we expect to see 100% utilization. test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.CertificatesPerName, "override_key": "bigissuer.com"}, 1) // One base domain, above its override (which is below threshold) mockSA.nameCounts.Counts["smallissuer.co.uk"] = 1 + ra.rlOverrideUsageGauge.WithLabelValues(ratelimit.CertificatesPerName, "smallissuer.co.uk").Set(1) err = ra.checkCertificatesPerNameLimit(ctx, []string{"www.smallissuer.co.uk"}, rlp, 99) test.AssertError(t, err, "incorrectly failed to rate limit smallissuer") test.AssertErrorIs(t, err, berrors.RateLimit) - // "smallissuer.co.uk" has an override of 1 and they've issued 1. So we - // expect to see 100% utilization. + // "smallissuer.co.uk" has an override of 1 and they've issued 1. They're + // already at 100% utilization, so we expect to see 100% utilization. test.AssertMetricWithLabelsEquals(t, ra.rlOverrideUsageGauge, prometheus.Labels{"limit": ratelimit.CertificatesPerName, "override_key": "smallissuer.co.uk"}, 1) } diff --git a/ratelimit/rate-limits.go b/ratelimit/rate-limits.go index 3c6bd75d0d0..812b723b208 100644 --- a/ratelimit/rate-limits.go +++ b/ratelimit/rate-limits.go @@ -11,7 +11,7 @@ import ( const ( // CertificatesPerName is the name of the CertificatesPerName rate limit // when referenced in metric labels. - CertificatesPerName = "certificates_per_domain_per_account" + CertificatesPerName = "certificates_per_domain" // RegistrationsPerIP is the name of the RegistrationsPerIP rate limit when // referenced in metric labels. @@ -33,11 +33,11 @@ const ( // CertificatesPerFQDNSet is the name of the CertificatesPerFQDNSet rate // limit when referenced in metric labels. - CertificatesPerFQDNSet = "certificates_per_fqdn_set_per_account" + CertificatesPerFQDNSet = "certificates_per_fqdn_set" // CertificatesPerFQDNSetFast is the name of the CertificatesPerFQDNSetFast // rate limit when referenced in metric labels. - CertificatesPerFQDNSetFast = "certificates_per_fqdn_set_per_account_fast" + CertificatesPerFQDNSetFast = "certificates_per_fqdn_set_fast" // NewOrdersPerAccount is the name of the NewOrdersPerAccount rate limit // when referenced in metric labels.