diff --git a/cmd/fastly-exporter/main.go b/cmd/fastly-exporter/main.go index 23f0ada..f09ecc5 100644 --- a/cmd/fastly-exporter/main.go +++ b/cmd/fastly-exporter/main.go @@ -39,6 +39,7 @@ func main() { serviceBlocklist stringslice metricAllowlist stringslice metricBlocklist stringslice + certificateRefresh time.Duration datacenterRefresh time.Duration productRefresh time.Duration serviceRefresh time.Duration @@ -62,6 +63,7 @@ func main() { fs.Var(&serviceBlocklist, "service-blocklist", "if set, don't include services whose names match this regex (repeatable)") fs.Var(&metricAllowlist, "metric-allowlist", "if set, only export metrics whose names match this regex (repeatable)") fs.Var(&metricBlocklist, "metric-blocklist", "if set, don't export metrics whose names match this regex (repeatable)") + fs.DurationVar(&certificateRefresh, "certificate-refresh", 10*time.Minute, "how often to poll api.fastly.com for updated certificates metadata (10m–24h)") fs.DurationVar(&datacenterRefresh, "datacenter-refresh", 10*time.Minute, "how often to poll api.fastly.com for updated datacenter metadata (10m–1h)") fs.DurationVar(&productRefresh, "product-refresh", 10*time.Minute, "how often to poll api.fastly.com for updated product metadata (10m–24h)") fs.DurationVar(&serviceRefresh, "service-refresh", 1*time.Minute, "how often to poll api.fastly.com for updated service metadata (15s–10m)") @@ -124,6 +126,14 @@ func main() { }) { + if certificateRefresh < 10*time.Minute { + level.Warn(logger).Log("msg", "-certificate-refresh cannot be shorter than 10m; setting it to 10m") + certificateRefresh = 10 * time.Minute + } + if certificateRefresh > 24*time.Hour { + level.Warn(logger).Log("msg", "-certificate-refresh cannot be longer than 24h; setting it to 24h") + certificateRefresh = 24 * time.Hour + } if datacenterRefresh < 10*time.Minute { level.Warn(logger).Log("msg", "-datacenter-refresh cannot be shorter than 10m; setting it to 10m") datacenterRefresh = 10 * time.Minute @@ -269,6 
+279,11 @@ func main() { serviceCache = api.NewServiceCache(apiClient, token, serviceCacheOptions...) } + var certificateCache *api.CertificateCache + { + enabled := !metricNameFilter.Blocked(prometheus.BuildFQName(namespace, deprecatedSubsystem, "cert_expiry_timestamp_seconds")) + certificateCache = api.NewCertificateCache(apiClient, token, enabled) + } var datacenterCache *api.DatacenterCache { enabled := !metricNameFilter.Blocked(prometheus.BuildFQName(namespace, deprecatedSubsystem, "datacenter_info")) @@ -288,6 +303,14 @@ func main() { } return nil }) + if certificateCache.Enabled() { + g.Go(func() error { + if err := certificateCache.Refresh(context.Background()); err != nil { + level.Warn(logger).Log("during", "initial fetch of certificates", "err", err, "msg", "certificate labels unavailable, will retry") + } + return nil + }) + } if datacenterCache.Enabled() { g.Go(func() error { if err := datacenterCache.Refresh(context.Background()); err != nil { @@ -307,6 +330,15 @@ func main() { } var defaultGatherers prometheus.Gatherers + if certificateCache.Enabled() { + certs, err := certificateCache.Gatherer(namespace, deprecatedSubsystem) + if err != nil { + level.Error(apiLogger).Log("during", "create certificate gatherer", "err", err) + os.Exit(1) + } + defaultGatherers = append(defaultGatherers, certs) + } + if datacenterCache.Enabled() { dcs, err := datacenterCache.Gatherer(namespace, deprecatedSubsystem) if err != nil { @@ -351,6 +383,31 @@ func main() { } var g run.Group + // only setup the ticker if the certificateCache is enabled. + if certificateCache.Enabled() { + + // Every certificateRefresh, ask the api.CertificateCache to refresh + // metadata from the api.fastly.com/tls/certificates endpoint. 
+ var ( + ctx, cancel = context.WithCancel(context.Background()) + ticker = time.NewTicker(certificateRefresh) + ) + g.Add(func() error { + for { + select { + case <-ticker.C: + if err := certificateCache.Refresh(ctx); err != nil { + level.Warn(apiLogger).Log("during", "certificate refresh", "err", err, "msg", "the certificate info metrics may be stale") + } + case <-ctx.Done(): + return ctx.Err() + } + } + }, func(error) { + ticker.Stop() + cancel() + }) + } // only setup the ticker if the datacenterCache is enabled. if datacenterCache.Enabled() { diff --git a/pkg/api/certificate_cache.go b/pkg/api/certificate_cache.go new file mode 100644 index 0000000..198f725 --- /dev/null +++ b/pkg/api/certificate_cache.go @@ -0,0 +1,159 @@ +package api + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "sort" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +// maxCertificatesPageSize is the maximum number of results that can be requested +// from the api.fastly.com/tls/certificates endpoint. +const maxCertificatesPageSize = 1000 + +// Certificates is the decoded body of a response from the +// api.fastly.com/tls/certificates endpoint. +type Certificates struct { + Certificate []Certificate `json:"data"` +} + +// Certificate is a single entry of the "data" array in a certificates response. +type Certificate struct { + Attributes Attributes `json:"attributes"` + Id string `json:"id"` +} + +// Attributes is the subset of a certificate's "attributes" object that is +// decoded from the API response. +type Attributes struct { + CN string `json:"issued_to"` + Name string `json:"name"` + Issuer string `json:"issuer"` + Not_after string `json:"not_after"` + SN string `json:"serial_number"` +} + +// CertificateCache polls api.fastly.com/tls/certificates and maintains a local cache +// of the returned metadata. That information is exposed as Prometheus metrics. +type CertificateCache struct { + client HTTPClient + token string + enabled bool + + mtx sync.Mutex + certs Certificates +} + +// NewCertificateCache returns an empty cache of certificates metadata. Use the +// Refresh method to update the cache. 
+func NewCertificateCache(client HTTPClient, token string, enabled bool) *CertificateCache { + return &CertificateCache{ + client: client, + token: token, + enabled: enabled, + } +} + +// Refresh the cache with metadata retrieved from the Fastly API. +func (c *CertificateCache) Refresh(ctx context.Context) error { + if !c.enabled { + return nil + } + + // TODO: Implement additional requests for next pages if there are more + // TLS certificates than maxCertificatesPageSize + uri := fmt.Sprintf("https://api.fastly.com/tls/certificates?page%%5Bnumber%%5D=1&page%%5Bsize%%5D=%d&sort=created_at", maxCertificatesPageSize) + + req, err := http.NewRequestWithContext(ctx, "GET", uri, nil) + if err != nil { + return fmt.Errorf("error constructing API certificates request: %w", err) + } + + req.Header.Set("Fastly-Key", c.token) + req.Header.Set("Accept", "application/json") + resp, err := c.client.Do(req) + if err != nil { + return fmt.Errorf("error executing API certificates request: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return NewError(resp) + } + + var response Certificates + if err := json.NewDecoder(resp.Body).Decode(&response); err != nil { + return fmt.Errorf("error decoding API certificates response: %w", err) + } + + sort.Slice(response.Certificate, func(i, j int) bool { + return response.Certificate[i].Attributes.CN < response.Certificate[j].Attributes.CN + }) + + c.mtx.Lock() + defer c.mtx.Unlock() + c.certs = response + + return nil +} + +// Certificates returns a copy of the currently cached certificates. +func (c *CertificateCache) Certificates() Certificates { + c.mtx.Lock() + defer c.mtx.Unlock() + certs := c.certs + return certs +} + +// Gatherer returns a Prometheus gatherer which will yield current metadata +// about Fastly certificates as labels on a gauge metric. 
+func (c *CertificateCache) Gatherer(namespace, subsystem string) (prometheus.Gatherer, error) { + var ( + fqName = prometheus.BuildFQName(namespace, subsystem, "cert_expiry_timestamp_seconds") + help = "Metadata about Fastly certificates." + labels = []string{"cn", "name", "id", "issuer", "sn"} + constLabels = prometheus.Labels{} + desc = prometheus.NewDesc(fqName, help, labels, constLabels) + collector = &certificateCollector{desc: desc, cache: c} + ) + + registry := prometheus.NewRegistry() + if err := registry.Register(collector); err != nil { + return nil, fmt.Errorf("registering certificate collector: %w", err) + } + + return registry, nil +} + +// Enabled returns true if the CertificateCache is enabled. +func (c *CertificateCache) Enabled() bool { + return c.enabled +} + +type certificateCollector struct { + desc *prometheus.Desc + cache *CertificateCache +} + +func (c *certificateCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.desc +} + +func (c *certificateCollector) Collect(ch chan<- prometheus.Metric) { + for _, cert := range c.cache.Certificates().Certificate { + // A zero time would export a bogus expiry far in the past, so skip + // certificates whose not_after value cannot be parsed. + t, err := time.Parse("2006-01-02T15:04:05.000Z", cert.Attributes.Not_after) + if err != nil { + continue + } + value := float64(t.Unix()) + labelValues := []string{cert.Attributes.CN, cert.Attributes.Name, cert.Id, cert.Attributes.Issuer, cert.Attributes.SN} + ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, value, labelValues...) 
+ } +} diff --git a/pkg/api/certificate_cache_test.go b/pkg/api/certificate_cache_test.go new file mode 100644 index 0000000..6ead897 --- /dev/null +++ b/pkg/api/certificate_cache_test.go @@ -0,0 +1,178 @@ +package api_test + +import ( + "context" + "net/http" + "testing" + + "github.com/fastly/fastly-exporter/pkg/api" + "github.com/google/go-cmp/cmp" +) + +func TestCertificateCache(t *testing.T) { + t.Parallel() + + for _, testcase := range []struct { + name string + client api.HTTPClient + wantCerts api.Certificates + wantErr error + }{ + { + name: "success", + client: fixedResponseClient{code: http.StatusOK, response: certificatesResponseLarge}, + wantErr: nil, + wantCerts: api.Certificates{ + Certificate: []api.Certificate{ + api.Certificate{Id: "ZfkhTtm4LdaOprVcdsffx4", + Attributes: api.Attributes{ + CN: "first.example1.com", + Name: "first.example1.com", + Issuer: "First CA", + Not_after: "2023-06-25T01:09:23.000Z", + SN: "52135557897532112355784498781325912334", + }, + }, + api.Certificate{Id: "YkUe3r6S3zN4m6lVCd3sGc", + Attributes: api.Attributes{ + CN: "second.example2.com", + Name: "My Testing Cert", + Issuer: "Second CA", + Not_after: "2024-08-29T11:07:33.000Z", + SN: "11106091125671337225612345678987654321", + }, + }, + }, + }, + }, + { + name: "success_and_empty", + client: fixedResponseClient{code: http.StatusOK, response: certificatesResponseEmpty}, + wantErr: nil, + wantCerts: api.Certificates{ + Certificate: []api.Certificate{}, + }, + }, + { + name: "error", + client: fixedResponseClient{code: http.StatusUnauthorized}, + wantErr: &api.Error{Code: http.StatusUnauthorized}, + wantCerts: api.Certificates{}, + }, + } { + t.Run(testcase.name, func(t *testing.T) { + var ( + ctx = context.Background() + client = testcase.client + cache = api.NewCertificateCache(client, "irrelevant token", true) + ) + + if want, have := testcase.wantErr, cache.Refresh(ctx); !cmp.Equal(want, have) { + t.Fatal(cmp.Diff(want, have)) + } + + if want, have := testcase.wantCerts, 
cache.Certificates(); !cmp.Equal(want, have) { + t.Fatal(cmp.Diff(want, have)) + } + }) + } +} + +const certificatesResponseEmpty = ` +{ + "data": [], + "links": { + "self": "https://api.fastly.com/tls/certificates?page%5Bnumber%5D=1&page%5Bsize%5D=20&sort=created_at", + "first": "https://api.fastly.com/tls/certificates?page%5Bnumber%5D=1&page%5Bsize%5D=20&sort=created_at", + "prev": null, + "next": null, + "last": "https://api.fastly.com/tls/certificates?page%5Bnumber%5D=1&page%5Bsize%5D=27&sort=created_at" + }, + "meta": { + "per_page": 20, + "current_page": 1, + "record_count": 0, + "total_pages": 1 + } +} +` + +const certificatesResponseLarge = ` +{ + "meta": { + "total_pages": 1, + "record_count": 3, + "current_page": 1, + "per_page": 20 + }, + "links": { + "last": "https://api.fastly.com/tls/certificates?page%5Bnumber%5D=1&page%5Bsize%5D=20&sort=created_at", + "next": null, + "prev": null, + "first": "https://api.fastly.com/tls/certificates?page%5Bnumber%5D=1&page%5Bsize%5D=20&sort=created_at", + "self": "https://api.fastly.com/tls/certificates?page%5Bnumber%5D=1&page%5Bsize%5D=20&sort=created_at" + }, + "data": [ + { + "relationships": { + "tls_domains": { + "data": [ + { + "type": "tls_domain", + "id": "abcd.first.example1.com" + }, + { + "type": "tls_domain", + "id": "1234.first.example1.com" + }, + { + "type": "tls_domain", + "id": "sub-domain.first.example1.com" + } + ] + } + }, + "attributes": { + "updated_at": "2022-07-05T04:48:21.000Z", + "signature_algorithm": "SHA256-RSA", + "created_at": "2022-07-05T04:48:21.000Z", + "issued_to": "first.example1.com", + "issuer": "First CA", + "name": "first.example1.com", + "not_after": "2023-06-25T01:09:23.000Z", + "not_before": "2022-06-25T01:09:24.000Z", + "replace": false, + "serial_number": "52135557897532112355784498781325912334" + }, + "type": "tls_certificate", + "id": "ZfkhTtm4LdaOprVcdsffx4" + }, + { + "relationships": { + "tls_domains": { + "data": [ + { + "type": "tls_domain", + "id": 
"abcd1234.second.example2.com" + } + ] + } + }, + "attributes": { + "updated_at": "2023-09-09T14:46:31.000Z", + "signature_algorithm": "SHA256-RSA", + "created_at": "2023-09-09T14:46:31.000Z", + "issued_to": "second.example2.com", + "issuer": "Second CA", + "name": "My Testing Cert", + "not_after": "2024-08-29T11:07:33.000Z", + "not_before": "2023-08-29T11:07:34.000Z", + "replace": false, + "serial_number": "11106091125671337225612345678987654321" + }, + "type": "tls_certificate", + "id": "YkUe3r6S3zN4m6lVCd3sGc" + } + ] +} +`