diff --git a/.drone/drone.yml b/.drone/drone.yml index c7500616ee..086ed83ac2 100644 --- a/.drone/drone.yml +++ b/.drone/drone.yml @@ -10,7 +10,7 @@ steps: - docker login -u $DOCKER_LOGIN -p $DOCKER_PASSWORD - docker run --rm --privileged multiarch/qemu-user-static --reset -p yes - docker buildx create --name multiarch --driver docker-container --use - - docker buildx build --build-arg="GO_RUNTIME=golang:1.22.3-bullseye" --push --platform + - docker buildx build --build-arg="GO_RUNTIME=golang:1.22.5-bullseye" --push --platform linux/amd64,linux/arm64 -t grafana/alloy-build-image:$IMAGE_TAG ./tools/build-image environment: DOCKER_LOGIN: @@ -44,7 +44,7 @@ steps: - docker login -u $DOCKER_LOGIN -p $DOCKER_PASSWORD - docker run --rm --privileged multiarch/qemu-user-static --reset -p yes - docker buildx create --name multiarch --driver docker-container --use - - docker buildx build --build-arg="GO_RUNTIME=mcr.microsoft.com/oss/go/microsoft/golang:1.22.3-bullseye" + - docker buildx build --build-arg="GO_RUNTIME=mcr.microsoft.com/oss/go/microsoft/golang:1.22.5-bullseye" --push --platform linux/amd64,linux/arm64 -t grafana/alloy-build-image:$IMAGE_TAG ./tools/build-image environment: @@ -110,7 +110,7 @@ steps: - commands: - apt-get update -y && apt-get install -y libsystemd-dev - make lint - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Lint trigger: event: @@ -125,7 +125,7 @@ platform: steps: - commands: - make GO_TAGS="nodocker" test - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Run Go tests trigger: event: @@ -140,7 +140,7 @@ platform: steps: - commands: - K8S_USE_DOCKER_NETWORK=1 make test - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Run Go tests volumes: - name: docker @@ -163,7 +163,7 @@ platform: steps: - commands: - go test -tags="nodocker,nonetwork" ./... 
- image: grafana/alloy-build-image:v0.1.2-windows + image: grafana/alloy-build-image:v0.1.3-windows name: Run Go tests trigger: ref: @@ -178,7 +178,7 @@ platform: steps: - commands: - make alloy-image - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Build container volumes: - name: docker @@ -204,7 +204,7 @@ platform: steps: - commands: - '& "C:/Program Files/git/bin/bash.exe" -c "make alloy-image-windows"' - image: grafana/alloy-build-image:v0.1.2-windows + image: grafana/alloy-build-image:v0.1.3-windows name: Build container volumes: - name: docker @@ -231,7 +231,7 @@ steps: - make generate-ui - GO_TAGS="builtinassets promtail_journal_enabled" GOOS=linux GOARCH=amd64 GOARM= make alloy - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Build trigger: event: @@ -248,7 +248,7 @@ steps: - make generate-ui - GO_TAGS="builtinassets promtail_journal_enabled" GOOS=linux GOARCH=arm64 GOARM= make alloy - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Build trigger: event: @@ -265,7 +265,7 @@ steps: - make generate-ui - GO_TAGS="builtinassets promtail_journal_enabled" GOOS=linux GOARCH=ppc64le GOARM= make alloy - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Build trigger: event: @@ -282,7 +282,7 @@ steps: - make generate-ui - GO_TAGS="builtinassets promtail_journal_enabled" GOOS=linux GOARCH=s390x GOARM= make alloy - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Build trigger: event: @@ -298,7 +298,7 @@ steps: - commands: - make generate-ui - GO_TAGS="builtinassets" GOOS=darwin GOARCH=amd64 GOARM= make alloy - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Build trigger: event: @@ -314,7 +314,7 @@ steps: - commands: - make generate-ui - GO_TAGS="builtinassets" GOOS=darwin GOARCH=arm64 GOARM= make alloy - image: 
grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Build trigger: event: @@ -330,7 +330,7 @@ steps: - commands: - make generate-ui - GO_TAGS="builtinassets" GOOS=windows GOARCH=amd64 GOARM= make alloy - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Build trigger: event: @@ -346,7 +346,7 @@ steps: - commands: - make generate-ui - GO_TAGS="builtinassets" GOOS=freebsd GOARCH=amd64 GOARM= make alloy - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Build trigger: event: @@ -363,7 +363,7 @@ steps: - make generate-ui - GO_TAGS="builtinassets promtail_journal_enabled" GOOS=linux GOARCH=amd64 GOARM= GOEXPERIMENT=boringcrypto make alloy - image: grafana/alloy-build-image:v0.1.2-boringcrypto + image: grafana/alloy-build-image:v0.1.3-boringcrypto name: Build trigger: event: @@ -380,7 +380,7 @@ steps: - make generate-ui - GO_TAGS="builtinassets promtail_journal_enabled" GOOS=linux GOARCH=arm64 GOARM= GOEXPERIMENT=boringcrypto make alloy - image: grafana/alloy-build-image:v0.1.2-boringcrypto + image: grafana/alloy-build-image:v0.1.3-boringcrypto name: Build trigger: event: @@ -396,7 +396,7 @@ steps: - commands: - docker run --rm --privileged multiarch/qemu-user-static --reset -p yes failure: ignore - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Configure QEMU volumes: - name: docker @@ -416,7 +416,7 @@ steps: from_secret: docker_password GCR_CREDS: from_secret: gcr_admin - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Publish container volumes: - name: docker @@ -439,7 +439,7 @@ steps: - commands: - docker run --rm --privileged multiarch/qemu-user-static --reset -p yes failure: ignore - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Configure QEMU volumes: - name: docker @@ -459,7 +459,7 @@ steps: from_secret: docker_password GCR_CREDS: 
from_secret: gcr_admin - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Publish container volumes: - name: docker @@ -493,7 +493,7 @@ steps: from_secret: docker_password GCR_CREDS: from_secret: gcr_admin - image: grafana/alloy-build-image:v0.1.2-windows + image: grafana/alloy-build-image:v0.1.3-windows name: Build containers volumes: - name: docker @@ -516,7 +516,7 @@ steps: - commands: - docker run --rm --privileged multiarch/qemu-user-static --reset -p yes failure: ignore - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Configure QEMU volumes: - name: docker @@ -536,7 +536,7 @@ steps: from_secret: docker_password GCR_CREDS: from_secret: gcr_admin - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Publish container volumes: - name: docker @@ -559,7 +559,7 @@ steps: - commands: - docker run --rm --privileged multiarch/qemu-user-static --reset -p yes failure: ignore - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Configure QEMU volumes: - name: docker @@ -579,7 +579,7 @@ steps: from_secret: docker_password GCR_CREDS: from_secret: gcr_admin - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Publish container volumes: - name: docker @@ -613,7 +613,7 @@ steps: from_secret: docker_password GCR_CREDS: from_secret: gcr_admin - image: grafana/alloy-build-image:v0.1.2-windows + image: grafana/alloy-build-image:v0.1.3-windows name: Build containers volumes: - name: docker @@ -714,7 +714,7 @@ steps: from_secret: gpg_private_key GPG_PUBLIC_KEY: from_secret: gpg_public_key - image: grafana/alloy-build-image:v0.1.2 + image: grafana/alloy-build-image:v0.1.3 name: Publish release volumes: - name: docker @@ -737,7 +737,7 @@ steps: - commands: - DOCKER_OPTS="" make dist/alloy-linux-amd64 - DOCKER_OPTS="" make test-packages - image: grafana/alloy-build-image:v0.1.2 + image: 
grafana/alloy-build-image:v0.1.3 name: Test Linux system packages volumes: - name: docker @@ -835,6 +835,6 @@ kind: secret name: updater_private_key --- kind: signature -hmac: 60dbc7f93c24089a985433ec810276f8bbf84f6b93a173e27b2d278388329511 +hmac: ee269482e125ef982b0dffcf21ded182eb20960f97adf0513610fac49a1ab775 ... diff --git a/.github/workflows/check-linux-build-image.yml b/.github/workflows/check-linux-build-image.yml index b3ad180c01..5ea3e923e5 100644 --- a/.github/workflows/check-linux-build-image.yml +++ b/.github/workflows/check-linux-build-image.yml @@ -25,7 +25,7 @@ jobs: push: false tags: grafana/alloy-build-image:latest build-args: | - GO_RUNTIME=golang:1.22.3-bullseye + GO_RUNTIME=golang:1.22.5-bullseye - name: Create test Linux build image for boring crypto uses: docker/build-push-action@v5 @@ -34,4 +34,4 @@ jobs: push: false tags: grafana/alloy-build-image:latest build-args: | - GO_RUNTIME=mcr.microsoft.com/oss/go/microsoft/golang:1.22.3-bullseye + GO_RUNTIME=mcr.microsoft.com/oss/go/microsoft/golang:1.22.5-bullseye diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dace02414..15f4cb2f80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,23 @@ This document contains a historical list of changes between releases. Only changes that impact end-user behavior are listed; changes to documentation or internal API changes are not present. +v1.2.1 +----------------- + +### Bugfixes + +- Fixed an issue with `loki.source.kubernetes_events` not starting in large clusters due to short informer sync timeout. (@nrwiersma) + +- Updated [ckit](https://github.com/grafana/ckit) to fix an issue with armv7 panic on startup when forming a cluster. (@imavroukakis) + +- Fixed a clustering mode issue where a failure to perform static peers + discovery did not result in a fatal failure at startup and could lead to + potential split-brain issues. (@thampiotr) + +### Other + +- Use Go 1.22.5 for builds. 
(@mattdurham) + v1.2.0 ----------------- diff --git a/Dockerfile b/Dockerfile index a57b8a9db6..32933136fa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ # default when running `docker buildx build` or when DOCKER_BUILDKIT=1 is set # in environment variables. -FROM --platform=$BUILDPLATFORM grafana/alloy-build-image:v0.1.2 as build +FROM --platform=$BUILDPLATFORM grafana/alloy-build-image:v0.1.3 as build ARG BUILDPLATFORM ARG TARGETPLATFORM ARG TARGETOS diff --git a/Dockerfile.windows b/Dockerfile.windows index ca82be7758..cdb1a5d6cb 100644 --- a/Dockerfile.windows +++ b/Dockerfile.windows @@ -1,4 +1,4 @@ -FROM grafana/alloy-build-image:v0.1.2-windows as builder +FROM grafana/alloy-build-image:v0.1.3-windows as builder ARG VERSION ARG RELEASE_BUILD=1 ARG GO_TAGS diff --git a/VERSION b/VERSION index 9a952fb88b..b560c64f6f 100644 --- a/VERSION +++ b/VERSION @@ -20,4 +20,4 @@ # # Lines starting with "#" and blank lines are ignored. -v1.2.0 +v1.2.1 diff --git a/docs/Makefile b/docs/Makefile index d1ff60d771..2ea60a40c8 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -11,7 +11,7 @@ include docs.mk docs: check-cloudwatch-integration check-cloudwatch-integration: - $(PODMAN) run -v $(shell git rev-parse --show-toplevel):/repo -v $(shell pwd):/docs -w /repo golang:1.22.3-bullseye go run internal/static/integrations/cloudwatch_exporter/docs/doc.go check /docs/sources/reference/components/prometheus/prometheus.exporter.cloudwatch.md + $(PODMAN) run -v $(shell git rev-parse --show-toplevel):/repo -v $(shell pwd):/docs -w /repo golang:1.22.5-bullseye go run internal/static/integrations/cloudwatch_exporter/docs/doc.go check /docs/sources/reference/components/prometheus/prometheus.exporter.cloudwatch.md generate-cloudwatch-integration: - $(PODMAN) run -v $(shell git rev-parse --show-toplevel):/repo -v $(shell pwd):/docs -w /repo golang:1.22.3-bullseye go run internal/static/integrations/cloudwatch_exporter/docs/doc.go generate + $(PODMAN) run -v $(shell git rev-parse 
--show-toplevel):/repo -v $(shell pwd):/docs -w /repo golang:1.22.5-bullseye go run internal/static/integrations/cloudwatch_exporter/docs/doc.go generate diff --git a/docs/sources/_index.md b/docs/sources/_index.md index a7d4b3f1ba..c5ae695c75 100644 --- a/docs/sources/_index.md +++ b/docs/sources/_index.md @@ -4,7 +4,7 @@ title: Grafana Alloy description: Grafana Alloy is a a vendor-neutral distribution of the OTel Collector weight: 350 cascade: - ALLOY_RELEASE: v1.2.0 + ALLOY_RELEASE: v1.2.1 OTEL_VERSION: v0.87.0 FULL_PRODUCT_NAME: Grafana Alloy PRODUCT_NAME: Alloy diff --git a/go.mod b/go.mod index 0a031d9ea5..daa4542b21 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/grafana/alloy -go 1.22.3 +go 1.22.5 require ( cloud.google.com/go/pubsub v1.36.1 @@ -55,7 +55,7 @@ require ( github.com/grafana/alloy-remote-config v0.0.4 github.com/grafana/alloy/syntax v0.1.0 github.com/grafana/beyla v1.6.3 - github.com/grafana/ckit v0.0.0-20230906125525-c046c99a5c04 + github.com/grafana/ckit v0.0.0-20240624165704-36f3407a8eaa github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2 github.com/grafana/dskit v0.0.0-20240104111617-ea101a3b86eb github.com/grafana/go-gelf/v2 v2.0.1 @@ -460,6 +460,7 @@ require ( github.com/hashicorp/go-hclog v1.6.3 // indirect github.com/hashicorp/go-immutable-radix v1.3.1 // indirect github.com/hashicorp/go-msgpack v0.5.5 // indirect + github.com/hashicorp/go-msgpack/v2 v2.1.1 // indirect github.com/hashicorp/go-retryablehttp v0.7.4 // indirect github.com/hashicorp/go-rootcerts v1.0.2 // indirect github.com/hashicorp/go-secure-stdlib/awsutil v0.1.6 // indirect @@ -470,7 +471,7 @@ require ( github.com/hashicorp/go-version v1.7.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/hashicorp/mdns v1.0.4 // indirect - github.com/hashicorp/memberlist v0.5.0 // indirect + github.com/hashicorp/memberlist v0.5.1 // indirect github.com/hashicorp/nomad/api v0.0.0-20240306004928-3e7191ccb702 // indirect 
github.com/hashicorp/serf v0.10.1 // indirect github.com/hashicorp/vic v1.5.1-0.20190403131502-bbfe86ec9443 // indirect diff --git a/go.sum b/go.sum index 2006b16196..af8386d393 100644 --- a/go.sum +++ b/go.sum @@ -1011,8 +1011,8 @@ github.com/grafana/beyla v1.6.3 h1:Jonwtz2BouJ0A6vgcKZyZ4+zI22LAKDUm+NAD4WbwLI= github.com/grafana/beyla v1.6.3/go.mod h1:kNi1vKt2ESdoOy84+zq6Z7z67+BqwaMfEngcM/K8yKA= github.com/grafana/cadvisor v0.0.0-20231110094609-5f7917925dea h1:Q5f5/nJJ0SbusZjA6F6XkJuHDbl2/PqdTGw6wHsuccA= github.com/grafana/cadvisor v0.0.0-20231110094609-5f7917925dea/go.mod h1:XjiOCFjmxXIWwauV5p39Mr2Yxlpyk72uKQH1UZvd4fQ= -github.com/grafana/ckit v0.0.0-20230906125525-c046c99a5c04 h1:tG8Qxq4dN1WqakMmsPaxaH4+OQhYg5HVsarw5acLBX8= -github.com/grafana/ckit v0.0.0-20230906125525-c046c99a5c04/go.mod h1:HOnDIbkxfvVlDM5FBujt0uawGLfdpdTeqE7fIwfBmQk= +github.com/grafana/ckit v0.0.0-20240624165704-36f3407a8eaa h1:3rdc/z801roM6ky8cT8wz4tahQWkTxJ4VAmzANZe8qQ= +github.com/grafana/ckit v0.0.0-20240624165704-36f3407a8eaa/go.mod h1:k21VjCNs7gj1pAV80wb1577fVRePk51Hek5QUMEvKE0= github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2 h1:qhugDMdQ4Vp68H0tp/0iN17DM2ehRo1rLEdOFe/gB8I= github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2/go.mod h1:w/aiO1POVIeXUQyl0VQSZjl5OAGDTL5aX+4v0RA1tcw= github.com/grafana/dskit v0.0.0-20240104111617-ea101a3b86eb h1:AWE6+kvtE18HP+lRWNUCyvymyrFSXs6TcS2vXIXGIuw= @@ -1136,6 +1136,8 @@ github.com/hashicorp/go-msgpack v0.5.3/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iP github.com/hashicorp/go-msgpack v0.5.4/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= github.com/hashicorp/go-msgpack v0.5.5 h1:i9R9JSrqIz0QVLz3sz+i3YJdT7TTSLcfLLzJi9aZTuI= github.com/hashicorp/go-msgpack v0.5.5/go.mod h1:ahLV/dePpqEmjfWmKiqvPkv/twdG7iPBM1vqhUKIvfM= +github.com/hashicorp/go-msgpack/v2 v2.1.1 h1:xQEY9yB2wnHitoSzk/B9UjXWRQ67QKu5AOm8aFp8N3I= +github.com/hashicorp/go-msgpack/v2 v2.1.1/go.mod h1:upybraOAblm4S7rx0+jeNy+CWWhzywQsSRV5033mMu4= 
github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= github.com/hashicorp/go-multierror v1.1.0/go.mod h1:spPvp8C1qA32ftKqdAHm4hHTbPw+vmowP0z+KUhOZdA= github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= @@ -1202,8 +1204,9 @@ github.com/hashicorp/memberlist v0.1.3/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2p github.com/hashicorp/memberlist v0.1.4/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I= github.com/hashicorp/memberlist v0.1.5/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I= github.com/hashicorp/memberlist v0.3.0/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE= -github.com/hashicorp/memberlist v0.5.0 h1:EtYPN8DpAURiapus508I4n9CzHs2W+8NZGbmmR/prTM= github.com/hashicorp/memberlist v0.5.0/go.mod h1:yvyXLpo0QaGE59Y7hDTsTzDD25JYBZ4mHgHUZ8lrOI0= +github.com/hashicorp/memberlist v0.5.1 h1:mk5dRuzeDNis2bi6LLoQIXfMH7JQvAzt3mQD0vNZZUo= +github.com/hashicorp/memberlist v0.5.1/go.mod h1:zGDXV6AqbDTKTM6yxW0I4+JtFzZAJVoIPvss4hV8F24= github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69/go.mod h1:/z+jUGRBlwVpUZfjute9jWaF6/HuhjuFQuL1YXzVD1Q= github.com/hashicorp/nomad/api v0.0.0-20240306004928-3e7191ccb702 h1:fI1LXuBaS1d9z1kmb++Og6YD8uMRwadXorCwE+xgOFA= github.com/hashicorp/nomad/api v0.0.0-20240306004928-3e7191ccb702/go.mod h1:z71gkJdrkAt/Rl6C7Q79VE7AwJ5lUF+M+fzFTyIHYB0= diff --git a/internal/alloycli/cluster_builder.go b/internal/alloycli/cluster_builder.go index 8f32447aaa..3a73feaa44 100644 --- a/internal/alloycli/cluster_builder.go +++ b/internal/alloycli/cluster_builder.go @@ -1,6 +1,7 @@ package alloycli import ( + "errors" "fmt" stdlog "log" "net" @@ -9,13 +10,14 @@ import ( "time" "github.com/go-kit/log" - "github.com/grafana/alloy/internal/runtime/logging/level" - "github.com/grafana/alloy/internal/service/cluster" "github.com/grafana/ckit/advertise" "github.com/hashicorp/go-discover" "github.com/hashicorp/go-discover/provider/k8s" 
"github.com/prometheus/client_golang/prometheus" "go.opentelemetry.io/otel/trace" + + "github.com/grafana/alloy/internal/runtime/logging/level" + "github.com/grafana/alloy/internal/service/cluster" ) type clusterOptions struct { @@ -69,7 +71,7 @@ func buildClusterService(opts clusterOptions) (*cluster.Service, error) { return nil, fmt.Errorf("at most one of join peers and discover peers may be set") case len(opts.JoinPeers) > 0: - config.DiscoverPeers = newStaticDiscovery(opts.JoinPeers, listenPort) + config.DiscoverPeers = newStaticDiscovery(opts.JoinPeers, listenPort, opts.Log) case opts.DiscoverPeers != "": discoverFunc, err := newDynamicDiscovery(config.Log, opts.DiscoverPeers, listenPort) @@ -137,46 +139,66 @@ func appendDefaultPort(addr string, port int) string { type discoverFunc func() ([]string, error) -func newStaticDiscovery(peers []string, defaultPort int) discoverFunc { +func newStaticDiscovery(providedAddr []string, defaultPort int, log log.Logger) discoverFunc { return func() ([]string, error) { - var addrs []string - - for _, addr := range peers { - addrs = appendJoinAddr(addrs, addr) + addresses, err := buildJoinAddresses(providedAddr, log) + if err != nil { + return nil, fmt.Errorf("static peer discovery: %w", err) } - - for i := range addrs { + for i := range addresses { // Default to using the same advertise port as the local node. This may // break in some cases, so the user should make sure the port numbers // align on as many nodes as possible. - addrs[i] = appendDefaultPort(addrs[i], defaultPort) + addresses[i] = appendDefaultPort(addresses[i], defaultPort) } - - return addrs, nil + return addresses, nil } } -func appendJoinAddr(addrs []string, in string) []string { - _, _, err := net.SplitHostPort(in) - if err == nil { - addrs = append(addrs, in) - return addrs +func buildJoinAddresses(providedAddr []string, log log.Logger) ([]string, error) { + // Currently we don't consider it an error to not have any join addresses. 
+ if len(providedAddr) == 0 { + return nil, nil } + var ( + result []string + deferredErr error + ) + for _, addr := range providedAddr { + // If it's a host:port, use it as is and move on to the next provided address. + _, _, err := net.SplitHostPort(addr) + if err != nil { + deferredErr = errors.Join(deferredErr, err) + } else { + level.Debug(log).Log("msg", "found a host:port cluster join address", "addr", addr) + result = append(result, addr) + continue + } - ip := net.ParseIP(in) - if ip != nil { - addrs = append(addrs, ip.String()) - return addrs - } + // If it's an IP address, use it and move on to the next provided address. + ip := net.ParseIP(addr) + if ip != nil { + level.Debug(log).Log("msg", "found an IP cluster join address", "addr", addr) + result = append(result, ip.String()) + continue + } - _, srvs, err := net.LookupSRV("", "", in) - if err == nil { - for _, srv := range srvs { - addrs = append(addrs, srv.Target) + // Otherwise, do a DNS lookup and return all the records found. + _, srvs, err := net.LookupSRV("", "", addr) + if err != nil { + level.Warn(log).Log("msg", "failed to resolve SRV records", "addr", addr, "err", err) + deferredErr = errors.Join(deferredErr, err) + } else { + level.Debug(log).Log("msg", "found cluster join addresses via SRV records", "addr", addr, "count", len(srvs)) + for _, srv := range srvs { + result = append(result, srv.Target) + } } } - - return addrs + if len(result) == 0 { + return nil, fmt.Errorf("failed to find any valid join addresses: %w", deferredErr) + } + return result, nil } func newDynamicDiscovery(l log.Logger, config string, defaultPort int) (discoverFunc, error) { diff --git a/internal/alloycli/cluster_builder_test.go b/internal/alloycli/cluster_builder_test.go index 9d9622e68e..ff020aa7b8 100644 --- a/internal/alloycli/cluster_builder_test.go +++ b/internal/alloycli/cluster_builder_test.go @@ -1,6 +1,7 @@ package alloycli import ( + "os" "testing" "github.com/go-kit/log" @@ -54,3 +55,52 @@ func TestGetAdvertiseAddress(t *testing.T) { require.Equal(t, "127.0.0.1:80", addr) }) } + +func 
TestStaticDiscovery(t *testing.T) { + t.Run("no addresses provided", func(t *testing.T) { + logger := log.NewLogfmtLogger(os.Stdout) + sd := newStaticDiscovery([]string{}, 12345, logger) + actual, err := sd() + require.NoError(t, err) + require.Nil(t, actual) + }) + t.Run("host and port provided", func(t *testing.T) { + logger := log.NewLogfmtLogger(os.Stdout) + sd := newStaticDiscovery([]string{"host:8080"}, 12345, logger) + actual, err := sd() + require.NoError(t, err) + require.Equal(t, []string{"host:8080"}, actual) + }) + t.Run("IP provided and default port added", func(t *testing.T) { + logger := log.NewLogfmtLogger(os.Stdout) + sd := newStaticDiscovery([]string{"192.168.0.1"}, 12345, logger) + actual, err := sd() + require.NoError(t, err) + require.Equal(t, []string{"192.168.0.1:12345"}, actual) + }) + t.Run("fallback to next host and port provided", func(t *testing.T) { + logger := log.NewLogfmtLogger(os.Stdout) + sd := newStaticDiscovery([]string{"this | wont | work", "host2:8080"}, 12345, logger) + actual, err := sd() + require.NoError(t, err) + require.Equal(t, []string{"host2:8080"}, actual) + }) + t.Run("multiple host and port addresses are all used", func(t *testing.T) { + logger := log.NewLogfmtLogger(os.Stdout) + sd := newStaticDiscovery([]string{"host1:8080", "host2:8080"}, 12345, logger) + actual, err := sd() + require.NoError(t, err) + require.Equal(t, []string{"host1:8080", "host2:8080"}, actual) + }) + t.Run("nothing found", func(t *testing.T) { + logger := log.NewLogfmtLogger(os.Stdout) + sd := newStaticDiscovery([]string{"this | wont | work", "and/this/won't/either"}, 12345, logger) + actual, err := sd() + require.Nil(t, actual) + require.ErrorContains(t, err, "failed to find any valid join addresses") + require.ErrorContains(t, err, "this | wont | work: missing port in address") + require.ErrorContains(t, err, "lookup this | wont | work: no such host") + require.ErrorContains(t, err, "and/this/won't/either: missing port in address") + 
require.ErrorContains(t, err, "lookup and/this/won't/either: no such host") + }) +} diff --git a/internal/alloycli/cmd_run.go b/internal/alloycli/cmd_run.go index 63f40c94af..52d2edc736 100644 --- a/internal/alloycli/cmd_run.go +++ b/internal/alloycli/cmd_run.go @@ -19,6 +19,13 @@ import ( "github.com/KimMachineGun/automemlimit/memlimit" "github.com/fatih/color" "github.com/go-kit/log" + "github.com/grafana/ckit/advertise" + "github.com/grafana/ckit/peer" + "github.com/prometheus/client_golang/prometheus" + "github.com/spf13/cobra" + "go.opentelemetry.io/otel" + "golang.org/x/exp/maps" + "github.com/grafana/alloy/internal/alloyseed" "github.com/grafana/alloy/internal/boringcrypto" "github.com/grafana/alloy/internal/component" @@ -39,12 +46,6 @@ import ( "github.com/grafana/alloy/internal/static/config/instrumentation" "github.com/grafana/alloy/internal/usagestats" "github.com/grafana/alloy/syntax/diag" - "github.com/grafana/ckit/advertise" - "github.com/grafana/ckit/peer" - "github.com/prometheus/client_golang/prometheus" - "github.com/spf13/cobra" - "go.opentelemetry.io/otel" - "golang.org/x/exp/maps" // Install Components _ "github.com/grafana/alloy/internal/component/all" @@ -233,7 +234,7 @@ func (fr *alloyRun) Run(configPath string) error { ) clusterService, err := buildClusterService(clusterOptions{ - Log: l, + Log: log.With(l, "service", "cluster"), Tracer: t, Metrics: reg, diff --git a/internal/cmd/integration-tests/configs/kafka/Dockerfile b/internal/cmd/integration-tests/configs/kafka/Dockerfile index 624421c610..5c0205fcbe 100644 --- a/internal/cmd/integration-tests/configs/kafka/Dockerfile +++ b/internal/cmd/integration-tests/configs/kafka/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.22.3 as build +FROM golang:1.22.5 as build WORKDIR /app/ COPY go.mod go.sum ./ COPY syntax/go.mod syntax/go.sum ./syntax/ diff --git a/internal/cmd/integration-tests/configs/otel-metrics-gen/Dockerfile b/internal/cmd/integration-tests/configs/otel-metrics-gen/Dockerfile index 
0381f4c7b2..0d402bc156 100644 --- a/internal/cmd/integration-tests/configs/otel-metrics-gen/Dockerfile +++ b/internal/cmd/integration-tests/configs/otel-metrics-gen/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.22.3 as build +FROM golang:1.22.5 as build WORKDIR /app/ COPY go.mod go.sum ./ COPY syntax/go.mod syntax/go.sum ./syntax/ diff --git a/internal/cmd/integration-tests/configs/prom-gen/Dockerfile b/internal/cmd/integration-tests/configs/prom-gen/Dockerfile index eb48cbc4a7..3d66ab97ae 100644 --- a/internal/cmd/integration-tests/configs/prom-gen/Dockerfile +++ b/internal/cmd/integration-tests/configs/prom-gen/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.22.3 as build +FROM golang:1.22.5 as build WORKDIR /app/ COPY go.mod go.sum ./ COPY syntax/go.mod syntax/go.sum ./syntax/ diff --git a/internal/component/loki/source/kubernetes_events/kubernetes_events.go b/internal/component/loki/source/kubernetes_events/kubernetes_events.go index bf161db7b2..a0baf266fd 100644 --- a/internal/component/loki/source/kubernetes_events/kubernetes_events.go +++ b/internal/component/loki/source/kubernetes_events/kubernetes_events.go @@ -24,7 +24,7 @@ import ( ) // Generous timeout period for configuring informers -const informerSyncTimeout = 10 * time.Second +const informerSyncTimeout = 10 * time.Minute func init() { component.Register(component.Registration{ diff --git a/internal/service/cluster/cluster.go b/internal/service/cluster/cluster.go index d534dd6c34..fb060bc9e7 100644 --- a/internal/service/cluster/cluster.go +++ b/internal/service/cluster/cluster.go @@ -14,11 +14,6 @@ import ( "time" "github.com/go-kit/log" - "github.com/grafana/alloy/internal/component" - "github.com/grafana/alloy/internal/featuregate" - "github.com/grafana/alloy/internal/runtime/logging/level" - "github.com/grafana/alloy/internal/service" - http_service "github.com/grafana/alloy/internal/service/http" "github.com/grafana/ckit" "github.com/grafana/ckit/peer" "github.com/grafana/ckit/shard" @@ -27,25 +22,37 @@ 
import ( "go.opentelemetry.io/otel/trace" "go.opentelemetry.io/otel/trace/noop" "golang.org/x/net/http2" + + "github.com/grafana/alloy/internal/component" + "github.com/grafana/alloy/internal/featuregate" + "github.com/grafana/alloy/internal/runtime/logging/level" + "github.com/grafana/alloy/internal/service" + http_service "github.com/grafana/alloy/internal/service/http" + "github.com/grafana/alloy/internal/util" ) -// tokensPerNode is used to decide how many tokens each node should be given in -// the hash ring. All nodes must use the same value, otherwise they will have -// different views of the ring and assign work differently. -// -// Using 512 tokens strikes a good balance between distribution accuracy and -// memory consumption. A cluster of 1,000 nodes with 512 tokens per node -// requires 12MB for the hash ring. -// -// Distribution accuracy measures how close a node was to being responsible for -// exactly 1/N keys during simulation. Simulation tests used a cluster of 10 -// nodes and hashing 100,000 random keys: -// -// 512 tokens per node: min 96.1%, median 99.9%, max 103.2% (stddev: 197.9 hashes) -const tokensPerNode = 512 +const ( + // ServiceName defines the name used for the cluster service. + ServiceName = "cluster" -// ServiceName defines the name used for the cluster service. -const ServiceName = "cluster" + // tokensPerNode is used to decide how many tokens each node should be given in + // the hash ring. All nodes must use the same value, otherwise they will have + // different views of the ring and assign work differently. + // + // Using 512 tokens strikes a good balance between distribution accuracy and + // memory consumption. A cluster of 1,000 nodes with 512 tokens per node + // requires 12MB for the hash ring. + // + // Distribution accuracy measures how close a node was to being responsible for + // exactly 1/N keys during simulation. 
Simulation tests used a cluster of 10 + // nodes and hashing 100,000 random keys: + // + // 512 tokens per node: min 96.1%, median 99.9%, max 103.2% (stddev: 197.9 hashes) + tokensPerNode = 512 + + // maxPeersToLog is the maximum number of peers to log on info level. All peers are logged on debug level. + maxPeersToLog = 10 +) // Options are used to configure the cluster service. Options are constant for // the lifetime of the cluster service. @@ -213,11 +220,7 @@ func (s *Service) Run(ctx context.Context, host service.Host) error { spanCtx, span := tracer.Start(ctx, "NotifyClusterChange", trace.WithSpanKind(trace.SpanKindInternal)) defer span.End() - names := make([]string, len(peers)) - for i, p := range peers { - names[i] = p.Name - } - level.Info(s.log).Log("msg", "peers changed", "new_peers", strings.Join(names, ",")) + s.logPeers("peers changed", toStringSlice(peers)) // Notify all components about the clustering change. components := component.GetAllComponents(host, component.InfoOptions{}) @@ -246,11 +249,18 @@ func (s *Service) Run(ctx context.Context, host service.Host) error { peers, err := s.getPeers() if err != nil { - return fmt.Errorf("failed to get peers to join: %w", err) + // Fail fast on startup if we can't discover peers to prevent a split brain and give a clear signal to the user. + return fmt.Errorf("failed to get peers to join at startup: %w", err) } - level.Info(s.log).Log("msg", "starting cluster node", "peers", strings.Join(peers, ","), - "advertise_addr", s.opts.AdvertiseAddress) + // We log on info level including all the peers (without any abbreviation), as it's happening only on startup and + // won't spam too much in most cases. In other cases we should either abbreviate the list or log on debug level. 
+ level.Info(s.log).Log( + "msg", "starting cluster node", + "peers_count", len(peers), + "peers", strings.Join(peers, ","), + "advertise_addr", s.opts.AdvertiseAddress, + ) if err := s.node.Start(peers); err != nil { level.Warn(s.log).Log("msg", "failed to connect to peers; bootstrapping a new cluster", "err", err) @@ -281,8 +291,8 @@ func (s *Service) Run(ctx context.Context, host service.Host) error { level.Warn(s.log).Log("msg", "failed to refresh list of peers", "err", err) continue } + s.logPeers("rejoining peers", peers) - level.Info(s.log).Log("msg", "rejoining peers", "peers", strings.Join(peers, ",")) if err := s.node.Start(peers); err != nil { level.Error(s.log).Log("msg", "failed to rejoin list of peers", "err", err) continue @@ -306,6 +316,13 @@ func (s *Service) getPeers() ([]string, error) { return nil, err } + // Debug level log all the peers for troubleshooting. + level.Debug(s.log).Log( + "msg", "discovered peers", + "peers_count", len(peers), + "peers", strings.Join(peers, ","), + ) + // Here we return the entire list because we can't take a subset. if s.opts.ClusterMaxJoinPeers == 0 || len(peers) < s.opts.ClusterMaxJoinPeers { return peers, nil @@ -347,6 +364,15 @@ func (s *Service) Data() any { return &sharderCluster{sharder: s.sharder} } +func (s *Service) logPeers(msg string, peers []string) { + // Truncate peers list on info level. + level.Info(s.log).Log( + "msg", msg, + "peers_count", len(peers), + "peers", util.JoinWithTruncation(peers, ",", maxPeersToLog, "..."), + ) +} + // Component is a component which subscribes to clustering updates. type Component interface { component.Component @@ -394,3 +420,11 @@ func (sc *sharderCluster) Lookup(key shard.Key, replicationFactor int, op shard. 
func (sc *sharderCluster) Peers() []peer.Peer { return sc.sharder.Peers() } + +func toStringSlice[T any](slice []T) []string { + s := make([]string, 0, len(slice)) + for _, p := range slice { + s = append(s, fmt.Sprintf("%v", p)) + } + return s +} diff --git a/internal/service/cluster/cluster_test.go b/internal/service/cluster/cluster_test.go index 02fba7b9be..d11a791a8e 100644 --- a/internal/service/cluster/cluster_test.go +++ b/internal/service/cluster/cluster_test.go @@ -2,8 +2,10 @@ package cluster import ( "math/rand" + "os" "testing" + "github.com/go-kit/log" "github.com/stretchr/testify/require" ) @@ -46,6 +48,7 @@ func TestGetPeers(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { s := &Service{ + log: log.NewLogfmtLogger(os.Stdout), opts: test.opts, randGen: rand.New(rand.NewSource(1)), // Seeded random generator to have consistent results in tests. } diff --git a/internal/util/strings.go b/internal/util/strings.go index 0caaadcecb..cea793251e 100644 --- a/internal/util/strings.go +++ b/internal/util/strings.go @@ -3,6 +3,8 @@ package util import ( "regexp" "strings" + + "k8s.io/utils/strings/slices" ) // CamelToSnake is a helper function for converting CamelCase to Snake Case @@ -13,3 +15,25 @@ func CamelToSnake(str string) string { snake = matchAllCap.ReplaceAllString(snake, "${1}_${2}") return strings.ToLower(snake) } + +// JoinWithTruncation joins a slice of string elements with a separator sep, truncating the middle if the slice is longer +// than maxElements, using abbreviation as a placeholder for the truncated part. The last element of the slice is always +// included in the result. For example: ["1", "2", "3", "4"] with sep=", ", maxElements=3 and abbreviation="..." will +// return "1, 2, ..., 4".
+func JoinWithTruncation(elements []string, sep string, maxElements int, abbreviation string) string { + if maxElements <= 0 { + return "" + } + if len(elements) <= maxElements { + return strings.Join(elements, sep) + } + // We know now that len(elements) > maxElements >= 1, so we need to truncate something. + // Handle the special case of maxElements == 1. + if maxElements == 1 { + return elements[0] + sep + abbreviation + } + // We know now that len(elements) > maxElements >= 2, can safely truncate the middle. + result := slices.Clone(elements[:maxElements-1]) + result = append(result, abbreviation, elements[len(elements)-1]) + return strings.Join(result, sep) +} diff --git a/internal/util/strings_test.go b/internal/util/strings_test.go new file mode 100644 index 0000000000..cebbc5ad39 --- /dev/null +++ b/internal/util/strings_test.go @@ -0,0 +1,130 @@ +package util + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestJoinWithTruncation(t *testing.T) { + type args struct { + str []string + sep string + maxLength int + abbreviation string + } + tests := []struct { + name string + args args + expected string + }{ + { + name: "empty slice", + args: args{str: []string{}, sep: ", ", maxLength: 0, abbreviation: "..."}, + expected: "", + }, + { + name: "empty slice 2", + args: args{str: []string{}, sep: ", ", maxLength: 10, abbreviation: "..."}, + expected: "", + }, + { + name: "smaller slice", + args: args{str: []string{"one", "two", "three"}, sep: ", ", maxLength: 10}, + expected: "one, two, three", + }, + { + name: "truncate slice", + args: args{ + str: []string{"one", "two", "three", "four", "five", "six"}, + sep: ", ", + maxLength: 4, + abbreviation: "[...]", + }, + expected: "one, two, three, [...], six", + }, + { + name: "truncate to 0", + args: args{ + str: []string{"one", "two", "three", "four", "five", "six"}, + sep: ", ", + maxLength: 0, + abbreviation: "[...]", + }, + expected: "", + }, + { + name: "truncate to 1", + args: args{ + 
str: []string{"one", "two", "three", "four", "five", "six"}, + sep: ", ", + maxLength: 1, + abbreviation: "[...]", + }, + expected: "one, [...]", + }, + { + name: "truncate to 2", + args: args{ + str: []string{"one", "two", "three", "four", "five", "six"}, + sep: ", ", + maxLength: 2, + abbreviation: "[...]", + }, + expected: "one, [...], six", + }, + { + name: "single element to 0", + args: args{ + str: []string{"one"}, + sep: ", ", + maxLength: 0, + abbreviation: "[...]", + }, + expected: "", + }, + { + name: "single element to 1", + args: args{ + str: []string{"one"}, + sep: ", ", + maxLength: 1, + abbreviation: "[...]", + }, + expected: "one", + }, + { + name: "single element to 2", + args: args{ + str: []string{"one"}, + sep: ", ", + maxLength: 2, + abbreviation: "[...]", + }, + expected: "one", + }, + { + name: "cluster peers example", + args: args{ + str: []string{ + "grafana-agent-helm-15.grafana-agent-helm.grafana-agent.svc.cluster.local.:3090", + "grafana-agent-helm-6.grafana-agent-helm.grafana-agent.svc.cluster.local.:3090", + "grafana-agent-helm-16.grafana-agent-helm.grafana-agent.svc.cluster.local.:3090", + "grafana-agent-helm-2.grafana-agent-helm.grafana-agent.svc.cluster.local.:3090", + }, + sep: ",", + maxLength: 2, + abbreviation: "...", + }, + expected: "grafana-agent-helm-15.grafana-agent-helm.grafana-agent.svc.cluster.local.:3090," + + "...," + + "grafana-agent-helm-2.grafana-agent-helm.grafana-agent.svc.cluster.local.:3090", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + actual := JoinWithTruncation(tt.args.str, tt.args.sep, tt.args.maxLength, tt.args.abbreviation) + require.Equal(t, tt.expected, actual) + }) + } +} diff --git a/tools/build-image/Dockerfile b/tools/build-image/Dockerfile index 4ff97d2bed..7c95dbcca7 100644 --- a/tools/build-image/Dockerfile +++ b/tools/build-image/Dockerfile @@ -4,7 +4,7 @@ # default when running `docker buildx build` or when DOCKER_BUILDKIT=1 is set # in environment variables. 
-# NOTE: The GO_RUNTIME is used to switch between the default Google go runtime and mcr.microsoft.com/oss/go/microsoft/golang:1.22.3-bullseye which is a Microsoft +# NOTE: The GO_RUNTIME is used to switch between the default Google go runtime and mcr.microsoft.com/oss/go/microsoft/golang:1.22.5-bullseye which is a Microsoft # fork of go that allows using windows crypto instead of boring crypto. Details at https://github.com/microsoft/go/tree/microsoft/main/eng/doc/fips ARG GO_RUNTIME=mustoverride diff --git a/tools/make/build-container.mk b/tools/make/build-container.mk index be79eef452..9c92bd45a4 100644 --- a/tools/make/build-container.mk +++ b/tools/make/build-container.mk @@ -34,7 +34,7 @@ # variable names should be passed through to the container. USE_CONTAINER ?= 0 -BUILD_IMAGE_VERSION ?= v0.1.2 +BUILD_IMAGE_VERSION ?= v0.1.3 BUILD_IMAGE ?= grafana/alloy-build-image:$(BUILD_IMAGE_VERSION) DOCKER_OPTS ?= -it