diff --git a/build/Dockerfile b/build/Dockerfile index ada89965..329f4484 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -12,6 +12,7 @@ RUN go mod download # Copy the go source COPY cmd/operator/main.go cmd/operator/main.go COPY pkg/ pkg/ +COPY must-gather/ must-gather/ # Build RUN GOOS=linux GOARCH=amd64 go build -a -tags netgo,osusergo -o manager cmd/operator/main.go @@ -22,6 +23,7 @@ FROM gcr.io/distroless/static:nonroot WORKDIR / COPY --from=builder /workspace/manager . +COPY --from=builder /workspace/must-gather/collection-scripts/* /usr/bin/ USER 65532:65532 ENTRYPOINT ["/manager"] diff --git a/must-gather/README.md b/must-gather/README.md new file mode 100644 index 00000000..d5e9af7d --- /dev/null +++ b/must-gather/README.md @@ -0,0 +1,61 @@ +observability-operator must-gather +================= + +`observability-operator-must-gather` is a tool built on top of [OpenShift must-gather](https://github.com/openshift/must-gather) +that expands its capabilities to gather Observability Operator information. + +**Note:** This image is only built for the x86_64 architecture. + +### Usage +To gather only Observability Operator information: +```sh +oc adm must-gather --image=quay.io/rhobs/observability-operator:latest -- /usr/bin/gather +``` + +To gather default [OpenShift must-gather](https://github.com/openshift/must-gather) in addition to Observability Operator information: +```sh +oc adm must-gather --image-stream=openshift/must-gather --image=quay.io/rhobs/observability-operator -- /usr/bin/gather +``` + +The command above will create a local directory with a dump of the Observability Operator state. + +You will get a dump of: +- The observability-operator operator deployment +- All observability-operator operant pods +- Alertmanager and Prometheus status for all stacks + +In order to get data about other parts of the cluster (not specific to observability-operator) you should +run `oc adm must-gather` (without passing a custom image). 
Run `oc adm must-gather -h` to see more options.
+
+Example must-gather for observability-operator output:
+```
+monitoring
+└── observability-operator
+    ├── [namespace name]
+    │   └── [monitoring stack name]
+    │       ├── alertmanager
+    │       │   ├── status.json
+    │       │   └── status.stderr
+    │       └── prometheus
+    │           ├── alertmanagers.json
+    │           ├── alertmanagers.stderr
+    │           ├── prometheus-[monitoring stack name]-[replica]
+    │           │   ├── status
+    │           │   │   ├── runtimeinfo.json
+    │           │   │   ├── runtimeinfo.stderr
+    │           │   │   ├── tsdb.json
+    │           │   │   └── tsdb.stderr
+    │           │   ├── targets-active.json
+    │           │   ├── targets-active.stderr
+    │           │   ├── targets?state=active.json
+    │           │   └── targets?state=active.stderr
+    │           ├── rules.json
+    │           ├── rules.stderr
+    │           └── status
+    │               ├── config.json
+    │               ├── config.stderr
+    │               ├── flags.json
+    │               └── flags.stderr
+    ├── operants.yaml
+    └── operator.yaml
+```
diff --git a/must-gather/collection-scripts/common.sh b/must-gather/collection-scripts/common.sh
new file mode 100644
index 00000000..7eefb118
--- /dev/null
+++ b/must-gather/collection-scripts/common.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+# Helpers shared by the observability-operator must-gather collection scripts.
+
+# safeguards
+set -o nounset
+set -o errexit
+set -o pipefail
+
+# first_running_pod prints the name of the first pod in phase Running (no
+# readiness condition is checked) that carries the labels of a stack component.
+# Arguments:
+#   $1 - namespace of the monitoring stack
+#   $2 - monitoring stack name (app.kubernetes.io/part-of label value)
+#   $3 - component (app.kubernetes.io/component label value)
+# Returns:
+#   0 with the pod name on stdout, 1 with a message on stderr if none matches
+first_running_pod() {
+  local ns="$1"; shift
+  local name="$1"; shift
+  local component="$1"; shift
+  # local, so the array does not leak into the script sourcing this file
+  local -a pods=()
+
+  readarray -t pods < <(
+    oc get pods -n "$ns" -l app.kubernetes.io/part-of="$name",app.kubernetes.io/component="$component" --field-selector=status.phase==Running \
+      --no-headers -o custom-columns=":metadata.name"
+  )
+
+  # under nounset, expanding element 0 of an empty array aborts with
+  # "unbound variable"; report the missing pod explicitly instead
+  if [[ -z "${pods[0]:-}" ]]; then
+    echo "ERROR: no running ${component} pod found for ${name} in ${ns}" >&2
+    return 1
+  fi
+  echo "${pods[0]}"
+}
+
+# get_first_ready_prom_pod prints the first running Prometheus pod of a stack.
+# Arguments: $1 - namespace, $2 - monitoring stack name
+get_first_ready_prom_pod() {
+  first_running_pod "$1" "$2" prometheus
+}
+
+# get_first_ready_alertmanager_pod prints the first running Alertmanager pod
+# of a stack.
+# Arguments: $1 - namespace, $2 - monitoring stack name
+get_first_ready_alertmanager_pod() {
+  first_running_pod "$1" "$2" alertmanager
+}
diff --git a/must-gather/collection-scripts/gather
b/must-gather/collection-scripts/gather
new file mode 100644
index 00000000..83ce6e7d
--- /dev/null
+++ b/must-gather/collection-scripts/gather
@@ -0,0 +1,193 @@
+#!/bin/bash
+#
+# Gathers observability-operator data for must-gather: pod manifests of the
+# operator and its operands, plus Prometheus and Alertmanager API dumps for
+# every MonitoringStack.
+
+# safeguards
+set -o nounset
+set -o errexit
+set -o pipefail
+
+# global readonly constants
+declare -r BASE_COLLECTION_PATH="../must-gather"
+declare -r COLLECTION_PATH="${BASE_COLLECTION_PATH}/monitoring/observability-operator"
+
+source "$(dirname "$0")"/common.sh
+
+# init creates the collection directory and fills the global MON_STACK_NSS
+# array with the namespaces that contain MonitoringStacks.
+init() {
+  mkdir -p "${COLLECTION_PATH}"
+
+  # sort -u: a namespace with several stacks is listed once per stack, but
+  # monitoring_gather already walks every stack of each namespace it visits
+  readarray -t MON_STACK_NSS < <(
+    oc get monitoringstacks --all-namespaces --no-headers -o custom-columns=":metadata.namespace" | sort -u
+  ) || true
+}
+
+# operants_get dumps the pods managed by or part of the operator into
+# operants.yaml and the operator's own pods into operator.yaml.
+operants_get() {
+  # the two pod lists are separate YAML documents, hence the "---" separator
+  {
+    oc get pods --all-namespaces -l app.kubernetes.io/managed-by=observability-operator -o yaml
+    echo "---"
+    oc get pods --all-namespaces -l app.kubernetes.io/part-of=observability-operator -o yaml
+  } > "$COLLECTION_PATH"/operants.yaml
+  # -o yaml so the file holds manifests rather than the default table output
+  oc get pods --all-namespaces -l app.kubernetes.io/name=observability-operator -o yaml > "$COLLECTION_PATH"/operator.yaml
+}
+
+# prom_get makes an http GET request to /api/v1/$object on the first running
+# prometheus pod of a stack and stores the stdout and stderr results.
+# Arguments:
+#   $1 - API object, e.g. rules or status/config
+#   $2 - namespace of the monitoring stack
+#   $3 - monitoring stack name
+prom_get() {
+  local object="$1"; shift
+  local ns="$1"; shift
+  local name="$1"; shift
+  local pod
+
+  if ! pod=$(get_first_ready_prom_pod "$ns" "$name") || [[ -z "$pod" ]]; then
+    # nothing to exec into: record the reason where the result would have been
+    local result_path="$COLLECTION_PATH/$ns/$name/prometheus/$object"
+    mkdir -p "$(dirname "$result_path")"
+    echo "ERROR: no running prometheus pod found for ${name} in ${ns}" > "${result_path}.stderr"
+    return 1
+  fi
+
+  prom_get_from_replica "$pod" "$object" "$ns" "$name"
+}
+
+# prom_get_from_replica makes an http GET request to /api/v1/$object on one
+# given prometheus pod and stores the stdout and stderr results.
+# Arguments:
+#   $1 - pod to query
+#   $2 - API object, may carry a query string
+#   $3 - namespace of the monitoring stack
+#   $4 - monitoring stack name
+#   $5 - optional result path below the stack's prometheus directory,
+#        defaults to $2
+prom_get_from_replica() {
+  local replica="$1"; shift
+  local object="$1"; shift
+  local ns="$1"; shift
+  local name="$1"; shift
+  local path="${1:-$object}"; shift || true
+
+  local result_path="$COLLECTION_PATH/$ns/$name/prometheus/$path"
+  mkdir -p "$(dirname "${result_path}")"
+
+  echo "INFO: Getting ${object} from ${replica}"
+  # the URL is single-quoted for the remote shell so that ? and & in a query
+  # string are passed to curl verbatim
+  oc exec "${replica}" \
+    -c prometheus \
+    -n "$ns" \
+    -- /bin/bash -c "curl -sG 'http://localhost:9090/api/v1/${object}'" \
+    > "${result_path}.json" \
+    2> "${result_path}.stderr"
+}
+
+# prom_get_from_replicas runs prom_get_from_replica against every prometheus
+# pod owned by a StatefulSet of the stack; results go to <pod>/<path>.
+# Arguments:
+#   $1 - API object, may carry a query string
+#   $2 - namespace of the monitoring stack
+#   $3 - monitoring stack name
+#   $4 - optional result name, defaults to $1
+prom_get_from_replicas() {
+  local object="$1"; shift
+  local ns="$1"; shift
+  local name="$1"; shift
+  local path="${1:-$object}"; shift || true
+  local -a stss=() pods=()
+  local sts pod
+
+  readarray -t stss < <(
+    oc get sts -n "$ns" -l app.kubernetes.io/part-of="$name" --no-headers -o custom-columns=":metadata.uid"
+  ) || true
+  for sts in "${stss[@]}"; do
+    # match the owner uid exactly; []? tolerates pods without ownerReferences,
+    # which would otherwise abort jq and drop the remaining pods
+    readarray -t pods < <(
+      oc get pods -n "$ns" -l app.kubernetes.io/component=prometheus -o json \
+        | jq -r --arg uid "$sts" '.items[] | select(any(.metadata.ownerReferences[]?; .uid == $uid)) | .metadata.name'
+    ) || true
+    for pod in "${pods[@]}"; do
+      prom_get_from_replica "${pod}" "${object}" "$ns" "$name" "${pod}/${path}" || true
+    done
+  done
+}
+
+# alertmanager_get makes an http GET request to /api/v2/$object on the first
+# running alertmanager pod of a stack and stores the stdout and stderr results.
+# Arguments:
+#   $1 - API object, e.g. status
+#   $2 - namespace of the monitoring stack
+#   $3 - monitoring stack name
+alertmanager_get() {
+  local object="$1"; shift
+  local ns="$1"; shift
+  local name="$1"; shift
+  local pod
+
+  local result_path="$COLLECTION_PATH/$ns/$name/alertmanager/$object"
+  mkdir -p "$(dirname "$result_path")"
+
+  if ! pod=$(get_first_ready_alertmanager_pod "$ns" "$name") || [[ -z "$pod" ]]; then
+    # nothing to exec into: record the reason where the result would have been
+    echo "ERROR: no running alertmanager pod found for ${name} in ${ns}" > "${result_path}.stderr"
+    return 1
+  fi
+
+  echo "INFO: Getting ${object} from ${pod}"
+  # exec in the stack's own namespace, which is where the pod was looked up
+  oc exec "${pod}" \
+    -c alertmanager \
+    -n "$ns" \
+    -- /bin/bash -c "curl -sG 'http://localhost:9093/api/v2/${object}'" \
+    > "${result_path}.json" \
+    2> "${result_path}.stderr"
+}
+
+# monitoring_gather collects the operator data and, for every monitoring
+# stack, the prometheus and alertmanager API dumps.
+monitoring_gather() {
+  local ns name
+  local -a stack_names=()
+
+  init
+
+  # begin gathering
+  # NOTE || true ignores failures
+
+  operants_get || true
+
+  for ns in "${MON_STACK_NSS[@]}"; do
+    readarray -t stack_names < <(
+      oc get monitoringstacks -n "$ns" --no-headers -o custom-columns=":metadata.name"
+    ) || true
+    for name in "${stack_names[@]}"; do
+      prom_get alertmanagers "$ns" "$name" || true
+      prom_get rules "$ns" "$name" || true
+      prom_get status/config "$ns" "$name" || true
+      prom_get status/flags "$ns" "$name" || true
+
+      # using prom_get_from_replicas as the state differs for each replica
+      prom_get_from_replicas status/runtimeinfo "$ns" "$name" || true
+      prom_get_from_replicas 'targets?state=active' "$ns" "$name" targets-active || true
+      prom_get_from_replicas status/tsdb "$ns" "$name" || true
+
+      alertmanager_get status "$ns" "$name" || true
+    done
+  done
+
+  sync
+}
+
+monitoring_gather