Skip to content

Commit

Permalink
feat: add basic gather script for monitoring components
Browse files Browse the repository at this point in the history
Signed-off-by: Jan Fajerski <jfajersk@redhat.com>
  • Loading branch information
jan--f committed Oct 30, 2024
1 parent b243203 commit e54621b
Show file tree
Hide file tree
Showing 4 changed files with 228 additions and 0 deletions.
2 changes: 2 additions & 0 deletions build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ RUN go mod download
# Copy the go source
COPY cmd/operator/main.go cmd/operator/main.go
COPY pkg/ pkg/
# Copy the must-gather collection scripts so the final stage can pick them up
# from /workspace/must-gather/collection-scripts
COPY must-gather/ must-gather/

# Build
RUN GOOS=linux GOARCH=amd64 go build -a -tags netgo,osusergo -o manager cmd/operator/main.go
Expand All @@ -22,6 +23,7 @@ FROM gcr.io/distroless/static:nonroot
WORKDIR /

COPY --from=builder /workspace/manager .
# Ship the must-gather collection scripts next to the manager binary.
# The flag is `--from=<stage>`; `--from-builder` is not a valid COPY flag and
# makes the build fail.
# NOTE(review): must-gather/README.md invokes /usr/bin/gather, while this copy
# places the scripts in / (the WORKDIR) - confirm which location is intended.
COPY --from=builder /workspace/must-gather/collection-scripts/* .
USER 65532:65532

ENTRYPOINT ["/manager"]
61 changes: 61 additions & 0 deletions must-gather/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
observability-operator must-gather
=================

`observability-operator-must-gather` is a tool built on top of [OpenShift must-gather](https://github.com/openshift/must-gather)
that expands its capabilities to gather Observability Operator information.

**Note:** This image is only built for x86_64 architecture

### Usage
To gather only Observability Operator information:
```sh
oc adm must-gather --image=quay.io/rhobs/observability-operator:latest -- /usr/bin/gather
```

To gather default [OpenShift must-gather](https://github.com/openshift/must-gather) in addition to Observability Operator information:
```sh
oc adm must-gather --image-stream=openshift/must-gather --image=quay.io/rhobs/observability-operator -- /usr/bin/gather
```

The command above will create a local directory with a dump of the Observability Operator state.

You will get a dump of:
- The observability-operator deployment
- All observability-operator operand pods
- Alertmanager and Prometheus status for all stacks

In order to get data about other parts of the cluster (not specific to observability-operator) you should
run `oc adm must-gather` (without passing a custom image). Run `oc adm must-gather -h` to see more options.

Example must-gather output for observability-operator:
```
monitoring
└── observability-operator
    ├── [namespace name]
    │   └── [monitoring stack name]
    │       ├── alertmanager
    │       │   ├── status.json
    │       │   └── status.stderr
    │       └── prometheus
    │           ├── alertmanagers.json
    │           ├── alertmanagers.stderr
    │           ├── prometheus-[monitoring stack name]-[replica]
    │           │   ├── status
    │           │   │   ├── runtimeinfo.json
    │           │   │   ├── runtimeinfo.stderr
    │           │   │   ├── tsdb.json
    │           │   │   └── tsdb.stderr
    │           │   ├── targets-active.json
    │           │   ├── targets-active.stderr
    │           │   ├── targets?state=active.json
    │           │   └── targets?state=active.stderr
    │           ├── rules.json
    │           ├── rules.stderr
    │           └── status
    │               ├── config.json
    │               ├── config.stderr
    │               ├── flags.json
    │               └── flags.stderr
    ├── operants.yaml
    └── operator.yaml
```
26 changes: 26 additions & 0 deletions must-gather/collection-scripts/common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash

# safeguards: fail on unset variables, on failing commands and on a failure
# in any stage of a pipeline.
# The gather script loads this file with `source`, so these options are set in
# the shell that sources it.
set -o nounset
set -o errexit
set -o pipefail

# get_first_ready_prom_pod prints the name of the first Prometheus pod of a
# monitoring stack whose phase is Running.
# Arguments: $1 - namespace to search
#            $2 - stack name (value of the app.kubernetes.io/part-of label)
# Outputs:   the pod name on stdout; a diagnostic on stderr if none is found
# Returns:   0 if a pod was found, 1 otherwise
get_first_ready_prom_pod() {
  local ns="$1"; shift
  local name="$1"; shift
  # Keep the result local so repeated calls do not share a global array.
  local -a ready_pods=()
  readarray -t ready_pods < <(
    oc get pods -n "$ns" -l app.kubernetes.io/part-of="$name",app.kubernetes.io/component=prometheus --field-selector=status.phase==Running \
      --no-headers -o custom-columns=":metadata.name"
  )
  # Reading element 0 of an empty array aborts with "unbound variable" under
  # `set -o nounset`; report the real problem instead.
  if (( ${#ready_pods[@]} == 0 )) || [[ -z "${ready_pods[0]}" ]]; then
    echo "ERROR: no running prometheus pod found for ${ns}/${name}" >&2
    return 1
  fi
  echo "${ready_pods[0]}"
}

# get_first_ready_alertmanager_pod prints the name of the first Alertmanager
# pod of a monitoring stack whose phase is Running.
# Arguments: $1 - namespace to search
#            $2 - stack name (value of the app.kubernetes.io/part-of label)
# Outputs:   the pod name on stdout; a diagnostic on stderr if none is found
# Returns:   0 if a pod was found, 1 otherwise
get_first_ready_alertmanager_pod() {
  local ns="$1"; shift
  local name="$1"; shift
  # Keep the result local so repeated calls do not share a global array.
  local -a ready_pods=()
  readarray -t ready_pods < <(
    oc get pods -n "$ns" -l app.kubernetes.io/part-of="$name",app.kubernetes.io/component=alertmanager --field-selector=status.phase==Running \
      --no-headers -o custom-columns=":metadata.name"
  )
  # Reading element 0 of an empty array aborts with "unbound variable" under
  # `set -o nounset`; report the real problem instead.
  if (( ${#ready_pods[@]} == 0 )) || [[ -z "${ready_pods[0]}" ]]; then
    echo "ERROR: no running alertmanager pod found for ${ns}/${name}" >&2
    return 1
  fi
  echo "${ready_pods[0]}"
}
139 changes: 139 additions & 0 deletions must-gather/collection-scripts/gather
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
#!/bin/bash

# safeguards: fail on unset variables, on failing commands and on a failure
# in any stage of a pipeline.
set -o nounset
set -o errexit
set -o pipefail

# global readonly constants
# BASE_COLLECTION_PATH is a relative path, so it resolves against the working
# directory the script is started from.
declare -r BASE_COLLECTION_PATH="../must-gather"
# Directory below which every file collected by this script is written.
declare -r COLLECTION_PATH="${BASE_COLLECTION_PATH}/monitoring/observability-operator"

# Load the pod lookup helpers from common.sh, located in the same directory
# as this script.
source "$(dirname "$0")"/common.sh

# init prepares a collection run.
# Globals:   COLLECTION_PATH (read) - output directory, created if missing
#            MON_STACK_NSS (written) - array of the distinct namespaces that
#            contain MonitoringStack resources
# Returns:   non-zero only if the output directory cannot be created; a
#            failing namespace lookup is tolerated and leaves the array empty
init() {
  mkdir -p "${COLLECTION_PATH}"

  # The query prints one line per MonitoringStack, so a namespace holding
  # several stacks would be listed (and later gathered) several times.
  # sort -u keeps each namespace once.
  readarray -t MON_STACK_NSS < <(
    oc get monitoringstacks --all-namespaces --no-headers -o custom-columns=":metadata.namespace" \
      | LC_ALL=C sort -u
  ) || true
}

# operants_get dumps the pods related to observability-operator as YAML.
# Globals:   COLLECTION_PATH (read)
# Outputs:   operants.yaml - pods labelled managed-by=observability-operator
#                            and pods labelled part-of=observability-operator
#            operator.yaml - pods labelled name=observability-operator
operants_get() {
  {
    oc get pods --all-namespaces -l app.kubernetes.io/managed-by=observability-operator -o yaml
    # Two separate List objects end up in one file; the document separator
    # keeps the result parseable as a YAML stream.
    echo '---'
    oc get pods --all-namespaces -l app.kubernetes.io/part-of=observability-operator -o yaml
  } > "$COLLECTION_PATH"/operants.yaml
  # -o yaml was missing here, which stored the human-readable table in a
  # file named *.yaml.
  oc get pods --all-namespaces -l app.kubernetes.io/name=observability-operator -o yaml > "$COLLECTION_PATH"/operator.yaml
}

# prom_get makes an http GET request to prometheus /api/v1/$object from inside
# the first running Prometheus pod of a stack and stores the stdout and stderr
# results.
# Arguments: $1 - API object, e.g. "rules" or "status/config"
#            $2 - namespace of the monitoring stack
#            $3 - name of the monitoring stack
# Globals:   COLLECTION_PATH (read)
# Outputs:   <COLLECTION_PATH>/<ns>/<name>/prometheus/<object>.json and .stderr
# Returns:   1 if no pod could be found, otherwise the status of `oc exec`
prom_get() {
  local object="$1"; shift
  local ns="$1"; shift
  local name="$1"; shift

  local result_path="$COLLECTION_PATH/$ns/$name/prometheus/$object"
  mkdir -p "$(dirname "$result_path")"

  local pod
  pod=$(get_first_ready_prom_pod "$ns" "$name") || pod=""
  if [[ -z "$pod" ]]; then
    # Without a pod `oc exec ""` would only yield a confusing error; record
    # the actual reason next to where the data would have been stored.
    echo "ERROR: no running prometheus pod for ${ns}/${name}, skipping ${object}" \
      | tee "${result_path}.stderr" >&2
    return 1
  fi

  echo "INFO: Getting ${object} from ${pod}"
  # The URL is single-quoted so the shell inside the container does not
  # interpret characters such as '?' or '&'.
  oc exec "${pod}" \
    -c prometheus \
    -n "$ns" \
    -- /bin/bash -c "curl -sG 'http://localhost:9090/api/v1/${object}'" \
    > "${result_path}.json" \
    2> "${result_path}.stderr"
}

# prom_get_from_replica makes an http GET request to prometheus
# /api/v1/$object from inside one specific pod and stores the stdout and
# stderr results.
# Arguments: $1 - pod (replica) to run the request in
#            $2 - API object, may carry a query string ("targets?state=active")
#            $3 - namespace of the monitoring stack
#            $4 - name of the monitoring stack
#            $5 - optional result path relative to the stack's prometheus
#                 directory; defaults to the API object
# Globals:   COLLECTION_PATH (read)
# Outputs:   <COLLECTION_PATH>/<ns>/<name>/prometheus/<path>.json and .stderr
# Returns:   the status of `oc exec`
prom_get_from_replica() {
  local replica="$1"; shift
  local object="$1"; shift
  local ns="$1"; shift
  local name="$1"; shift
  # shift fails when the optional argument is absent; that is expected.
  local path="${1:-$object}"; shift || true

  local result_path="$COLLECTION_PATH/$ns/$name/prometheus/$path"
  mkdir -p "$(dirname "${result_path}")"

  echo "INFO: Getting ${object} from ${replica}"
  # The URL is single-quoted because objects such as "targets?state=active"
  # contain '?', which the shell inside the container treats as a glob
  # character when left unquoted.
  oc exec "${replica}" \
    -c prometheus \
    -n "$ns" \
    -- /bin/bash -c "curl -sG 'http://localhost:9090/api/v1/${object}'" \
    > "${result_path}.json" \
    2> "${result_path}.stderr"
}

# prom_get_from_replicas runs prom_get_from_replica against every Prometheus
# pod owned by a StatefulSet of the given stack, storing each result below a
# directory named after the pod.
# Arguments: $1 - API object, may carry a query string
#            $2 - namespace of the monitoring stack
#            $3 - name of the monitoring stack
#            $4 - optional file name to use instead of the API object
# Failures of individual lookups or requests are ignored so that one broken
# replica does not stop the collection.
prom_get_from_replicas() {
  local object="$1"; shift
  local ns="$1"; shift
  local name="$1"; shift
  # shift fails when the optional argument is absent; that is expected.
  local path="${1:-$object}"; shift || true

  # Locals keep loop state from overwriting same-named variables of callers.
  local -a stss=() pods=()
  local sts pod

  readarray -t stss < <(
    oc get sts -n "$ns" -l app.kubernetes.io/part-of="$name" --no-headers -o custom-columns=":metadata.uid"
  ) || true
  for sts in "${stss[@]}"; do
    # The UID is handed to jq as a variable instead of being spliced into the
    # program, and compared exactly. `[]?` tolerates pods that have no
    # ownerReferences; any() yields each pod at most once.
    readarray -t pods < <(
      oc get pods -n "$ns" -l app.kubernetes.io/component=prometheus -o json \
        | jq -r --arg uid "$sts" \
          '.items[] | select(any(.metadata.ownerReferences[]?; .uid == $uid)) | .metadata.name'
    ) || true
    for pod in "${pods[@]}"; do
      prom_get_from_replica "${pod}" "${object}" "$ns" "$name" "${pod}/${path}" || true
    done
  done
}

# alertmanager_get makes an http GET request to alertmanager /api/v2/$object
# from inside the first running Alertmanager pod of a stack and stores the
# stdout and stderr results.
# Arguments: $1 - API object, e.g. "status"
#            $2 - namespace of the monitoring stack
#            $3 - name of the monitoring stack
# Globals:   COLLECTION_PATH (read)
# Outputs:   <COLLECTION_PATH>/<ns>/<name>/alertmanager/<object>.json and .stderr
# Returns:   1 if no pod could be found, otherwise the status of `oc exec`
alertmanager_get() {
  local object="$1"; shift
  local ns="$1"; shift
  local name="$1"; shift

  local result_path="$COLLECTION_PATH/$ns/$name/alertmanager/$object"
  mkdir -p "$(dirname "$result_path")"

  local pod
  pod=$(get_first_ready_alertmanager_pod "$ns" "$name") || pod=""
  if [[ -z "$pod" ]]; then
    # Without a pod `oc exec ""` would only yield a confusing error; record
    # the actual reason next to where the data would have been stored.
    echo "ERROR: no running alertmanager pod for ${ns}/${name}, skipping ${object}" \
      | tee "${result_path}.stderr" >&2
    return 1
  fi

  echo "INFO: Getting ${object} from ${pod}"
  # The pod was looked up in $ns, so the exec must target $ns as well; the
  # previously hard-coded openshift-monitoring namespace never contains it.
  oc exec "${pod}" \
    -c alertmanager \
    -n "$ns" \
    -- /bin/bash -c "curl -sG 'http://localhost:9093/api/v2/${object}'" \
    > "${result_path}.json" \
    2> "${result_path}.stderr"
}


# monitoring_gather is the top-level collection routine: it initialises the
# run, dumps the operator-related pods and then, for every MonitoringStack in
# every namespace listed in MON_STACK_NSS, collects Prometheus and
# Alertmanager API data.
# Globals:   MON_STACK_NSS (read, populated by init)
monitoring_gather(){
  # Loop state is local so it cannot overwrite same-named variables elsewhere.
  local ns name
  local -a stack_names=()

  init

  # begin gathering
  # NOTE || true ignores failures

  operants_get || true

  for ns in "${MON_STACK_NSS[@]}"; do
    readarray -t stack_names < <(
      oc get monitoringstacks -n "$ns" --no-headers -o custom-columns=":metadata.name"
    ) || true
    for name in "${stack_names[@]}"; do
      prom_get alertmanagers "$ns" "$name" || true
      prom_get rules "$ns" "$name" || true
      prom_get status/config "$ns" "$name" || true
      prom_get status/flags "$ns" "$name" || true

      # using prom_get_from_replicas as the state differs for each replica
      prom_get_from_replicas status/runtimeinfo "$ns" "$name" || true
      prom_get_from_replicas 'targets?state=active' "$ns" "$name" targets-active || true
      prom_get_from_replicas status/tsdb "$ns" "$name" || true

      alertmanager_get status "$ns" "$name" || true
    done
  done

  # Run sync once all collection steps have finished.
  sync
}

# Entry point: start the collection when the script is executed.
monitoring_gather

0 comments on commit e54621b

Please sign in to comment.