Skip to content

Commit

Permalink
DCGM Exporter Release 3.3.6-3.4.2 (#325)
Browse files Browse the repository at this point in the history
Signed-off-by: Rohit Arora <roarora@nvidia.com>
Co-authored-by: Vadym Fedorov <vfedorov@nvidia.com>
  • Loading branch information
rohit-arora-dev and nvvfedorov authored May 20, 2024
1 parent 7decfd2 commit dd3001a
Show file tree
Hide file tree
Showing 37 changed files with 1,337 additions and 201 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ vendor/
tests.cov
test_results.json
.scannerwork
dist/

###############################################################################
# JetBrains
Expand Down
43 changes: 23 additions & 20 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,32 @@

include hack/VERSION

MKDIR ?= mkdir
REGISTRY ?= nvidia
REGISTRY ?= nvidia
GO ?= go
MKDIR ?= mkdir
GOLANGCILINT_TIMEOUT ?= 10m

DCGM_VERSION := $(NEW_DCGM_VERSION)
GOLANG_VERSION := 1.21.5
VERSION := $(NEW_EXPORTER_VERSION)
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
OUTPUT := type=oci,dest=/tmp/dcgm-exporter.tar
OUTPUT := type=oci,dest=/dev/null
PLATFORMS := linux/amd64,linux/arm64
DOCKERCMD := docker buildx build
MODULE := github.com/NVIDIA/dcgm-exporter


.PHONY: all binary install check-format local
all: update-version ubuntu22.04 ubi9

binary: update-version
cd cmd/dcgm-exporter; go build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}"
# Build the dcgm-exporter binary inside cmd/dcgm-exporter, stamping
# main.BuildVersion with "<DCGM_VERSION>-<VERSION>" via -ldflags.
# Runs the `generate` and `update-version` targets first.
# `&&` (not `;`) is used so that a failed `cd` aborts the recipe instead of
# running `$(GO) build` from the wrong directory.
binary: generate update-version
	cd cmd/dcgm-exporter && $(GO) build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}"

test-main:
go test ./... -short
$(GO) test ./... -short

install: binary
install -m 755 cmd/dcgm-exporter/dcgm-exporter /usr/bin/dcgm-exporter
install -m 755 $(DIST_DIR)/dcgm-exporter /usr/bin/dcgm-exporter
install -m 644 -D ./etc/default-counters.csv /etc/dcgm-exporter/default-counters.csv
install -m 644 -D ./etc/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv

Expand All @@ -56,24 +58,20 @@ else
$(MAKE) PLATFORMS=linux/amd64 OUTPUT=type=docker DOCKERCMD='docker build'
endif

ubuntu22.04:
$(DOCKERCMD) --pull \
--output $(OUTPUT) \
--platform $(PLATFORMS) \
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu22.04" \
--file docker/Dockerfile.ubuntu22.04 .
TARGETS = ubuntu22.04 ubi9

ubi9:
DOCKERFILE.ubuntu22.04 = docker/Dockerfile.ubuntu22.04
DOCKERFILE.ubi9 = docker/Dockerfile.ubi9

$(TARGETS):
$(DOCKERCMD) --pull \
--output $(OUTPUT) \
--platform $(PLATFORMS) \
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
--build-arg "VERSION=$(FULL_VERSION)" \
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi9" \
--file docker/Dockerfile.ubi9 .
--build-arg "VERSION=$(VERSION)" \
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-$@" \
--file $(DOCKERFILE.$@) .

# The rule below is named `test-integration`; declare that exact name phony so
# a file or directory called `test-integration` can never mask the target.
.PHONY: test-integration
test-integration:
Expand All @@ -84,7 +82,7 @@ test-coverage:

.PHONY: lint
lint:
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --verbose
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix

.PHONY: validate-modules
validate-modules:
Expand Down Expand Up @@ -132,3 +130,8 @@ update-version:

# Update DCGM and DCGM Exporter versions
update-versions: update-version

.PHONY: generate
# Generate code (Mocks) by running `go generate` over every package.
# Uses $(GO), like the build and test recipes, so a toolchain override
# (e.g. `make GO=/path/to/go generate`) is honoured here as well.
generate:
	$(GO) generate ./...
94 changes: 61 additions & 33 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https
### Quickstart

To gather metrics on a GPU node, simply start the `dcgm-exporter` container:
```
$ docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-3.4.1-ubuntu22.04
$ curl localhost:9400/metrics

```shell
docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.6-3.4.2-ubuntu22.04
curl localhost:9400/metrics
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
Expand All @@ -32,33 +33,38 @@ Note: Consider using the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-ope
Ensure you have already set up your cluster with the [default runtime as NVIDIA](https://github.com/NVIDIA/nvidia-container-runtime#docker-engine-setup).

The recommended way to install DCGM-Exporter is to use the Helm chart:
```
$ helm repo add gpu-helm-charts \

```shell
helm repo add gpu-helm-charts \
https://nvidia.github.io/dcgm-exporter/helm-charts
```

Update the repo:

```shell
helm repo update
```
$ helm repo update
```

And install the chart:
```
$ helm install \

```shell
helm install \
--generate-name \
gpu-helm-charts/dcgm-exporter
```

Once the `dcgm-exporter` pod is deployed, you can use port forwarding to obtain metrics quickly:


```
$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml
```shell
kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml

# Let's get the output of a random pod:
$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter" \
NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter" \
-o "jsonpath={ .items[0].metadata.name}")

$ kubectl port-forward $NAME 8080:9400 &
$ curl -sL http://127.0.0.1:8080/metrics
kubectl port-forward $NAME 8080:9400 &

curl -sL http://127.0.0.1:8080/metrics
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
Expand All @@ -72,33 +78,50 @@ DCGM_FI_DEV_MEMORY_TEMP{gpu="0", UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52"
...

```

To integrate DCGM-Exporter with Prometheus and Grafana, see the full instructions in the [user guide](https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/latest/).
`dcgm-exporter` is deployed as part of the GPU Operator. To get started with integrating with Prometheus, check the Operator [user guide](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html#gpu-telemetry).

### TLS and Basic Auth

The exporter supports TLS and basic auth using [exporter-toolkit](https://github.com/prometheus/exporter-toolkit). To use TLS and/or basic auth, pass the `--web-config-file` CLI flag as follows:

```
```shell
dcgm-exporter --web-config-file=web-config.yaml
```

A sample `web-config.yaml` file can be fetched from [exporter-toolkit repository](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-config.yml). The reference of the `web-config.yaml` file can be consulted in the [docs](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md).

### How to include HPC jobs in metric labels

The DCGM-exporter can include High-Performance Computing (HPC) job information in its metric labels. To achieve this, HPC environment administrators must configure their HPC environment to generate files that map GPUs to HPC jobs.

#### File Conventions

These mapping files follow a specific format:

* Each file is named after a unique GPU ID (e.g., 0, 1, 2, etc.).
* Each line in the file contains JOB IDs that run on the corresponding GPU.

#### Enabling HPC Job Mapping on DCGM-Exporter

To enable GPU-to-job mapping on the DCGM-exporter side, run the DCGM-exporter with the `--hpc-job-mapping-dir` command-line parameter, pointing to the directory where the HPC cluster creates job mapping files. Alternatively, set the `DCGM_HPC_JOB_MAPPING_DIR` environment variable to achieve the same result.

### Building from Source

To build dcgm-exporter, ensure you have the following:
- [Golang >= 1.21 installed](https://golang.org/)
- [DCGM installed](https://developer.nvidia.com/dcgm)

```
$ git clone https://github.com/NVIDIA/dcgm-exporter.git
$ cd dcgm-exporter
$ make binary
$ sudo make install
* [Golang >= 1.21 installed](https://golang.org/)
* [DCGM installed](https://developer.nvidia.com/dcgm)

```shell
git clone https://github.com/NVIDIA/dcgm-exporter.git
cd dcgm-exporter
make binary
sudo make install
...
$ dcgm-exporter &
$ curl localhost:9400/metrics
dcgm-exporter &
curl localhost:9400/metrics
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
Expand All @@ -118,6 +141,7 @@ With `dcgm-exporter` you can configure which fields are collected by specifying
You will find the default CSV file under `etc/default-counters.csv` in the repository; it is copied to `/etc/dcgm-exporter/default-counters.csv` on your system or container.

The layout and format of this file are as follows:

```
# Format
# If line starts with a '#' it is considered a comment
Expand All @@ -129,39 +153,43 @@ DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
```

A custom csv file can be specified using the `-f` option or `--collectors` as follows:
```
$ dcgm-exporter -f /tmp/custom-collectors.csv

```shell
dcgm-exporter -f /tmp/custom-collectors.csv
```

Notes:
- Always make sure your entries have 2 commas (',')
- The complete list of counters that can be collected can be found on the DCGM API reference manual: https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-ids.html

* Always make sure your entries have 2 commas (',')
* The complete list of counters that can be collected can be found on the DCGM API reference manual: <https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-ids.html>

### What about a Grafana Dashboard?

You can find the official NVIDIA DCGM-Exporter dashboard here: https://grafana.com/grafana/dashboards/12239
You can find the official NVIDIA DCGM-Exporter dashboard here: <https://grafana.com/grafana/dashboards/12239>

You will also find the `json` file in this repo under `grafana/dcgm-exporter-dashboard.json`.

Pull requests are accepted!


### Building the containers

This project uses [docker buildx](https://docs.docker.com/buildx/working-with-buildx/) for multi-arch image creation. Follow the instructions on that page to get a working builder instance for creating these containers. Some other useful build options follow.

Builds local images based on the machine architecture and makes them available in 'docker images'

```
make local
```

Build the ubuntu image and export to 'docker images'
```

```shell
make ubuntu22.04 PLATFORMS=linux/amd64 OUTPUT=type=docker
```

Build and push the images to some other 'private_registry'
```

```shell
make REGISTRY=<private_registry> push
```

Expand Down
12 changes: 6 additions & 6 deletions dcgm-exporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,23 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.4.1"
app.kubernetes.io/version: "3.4.2"
spec:
updateStrategy:
type: RollingUpdate
selector:
matchLabels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.4.1"
app.kubernetes.io/version: "3.4.2"
template:
metadata:
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.4.1"
app.kubernetes.io/version: "3.4.2"
name: "dcgm-exporter"
spec:
containers:
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-3.4.1-ubuntu22.04"
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.6-3.4.2-ubuntu22.04"
env:
- name: "DCGM_EXPORTER_LISTEN"
value: ":9400"
Expand Down Expand Up @@ -64,11 +64,11 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.4.1"
app.kubernetes.io/version: "3.4.2"
spec:
selector:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "3.4.1"
app.kubernetes.io/version: "3.4.2"
ports:
- name: "metrics"
port: 9400
4 changes: 2 additions & 2 deletions deployment/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
apiVersion: v2
name: dcgm-exporter
description: A Helm chart for DCGM exporter
version: "3.4.1"
version: "3.4.2"
kubeVersion: ">= 1.19.0-0"
appVersion: "3.4.1"
appVersion: "3.4.2"
sources:
- https://github.com/nvidia/dcgm-exporter
home: https://github.com/nvidia/dcgm-exporter/
Expand Down
2 changes: 1 addition & 1 deletion deployment/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ image:
pullPolicy: IfNotPresent
# Image tag defaults to AppVersion, but you can use the tag key
# for the image tag, e.g:
tag: 3.3.5-3.4.1-ubuntu22.04
tag: 3.3.6-3.4.2-ubuntu22.04

# Change the following reference to "/etc/dcgm-exporter/default-counters.csv"
# to stop profiling metrics from DCGM
Expand Down
Loading

0 comments on commit dd3001a

Please sign in to comment.