Skip to content

Commit

Permalink
Make nvidia resource names configurable (#359)
Browse files Browse the repository at this point in the history
* Make nvidia resource names configurable

Signed-off-by: lx1036 <lx1036@126.com>

* Add unit test

Signed-off-by: Vadym Fedorov <vfedorov@nvidia.com>

---------

Signed-off-by: lx1036 <lx1036@126.com>
Signed-off-by: Vadym Fedorov <vfedorov@nvidia.com>
Co-authored-by: Vadym Fedorov <vfedorov@nvidia.com>
  • Loading branch information
lx1036 and nvvfedorov authored Jul 15, 2024
1 parent 219eb08 commit b4552f0
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 1 deletion.
8 changes: 8 additions & 0 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ const (
CLIDCGMLogLevel = "dcgm-log-level"
CLIPodResourcesKubeletSocket = "pod-resources-kubelet-socket"
CLIHPCJobMappingDir = "hpc-job-mapping-dir"
CLINvidiaResourceNames = "nvidia-resource-names"
)

func NewApp(buildVersion ...string) *cli.App {
Expand Down Expand Up @@ -237,6 +238,12 @@ func NewApp(buildVersion ...string) *cli.App {
Usage: "Path to HPC job mapping file directory used for mapping GPUs to jobs.",
EnvVars: []string{"DCGM_HPC_JOB_MAPPING_DIR"},
},
&cli.StringSliceFlag{
Name: CLINvidiaResourceNames,
Value: cli.NewStringSlice(),
Usage: "Nvidia resource names for specified GPU type like nvidia.com/a100, nvidia.com/a10.",
EnvVars: []string{"NVIDIA_RESOURCE_NAMES"},
},
}

if runtime.GOOS == "linux" {
Expand Down Expand Up @@ -631,5 +638,6 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
DCGMLogLevel: dcgmLogLevel,
PodResourcesKubeletSocket: c.String(CLIPodResourcesKubeletSocket),
HPCJobMappingDir: c.String(CLIHPCJobMappingDir),
NvidiaResourceNames: c.StringSlice(CLINvidiaResourceNames),
}, nil
}
1 change: 1 addition & 0 deletions pkg/dcgmexporter/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,5 @@ type Config struct {
DCGMLogLevel string
PodResourcesKubeletSocket string
HPCJobMappingDir string
NvidiaResourceNames []string
}
3 changes: 2 additions & 1 deletion pkg/dcgmexporter/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"fmt"
"net"
"regexp"
"slices"
"strings"
"time"

Expand Down Expand Up @@ -147,7 +148,7 @@ func (p *PodMapper) toDeviceToPod(
for _, device := range container.GetDevices() {

resourceName := device.GetResourceName()
- if resourceName != nvidiaResourceName {
+ if resourceName != nvidiaResourceName && !slices.Contains(p.Config.NvidiaResourceNames, resourceName) {
// Mig resources appear differently than GPU resources
if !strings.HasPrefix(resourceName, nvidiaMigResourcePrefix) {
continue
Expand Down
9 changes: 9 additions & 0 deletions pkg/dcgmexporter/kubernetes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
MetricGPUDevice string
MetricMigProfile string
PODGPUID string
NvidiaResourceNames []string
}

testCases := []TestCase{
Expand Down Expand Up @@ -232,6 +233,13 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
MetricGPUDevice: "0",
GPUInstanceID: 3,
},
{
KubernetesGPUIDType: GPUUID,
ResourceName: "nvidia.com/a100",
MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
NvidiaResourceNames: []string{"nvidia.com/a100"},
},
}

for _, tc := range testCases {
Expand Down Expand Up @@ -272,6 +280,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
podMapper, err := NewPodMapper(&Config{
KubernetesGPUIdType: tc.KubernetesGPUIDType,
PodResourcesKubeletSocket: socketPath,
NvidiaResourceNames: tc.NvidiaResourceNames,
})
require.NoError(t, err)
require.NotNil(t, podMapper)
Expand Down

0 comments on commit b4552f0

Please sign in to comment.