From 86575ba061ba248ab43350e8d4b4697047771f5c Mon Sep 17 00:00:00 2001 From: Danny Clark Date: Sat, 14 Dec 2024 02:06:32 +0000 Subject: [PATCH] fix: enable pod attributes for gpu-sharing in gke We make a small fix to the Kubernetes PodMapper transform processor. Specifically, we update the regular expression used in building the device mapping to properly capture pod attributes in both MIG and MIG-with-sharing GPUs in GKE. --- pkg/dcgmexporter/kubernetes.go | 3 ++- pkg/dcgmexporter/kubernetes_test.go | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pkg/dcgmexporter/kubernetes.go b/pkg/dcgmexporter/kubernetes.go index 8fb8d7d2..9a563056 100644 --- a/pkg/dcgmexporter/kubernetes.go +++ b/pkg/dcgmexporter/kubernetes.go @@ -36,7 +36,8 @@ import ( var ( connectionTimeout = 10 * time.Second - gkeMigDeviceIDRegex = regexp.MustCompile(`^nvidia([0-9]+)/gi([0-9]+)$`) + // Allow for MIG devices with or without GPU sharing to match in GKE. + gkeMigDeviceIDRegex = regexp.MustCompile(`^nvidia([0-9]+)/gi([0-9]+)(/vgpu[0-9]+)?$$`) gkeVirtualGPUDeviceIDSeparator = "/vgpu" nvmlGetMIGDeviceInfoByIDHook = nvmlprovider.GetMIGDeviceInfoByID ) diff --git a/pkg/dcgmexporter/kubernetes_test.go b/pkg/dcgmexporter/kubernetes_test.go index 3b48efe2..a955198b 100644 --- a/pkg/dcgmexporter/kubernetes_test.go +++ b/pkg/dcgmexporter/kubernetes_test.go @@ -218,6 +218,20 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) { MetricGPUDevice: "0", PODGPUID: "0/vgpu", }, + { + KubernetesGPUIDType: DeviceName, + ResourceName: nvidiaResourceName, + MetricMigProfile: "1g.10gb", + GPUInstanceID: 0, + PODGPUID: "nvidia0/gi0/vgpu0", + }, + { + KubernetesGPUIDType: DeviceName, + ResourceName: nvidiaResourceName, + MetricMigProfile: "1g.10gb", + GPUInstanceID: 1, + PODGPUID: "nvidia0/gi1/vgpu0", + }, { KubernetesGPUIDType: GPUUID, ResourceName: nvidiaResourceName,