From d33b8b036a429cd116a0f66fd5cbadcdb4f3c68e Mon Sep 17 00:00:00 2001
From: Lujie Duan
Date: Wed, 26 Jun 2024 18:22:21 +0000
Subject: [PATCH] Add GCE recommended alerts for GPU VMs

---
 ...ory-utilization-too-high-within-vm.v1.json | 25 ++++++++++++++
 .../gpu-memory-utilization-too-high.v1.json   | 25 ++++++++++++++
 ...gpu-utilization-too-high-within-vm.v1.json | 34 +++++++++++++++++++
 .../gpu-utilization-too-high.v1.json          | 34 +++++++++++++++++++
 alerts/google-gce/metadata.yaml               | 28 +++++++++++++++
 5 files changed, 146 insertions(+)
 create mode 100644 alerts/google-gce/gpu-memory-utilization-too-high-within-vm.v1.json
 create mode 100644 alerts/google-gce/gpu-memory-utilization-too-high.v1.json
 create mode 100644 alerts/google-gce/gpu-utilization-too-high-within-vm.v1.json
 create mode 100644 alerts/google-gce/gpu-utilization-too-high.v1.json

diff --git a/alerts/google-gce/gpu-memory-utilization-too-high-within-vm.v1.json b/alerts/google-gce/gpu-memory-utilization-too-high-within-vm.v1.json
new file mode 100644
index 0000000000..b11ed50891
--- /dev/null
+++ b/alerts/google-gce/gpu-memory-utilization-too-high-within-vm.v1.json
@@ -0,0 +1,25 @@
+{
+  "displayName": "VM Instance - High GPU Memory Utilization (${INSTANCE_NAME})",
+  "documentation": {
+    "content": "This alert fires when the GPU memory utilization on the VM instance ${INSTANCE_NAME} rises above 90% for 5 minutes or more.",
+    "mimeType": "text/markdown"
+  },
+  "userLabels": {},
+  "conditions": [
+    {
+      "displayName": "VM Instance - High GPU memory utilization (${INSTANCE_NAME})",
+      "conditionMonitoringQueryLanguage": {
+        "duration": "0s",
+        "trigger": {
+          "count": 1
+        },
+        "query": "{ fetch gce_instance\n | metric 'agent.googleapis.com/gpu/memory/bytes_used'\n | filter (metadata.system_labels.name == '${INSTANCE_NAME}')\n | filter metric.memory_state == 'used'\n | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n | every 5m\n | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)]\n; fetch gce_instance\n | metric 'agent.googleapis.com/gpu/memory/bytes_used'\n | filter (metadata.system_labels.name == '${INSTANCE_NAME}')\n | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n | every 5m\n | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)] }\n| ratio\n| mul (100)\n| cast_units ('%')\n| every 5m\n| condition val() > 0.9 '10^2.%'"
+      }
+    }
+  ],
+  "alertStrategy": {
+    "autoClose": "604800s"
+  },
+  "combiner": "OR",
+  "enabled": true
+}
diff --git a/alerts/google-gce/gpu-memory-utilization-too-high.v1.json b/alerts/google-gce/gpu-memory-utilization-too-high.v1.json
new file mode 100644
index 0000000000..b3b522419c
--- /dev/null
+++ b/alerts/google-gce/gpu-memory-utilization-too-high.v1.json
@@ -0,0 +1,25 @@
+{
+  "displayName": "VM Instance - High GPU Memory Utilization",
+  "documentation": {
+    "content": "This alert fires when the GPU memory utilization on any VM instance in the project rises above 90% for 5 minutes or more.",
+    "mimeType": "text/markdown"
+  },
+  "userLabels": {},
+  "conditions": [
+    {
+      "displayName": "VM Instance - High GPU memory utilization",
+      "conditionMonitoringQueryLanguage": {
+        "duration": "0s",
+        "trigger": {
+          "count": 1
+        },
+        "query": "{ fetch gce_instance\n | metric 'agent.googleapis.com/gpu/memory/bytes_used'\n | filter metric.memory_state == 'used'\n | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n | every 5m\n | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)]\n; fetch gce_instance\n | metric 'agent.googleapis.com/gpu/memory/bytes_used'\n | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n | every 5m\n | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)] }\n| ratio\n| mul (100)\n| cast_units ('%')\n| every 5m\n| condition val() > 0.9 '10^2.%'"
+      }
+    }
+  ],
+  "alertStrategy": {
+    "autoClose": "604800s"
+  },
+  "combiner": "OR",
+  "enabled": true
+}
\ No newline at end of file
diff --git a/alerts/google-gce/gpu-utilization-too-high-within-vm.v1.json b/alerts/google-gce/gpu-utilization-too-high-within-vm.v1.json
new file mode 100644
index 0000000000..0b785d4ccf
--- /dev/null
+++ b/alerts/google-gce/gpu-utilization-too-high-within-vm.v1.json
@@ -0,0 +1,34 @@
+{
+  "displayName": "VM Instance - High GPU Utilization (${INSTANCE_NAME})",
+  "documentation": {
+    "content": "This alert fires when the GPU utilization on the VM instance (${INSTANCE_NAME}) rises above 90% for 5 minutes or more.",
+    "mimeType": "text/markdown"
+  },
+  "userLabels": {},
+  "conditions": [
+    {
+      "displayName": "VM Instance - High GPU utilization (${INSTANCE_NAME})",
+      "conditionThreshold": {
+        "filter": "resource.type = \"gce_instance\" AND metric.type = \"agent.googleapis.com/gpu/utilization\" AND metadata.system_labels.name = \"${INSTANCE_NAME}\"",
+        "aggregations": [
+          {
+            "alignmentPeriod": "300s",
+            "crossSeriesReducer": "REDUCE_NONE",
+            "perSeriesAligner": "ALIGN_MEAN"
+          }
+        ],
+        "comparison": "COMPARISON_GT",
+        "duration": "0s",
+        "trigger": {
+          "count": 1
+        },
+        "thresholdValue": 90
+      }
+    }
+  ],
+  "alertStrategy": {
+    "autoClose": "604800s"
+  },
+  "combiner": "OR",
+  "enabled": true
+}
diff --git a/alerts/google-gce/gpu-utilization-too-high.v1.json b/alerts/google-gce/gpu-utilization-too-high.v1.json
new file mode 100644
index 0000000000..0bce0c4e7b
--- /dev/null
+++ b/alerts/google-gce/gpu-utilization-too-high.v1.json
@@ -0,0 +1,34 @@
+{
+  "displayName": "VM Instance - High GPU Utilization",
+  "documentation": {
+    "content": "This alert fires when the GPU utilization on any VM instance in the project rises above 90% for 5 minutes or more.",
+    "mimeType": "text/markdown"
+  },
+  "userLabels": {},
+  "conditions": [
+    {
+      "displayName": "VM Instance - High GPU utilization",
+      "conditionThreshold": {
+        "filter": "resource.type = \"gce_instance\" AND metric.type = \"agent.googleapis.com/gpu/utilization\"",
+        "aggregations": [
+          {
+            "alignmentPeriod": "300s",
+            "crossSeriesReducer": "REDUCE_NONE",
+            "perSeriesAligner": "ALIGN_MEAN"
+          }
+        ],
+        "comparison": "COMPARISON_GT",
+        "duration": "0s",
+        "trigger": {
+          "count": 1
+        },
+        "thresholdValue": 90
+      }
+    }
+  ],
+  "alertStrategy": {
+    "autoClose": "604800s"
+  },
+  "combiner": "OR",
+  "enabled": true
+}
\ No newline at end of file
diff --git a/alerts/google-gce/metadata.yaml b/alerts/google-gce/metadata.yaml
index f494ba50d0..278d4d6f19 100644
--- a/alerts/google-gce/metadata.yaml
+++ b/alerts/google-gce/metadata.yaml
@@ -69,3 +69,31 @@ alert_policy_templates:
   related_integrations:
   - id: gce
     platform: GCP
+-
+  id: gpu-utilization-too-high
+  description: "Monitors GPU utilization across all GCE VMs in the current project and will notify you if the GPU utilization on any VM instance rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on VMs to collect the GPU utilization metric."
+  version: 1
+  related_integrations:
+  - id: gce
+    platform: GCP
+-
+  id: gpu-utilization-too-high-within-vm
+  description: "Monitors GPU utilization in the specified GCE VM and will notify you if the GPU utilization rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on the VM to collect the GPU utilization metric."
+  version: 1
+  related_integrations:
+  - id: gce
+    platform: GCP
+-
+  id: gpu-memory-utilization-too-high
+  description: "Monitors GPU memory utilization across all GCE VMs in the current project and will notify you if the GPU memory utilization on any VM instance rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on VMs to collect the GPU memory utilization metric."
+  version: 1
+  related_integrations:
+  - id: gce
+    platform: GCP
+-
+  id: gpu-memory-utilization-too-high-within-vm
+  description: "Monitors GPU memory utilization in the specified GCE VM and will notify you if the GPU memory utilization rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on the VM to collect the GPU memory utilization metric."
+  version: 1
+  related_integrations:
+  - id: gce
+    platform: GCP
\ No newline at end of file
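
Deployment note: each template in this patch is a standard Cloud Monitoring alert-policy JSON, so it can be created with the client libraries or the gcloud CLI once the `${INSTANCE_NAME}` placeholder (used only by the `-within-vm` variants) is filled in. For reference, the MQL-based memory policies divide the `memory_state == 'used'` series by the all-states total, scale to percent, and compare against `0.9 '10^2.%'`, which Cloud Monitoring reads as 0.9 × 10^2 % = 90%, matching the 90% threshold in the documentation strings. The sketch below is one possible deployment path using the Python client; `PROJECT_ID`, `INSTANCE_NAME`, and `TEMPLATE_PATH` are hypothetical values, and it assumes the `google-cloud-monitoring` package is installed.

```python
# Hypothetical sketch: render one of the alert templates from this patch
# and create it as an alert policy with the Cloud Monitoring Python client.
from string import Template

from google.cloud import monitoring_v3
from google.protobuf import json_format

PROJECT_ID = "my-project"    # assumption: your GCP project ID
INSTANCE_NAME = "my-gpu-vm"  # assumption: the VM the policy should watch
TEMPLATE_PATH = "alerts/google-gce/gpu-utilization-too-high-within-vm.v1.json"

# Fill in the ${INSTANCE_NAME} placeholder used by the -within-vm templates.
with open(TEMPLATE_PATH) as f:
    rendered = Template(f.read()).substitute(INSTANCE_NAME=INSTANCE_NAME)

# Parse the rendered JSON into an AlertPolicy proto.
policy = monitoring_v3.AlertPolicy()
json_format.Parse(rendered, policy._pb)

# Create the policy in the target project.
client = monitoring_v3.AlertPolicyServiceClient()
created = client.create_alert_policy(
    name=f"projects/{PROJECT_ID}", alert_policy=policy
)
print(f"Created alert policy: {created.name}")
```

Equivalently, after rendering the placeholder to a local file, `gcloud alpha monitoring policies create --policy-from-file=rendered.json` should create the same policy from the CLI.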