Commit

Add GCE recommended alerts for GPU VMs
LujieDuan committed Jun 27, 2024
1 parent e0b122f commit d33b8b0
Showing 5 changed files with 146 additions and 0 deletions.
25 changes: 25 additions & 0 deletions alerts/google-gce/gpu-memory-utilization-too-high-within-vm.v1.json
@@ -0,0 +1,25 @@
{
  "displayName": "VM Instance - High GPU Memory Utilization (${INSTANCE_NAME})",
  "documentation": {
    "content": "This alert fires when the GPU memory utilization on the VM instance ${INSTANCE_NAME} rises above 90% for 5 minutes or more.",
    "mimeType": "text/markdown"
  },
  "userLabels": {},
  "conditions": [
    {
      "displayName": "VM Instance - High GPU memory utilization (${INSTANCE_NAME})",
      "conditionMonitoringQueryLanguage": {
        "duration": "0s",
        "trigger": {
          "count": 1
        },
        "query": "{ fetch gce_instance\n | metric 'agent.googleapis.com/gpu/memory/bytes_used'\n | filter (metadata.system_labels.name == '${INSTANCE_NAME}')\n | filter metric.memory_state == 'used'\n | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n | every 5m\n | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)]\n; fetch gce_instance\n | metric 'agent.googleapis.com/gpu/memory/bytes_used' \n | filter (metadata.system_labels.name == '${INSTANCE_NAME}')\n | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n | every 5m\n | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)] }\n| ratio\n| mul (100)\n| cast_units ('%')\n| every 5m\n| condition val() > 0.9 '10^2.%'"
      }
    }
  ],
  "alertStrategy": {
    "autoClose": "604800s"
  },
  "combiner": "OR",
  "enabled": true
}
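
For readability, the MQL condition embedded in the "query" field above expands (with its \n escapes rendered as line breaks, content otherwise unchanged) to:

    { fetch gce_instance
      | metric 'agent.googleapis.com/gpu/memory/bytes_used'
      | filter (metadata.system_labels.name == '${INSTANCE_NAME}')
      | filter metric.memory_state == 'used'
      | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]
      | every 5m
      | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)]
    ; fetch gce_instance
      | metric 'agent.googleapis.com/gpu/memory/bytes_used'
      | filter (metadata.system_labels.name == '${INSTANCE_NAME}')
      | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]
      | every 5m
      | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)] }
    | ratio
    | mul (100)
    | cast_units ('%')
    | every 5m
    | condition val() > 0.9 '10^2.%'

The first branch averages bytes in the 'used' memory state over 5-minute windows, the second averages bytes across all memory states (the total); their ratio, scaled to a percentage, is tested against the 90% threshold (0.9 in units of '10^2.%').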
25 changes: 25 additions & 0 deletions alerts/google-gce/gpu-memory-utilization-too-high.v1.json
@@ -0,0 +1,25 @@
{
  "displayName": "VM Instance - High GPU Memory Utilization",
  "documentation": {
    "content": "This alert fires when the GPU memory utilization on any VM instance in the project rises above 90% for 5 minutes or more.",
    "mimeType": "text/markdown"
  },
  "userLabels": {},
  "conditions": [
    {
      "displayName": "VM Instance - High GPU memory utilization",
      "conditionMonitoringQueryLanguage": {
        "duration": "0s",
        "trigger": {
          "count": 1
        },
        "query": "{ fetch gce_instance\n | metric 'agent.googleapis.com/gpu/memory/bytes_used'\n | filter metric.memory_state == 'used'\n | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n | every 5m\n | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)]\n; fetch gce_instance\n | metric 'agent.googleapis.com/gpu/memory/bytes_used' \n | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n | every 5m\n | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)] }\n| ratio\n| mul (100)\n| cast_units ('%')\n| every 5m\n| condition val() > 0.9 '10^2.%'"
      }
    }
  ],
  "alertStrategy": {
    "autoClose": "604800s"
  },
  "combiner": "OR",
  "enabled": true
}
34 changes: 34 additions & 0 deletions alerts/google-gce/gpu-utilization-too-high-within-vm.v1.json
@@ -0,0 +1,34 @@
{
  "displayName": "VM Instance - High GPU Utilization (${INSTANCE_NAME})",
  "documentation": {
    "content": "This alert fires when the GPU utilization on the VM instance (${INSTANCE_NAME}) rises above 90% for 5 minutes or more.",
    "mimeType": "text/markdown"
  },
  "userLabels": {},
  "conditions": [
    {
      "displayName": "VM Instance - High GPU utilization (${INSTANCE_NAME})",
      "conditionThreshold": {
        "filter": "resource.type = \"gce_instance\" AND metric.type = \"agent.googleapis.com/gpu/utilization\" AND metadata.system_labels.name = \"${INSTANCE_NAME}\"",
        "aggregations": [
          {
            "alignmentPeriod": "300s",
            "crossSeriesReducer": "REDUCE_NONE",
            "perSeriesAligner": "ALIGN_MEAN"
          }
        ],
        "comparison": "COMPARISON_GT",
        "duration": "0s",
        "trigger": {
          "count": 1
        },
        "thresholdValue": 90
      }
    }
  ],
  "alertStrategy": {
    "autoClose": "604800s"
  },
  "combiner": "OR",
  "enabled": true
}
34 changes: 34 additions & 0 deletions alerts/google-gce/gpu-utilization-too-high.v1.json
@@ -0,0 +1,34 @@
{
  "displayName": "VM Instance - High GPU Utilization",
  "documentation": {
    "content": "This alert fires when the GPU utilization on any VM instance in the project rises above 90% for 5 minutes or more.",
    "mimeType": "text/markdown"
  },
  "userLabels": {},
  "conditions": [
    {
      "displayName": "VM Instance - High GPU utilization",
      "conditionThreshold": {
        "filter": "resource.type = \"gce_instance\" AND metric.type = \"agent.googleapis.com/gpu/utilization\"",
        "aggregations": [
          {
            "alignmentPeriod": "300s",
            "crossSeriesReducer": "REDUCE_NONE",
            "perSeriesAligner": "ALIGN_MEAN"
          }
        ],
        "comparison": "COMPARISON_GT",
        "duration": "0s",
        "trigger": {
          "count": 1
        },
        "thresholdValue": 90
      }
    }
  ],
  "alertStrategy": {
    "autoClose": "604800s"
  },
  "combiner": "OR",
  "enabled": true
}
28 changes: 28 additions & 0 deletions alerts/google-gce/metadata.yaml
@@ -69,3 +69,31 @@ alert_policy_templates:
    related_integrations:
      - id: gce
        platform: GCP
  - id: gpu-utilization-too-high
    description: "Monitors GPU utilization across all GCE VMs in the current project and will notify you if the GPU utilization on any VM instance rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on VMs to collect the GPU utilization metric."
    version: 1
    related_integrations:
      - id: gce
        platform: GCP
  - id: gpu-utilization-too-high-within-vm
    description: "Monitors GPU utilization in the specified GCE VM and will notify you if the GPU utilization rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on the VM to collect the GPU utilization metric."
    version: 1
    related_integrations:
      - id: gce
        platform: GCP
  - id: gpu-memory-utilization-too-high
    description: "Monitors GPU memory utilization across all GCE VMs in the current project and will notify you if the GPU memory utilization on any VM instance rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on VMs to collect the GPU memory utilization metric."
    version: 1
    related_integrations:
      - id: gce
        platform: GCP
  - id: gpu-memory-utilization-too-high-within-vm
    description: "Monitors GPU memory utilization in the specified GCE VM and will notify you if the GPU memory utilization rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on the VM to collect the GPU memory utilization metric."
    version: 1
    related_integrations:
      - id: gce
        platform: GCP
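
The two "-within-vm" templates above contain an ${INSTANCE_NAME} placeholder that has to be substituted before a policy can be created from them. A minimal sketch of that step, assuming Python; the instance name, output file, and the gcloud command in the final comment are illustrative assumptions, not part of this commit:

    import json
    from string import Template

    # Illustrative values (assumptions): adjust to your checkout and your VM's name.
    TEMPLATE_PATH = "alerts/google-gce/gpu-utilization-too-high-within-vm.v1.json"
    INSTANCE_NAME = "my-gpu-vm"

    with open(TEMPLATE_PATH) as f:
        policy_template = Template(f.read())

    # Fill in every ${INSTANCE_NAME} occurrence and confirm the result is still valid JSON.
    rendered = policy_template.substitute(INSTANCE_NAME=INSTANCE_NAME)
    policy = json.loads(rendered)
    print(policy["displayName"])  # "VM Instance - High GPU Utilization (my-gpu-vm)"

    out_path = "gpu-utilization-too-high-" + INSTANCE_NAME + ".json"
    with open(out_path, "w") as out:
        json.dump(policy, out, indent=2)

    # One possible way to create the rendered policy (not prescribed by this commit):
    #   gcloud alpha monitoring policies create --policy-from-file=gpu-utilization-too-high-my-gpu-vm.json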
