-
Notifications
You must be signed in to change notification settings - Fork 2
/
cluster_error.kql
47 lines (42 loc) · 2.52 KB
/
cluster_error.kql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// List every DatabricksClusters record whose Response payload contains an errorMessage.
DatabricksClusters
// parse_json() yields a dynamic value; cast it so ErrorMessage is a string column,
// as in the other queries in this file.
| extend ErrorMessage = tostring(parse_json(Response).errorMessage)
// Keep only the records that actually carry an error message.
| where isnotempty(ErrorMessage)
// Count errors per distinct error message and per one-hour bucket.
// Output columns: ErrorMessage, DayHour (start of the hour), ErrorCount.
DatabricksClusters
| extend ErrorMessage = tostring(parse_json(Response).errorMessage)
| where isnotempty(ErrorMessage)
// count() takes no arguments; it counts the rows in each group.
| summarize ErrorCount = count() by ErrorMessage, DayHour = bin(TimeGenerated, 1h)
// Newest hour first (desc is also the default sort direction).
| order by DayHour desc
// Classify errors into coarse categories, then count them per day, workspace and category.
// Output columns: Day (start of the day), ADBWorkspaceName, ErrorShort, ErrorCount.
DatabricksClusters
| extend ErrorMessage = tostring(parse_json(Response).errorMessage)
| where isnotempty(ErrorMessage)
// The workspace name is taken from the last "/"-separated segment of _ResourceId;
// index -1 addresses the last element of the array returned by split().
| extend ADBWorkspaceName = tostring(split(_ResourceId, "/")[-1])
// case() returns the label paired with the first matching predicate; messages that match
// none of the patterns keep their full text. The category list mirrors the
// per-workspace classification query in this file so both reports group errors identically.
| extend ErrorShort = case(
    ErrorMessage contains "quota", "Core quota issue",
    ErrorMessage contains "unexpected state Pending", "Cluster unexpected state",
    ErrorMessage contains "Slow instance", "Pool issue",
    ErrorMessage contains "INTERNAL_ERROR: Could not launch cluster due to cloud provider failures. instance_id:", "Cloud provider failure",
    ErrorMessage contains "can be allocated from the pool before its max capacity of", "Pool capacity",
    ErrorMessage)
// count() takes no arguments; it counts the rows in each group.
| summarize ErrorCount = count() by Day = bin(TimeGenerated, 1d), ADBWorkspaceName, ErrorShort
// Newest day first (desc is also the default sort direction).
| order by Day desc
// Classify errors into coarse categories, then count them per workspace and category
// (no time bucketing: one row per workspace/category pair).
// Output columns: ADBWorkspaceName, ErrorShort, ErrorCount.
DatabricksClusters
| extend ErrorMessage = tostring(parse_json(Response).errorMessage)
| where isnotempty(ErrorMessage)
// The workspace name is taken from the last "/"-separated segment of _ResourceId;
// index -1 addresses the last element of the array returned by split().
| extend ADBWorkspaceName = tostring(split(_ResourceId, "/")[-1])
// case() returns the label paired with the first matching predicate; messages that match
// none of the patterns keep their full text.
| extend ErrorShort = case(
    ErrorMessage contains "quota", "Core quota issue",
    ErrorMessage contains "unexpected state Pending", "Cluster unexpected state",
    ErrorMessage contains "Slow instance", "Pool issue",
    ErrorMessage contains "INTERNAL_ERROR: Could not launch cluster due to cloud provider failures. instance_id:", "Cloud provider failure",
    ErrorMessage contains "can be allocated from the pool before its max capacity of", "Pool capacity",
    ErrorMessage)
// count() takes no arguments; it counts the rows in each group.
| summarize ErrorCount = count() by ADBWorkspaceName, ErrorShort
// Count errors per workspace and per one-day bucket.
// Output columns: ADBWorkspaceName, Day (start of the day), ErrorCount.
DatabricksClusters
| extend ErrorMessage = tostring(parse_json(Response).errorMessage)
// Filter first so the workspace name is only computed for error records.
| where isnotempty(ErrorMessage)
// The workspace name is taken from the last "/"-separated segment of _ResourceId;
// index -1 addresses the last element of the array returned by split().
| extend ADBWorkspaceName = tostring(split(_ResourceId, "/")[-1])
// count() takes no arguments; it counts the rows in each group.
| summarize ErrorCount = count() by ADBWorkspaceName, Day = bin(TimeGenerated, 1d)
// Newest day first (desc is also the default sort direction).
| order by Day desc