Skip to content

Commit

Permalink
Fix kubernetes filters (#52)
Browse files Browse the repository at this point in the history
* fix kubernetes filters

* update README

* fix replicaset
  • Loading branch information
Aohzan authored Feb 14, 2024
1 parent a449a13 commit 5e07bbb
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 16 deletions.
18 changes: 9 additions & 9 deletions caas/kubernetes/node/monitors-k8s-node.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ resource "datadog_monitor" "disk_pressure" {
type = "service check"

query = <<EOQ
"kubernetes_state.node.disk_pressure"${module.filter-tags.service_check}.by("node","kube_cluster_name").last(6).count_by_status()
"kubernetes_state.node.disk_pressure"${module.filter-tags.service_check}.by("kube_node","kube_cluster_name").last(6).count_by_status()
EOQ

monitor_thresholds {
Expand All @@ -32,7 +32,7 @@ resource "datadog_monitor" "disk_out" {
type = "service check"

query = <<EOQ
"kubernetes_state.node.out_of_disk"${module.filter-tags.service_check}.by("node","kube_cluster_name").last(6).count_by_status()
"kubernetes_state.node.out_of_disk"${module.filter-tags.service_check}.by("kube_node","kube_cluster_name").last(6).count_by_status()
EOQ

monitor_thresholds {
Expand All @@ -59,7 +59,7 @@ resource "datadog_monitor" "memory_pressure" {
type = "service check"

query = <<EOQ
"kubernetes_state.node.memory_pressure"${module.filter-tags.service_check}.by("node","kube_cluster_name").last(6).count_by_status()
"kubernetes_state.node.memory_pressure"${module.filter-tags.service_check}.by("kube_node","kube_cluster_name").last(6).count_by_status()
EOQ

monitor_thresholds {
Expand All @@ -86,7 +86,7 @@ resource "datadog_monitor" "ready" {
type = "service check"

query = <<EOQ
"kubernetes_state.node.ready"${module.filter-tags.service_check}.by("node","kube_cluster_name").last(6).count_by_status()
"kubernetes_state.node.ready"${module.filter-tags.service_check}.by("kube_node","kube_cluster_name").last(6).count_by_status()
EOQ

monitor_thresholds {
Expand Down Expand Up @@ -187,7 +187,7 @@ resource "datadog_monitor" "node_unschedulable" {

query = <<EOQ
${var.node_unschedulable_time_aggregator}(${var.node_unschedulable_timeframe}):
sum:kubernetes_state.node.status${module.filter-tags-unschedulable.query_alert} by {node,kube_cluster_name}
sum:kubernetes_state.node.status${module.filter-tags-unschedulable.query_alert} by {kube_node,kube_cluster_name}
> 0
EOQ

Expand Down Expand Up @@ -216,8 +216,8 @@ resource "datadog_monitor" "volume_space" {

query = <<EOQ
${var.volume_space_time_aggregator}(${var.volume_space_timeframe}):
avg:kubernetes.kubelet.volume.stats.used_bytes${module.filter-tags.query_alert} by {name,persistentvolumeclaim,kube_cluster_name} /
avg:kubernetes.kubelet.volume.stats.capacity_bytes${module.filter-tags.query_alert} by {name,persistentvolumeclaim,kube_cluster_name}
avg:kubernetes.kubelet.volume.stats.used_bytes${module.filter-tags.query_alert} by {persistentvolumeclaim,kube_cluster_name} /
avg:kubernetes.kubelet.volume.stats.capacity_bytes${module.filter-tags.query_alert} by {persistentvolumeclaim,kube_cluster_name}
* 100 > ${var.volume_space_threshold_critical}
EOQ

Expand Down Expand Up @@ -247,8 +247,8 @@ resource "datadog_monitor" "volume_inodes" {

query = <<EOQ
${var.volume_inodes_time_aggregator}(${var.volume_inodes_timeframe}):
avg:kubernetes.kubelet.volume.stats.inodes_used${module.filter-tags.query_alert} by {name,persistentvolumeclaim,kube_cluster_name} /
avg:kubernetes.kubelet.volume.stats.inodes${module.filter-tags.query_alert} by {name,persistentvolumeclaim,kube_cluster_name}
avg:kubernetes.kubelet.volume.stats.inodes_used${module.filter-tags.query_alert} by {persistentvolumeclaim,kube_cluster_name} /
avg:kubernetes.kubelet.volume.stats.inodes${module.filter-tags.query_alert} by {persistentvolumeclaim,kube_cluster_name}
* 100 > ${var.volume_inodes_threshold_critical}
EOQ

Expand Down
4 changes: 2 additions & 2 deletions caas/kubernetes/pod/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,13 @@ Creates DataDog monitors with the following checks:
| <a name="input_new_group_delay"></a> [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before monitoring a new group | `number` | `300` | no |
| <a name="input_new_host_delay"></a> [new\_host\_delay](#input\_new\_host\_delay) | Delay in seconds before monitoring a new host | `number` | `300` | no |
| <a name="input_notify_no_data"></a> [notify\_no\_data](#input\_notify\_no\_data) | Raises a no-data alert if set to true | `bool` | `true` | no |
| <a name="input_pod_group_by"></a> [pod\_group\_by](#input\_pod\_group\_by) | Select group by element on monitors (error and terminated) | `list` | <pre>[<br> "namespace",<br> "pod",<br> "reason",<br> "kube_cluster_name"<br>]</pre> | no |
| <a name="input_pod_group_by"></a> [pod\_group\_by](#input\_pod\_group\_by) | Select group by element on monitors (error and terminated) | `list` | <pre>[<br> "kube_namespace",<br> "pod_name",<br> "reason",<br> "kube_cluster_name"<br>]</pre> | no |
| <a name="input_pod_phase_status_enabled"></a> [pod\_phase\_status\_enabled](#input\_pod\_phase\_status\_enabled) | Flag to enable Pod phase status monitor | `string` | `"true"` | no |
| <a name="input_pod_phase_status_extra_tags"></a> [pod\_phase\_status\_extra\_tags](#input\_pod\_phase\_status\_extra\_tags) | Extra tags for Pod phase status monitor | `list(string)` | `[]` | no |
| <a name="input_pod_phase_status_message"></a> [pod\_phase\_status\_message](#input\_pod\_phase\_status\_message) | Custom message for Pod phase status monitor | `string` | `""` | no |
| <a name="input_pod_phase_status_time_aggregator"></a> [pod\_phase\_status\_time\_aggregator](#input\_pod\_phase\_status\_time\_aggregator) | Monitor aggregator for Pod phase status [available values: min, max or avg] | `string` | `"max"` | no |
| <a name="input_pod_phase_status_timeframe"></a> [pod\_phase\_status\_timeframe](#input\_pod\_phase\_status\_timeframe) | Monitor timeframe for Pod phase status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no |
| <a name="input_pod_status_group_by"></a> [pod\_status\_group\_by](#input\_pod\_status\_group\_by) | Select group by element on monitors (phase status) | `list` | <pre>[<br> "namespace",<br> "kube_cluster_name"<br>]</pre> | no |
| <a name="input_pod_status_group_by"></a> [pod\_status\_group\_by](#input\_pod\_status\_group\_by) | Select group by element on monitors (phase status) | `list` | <pre>[<br> "kube_namespace",<br> "kube_cluster_name"<br>]</pre> | no |
| <a name="input_prefix_slug"></a> [prefix\_slug](#input\_prefix\_slug) | Prefix string to prepend, between brackets, to every monitor name | `string` | `""` | no |
| <a name="input_tags"></a> [tags](#input\_tags) | Global variables | `list(string)` | <pre>[<br> "type:caas",<br> "provider:kubernetes",<br> "resource:kubernetes-pod"<br>]</pre> | no |
| <a name="input_team"></a> [team](#input\_team) | n/a | `string` | `"claranet"` | no |
Expand Down
4 changes: 2 additions & 2 deletions caas/kubernetes/pod/inputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -172,11 +172,11 @@ variable "terminated_threshold_warning" {
}

variable "pod_group_by" {
default = ["namespace", "pod", "reason", "kube_cluster_name"]
default = ["kube_namespace", "pod_name", "reason", "kube_cluster_name"]
description = "Select group by element on monitors (error and terminated)"
}

variable "pod_status_group_by" {
default = ["namespace", "kube_cluster_name"]
default = ["kube_namespace", "kube_cluster_name"]
description = "Select group by element on monitors (phase status)"
}
2 changes: 1 addition & 1 deletion caas/kubernetes/workload/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ Creates DataDog monitors with the following checks:
| <a name="input_replica_current_threshold_critical"></a> [replica\_current\_threshold\_critical](#input\_replica\_current\_threshold\_critical) | Current replica critical threshold | `number` | `1` | no |
| <a name="input_replica_current_time_aggregator"></a> [replica\_current\_time\_aggregator](#input\_replica\_current\_time\_aggregator) | Monitor aggregator for Current replica [available values: min, max or avg] | `string` | `"max"` | no |
| <a name="input_replica_current_timeframe"></a> [replica\_current\_timeframe](#input\_replica\_current\_timeframe) | Monitor timeframe for Current replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_15m"` | no |
| <a name="input_replica_group_by"></a> [replica\_group\_by](#input\_replica\_group\_by) | Select group by element on monitors | `list` | <pre>[<br> "namespace",<br> "replicaset",<br> "kube_cluster_name"<br>]</pre> | no |
| <a name="input_replica_group_by"></a> [replica\_group\_by](#input\_replica\_group\_by) | Select group by element on monitors | `list` | <pre>[<br> "kube_namespace",<br> "kube_replica_set",<br> "kube_cluster_name"<br>]</pre> | no |
| <a name="input_replica_ready_enabled"></a> [replica\_ready\_enabled](#input\_replica\_ready\_enabled) | Flag to enable Ready replica monitor | `string` | `"true"` | no |
| <a name="input_replica_ready_extra_tags"></a> [replica\_ready\_extra\_tags](#input\_replica\_ready\_extra\_tags) | Extra tags for Ready replica monitor | `list(string)` | `[]` | no |
| <a name="input_replica_ready_message"></a> [replica\_ready\_message](#input\_replica\_ready\_message) | Custom message for Ready replica monitor | `string` | `""` | no |
Expand Down
2 changes: 1 addition & 1 deletion caas/kubernetes/workload/inputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ variable "replica_current_threshold_critical" {
}

variable "replica_group_by" {
default = ["namespace", "replicaset", "kube_cluster_name"]
default = ["kube_namespace", "kube_replica_set", "kube_cluster_name"]
description = "Select group by element on monitors"
}

2 changes: 1 addition & 1 deletion caas/kubernetes/workload/monitors-k8s-workload.tf
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ resource "datadog_monitor" "cronjob" {
type = "service check"

query = <<EOQ
"kubernetes_state.cronjob.on_schedule_check"${module.filter-tags.service_check}.by("cronjob").last(6).count_by_status()
"kubernetes_state.cronjob.on_schedule_check"${module.filter-tags.service_check}.by("kube_cronjob").last(6).count_by_status()
EOQ

monitor_thresholds {
Expand Down

0 comments on commit 5e07bbb

Please sign in to comment.