Skip to content

Commit

Permalink
Merge pull request #3 from GoogleCloudPlatform/master
Browse files Browse the repository at this point in the history
Merge with head
  • Loading branch information
google-nalin authored Aug 16, 2023
2 parents 87cb9af + 8abd38c commit 5535830
Show file tree
Hide file tree
Showing 22 changed files with 289 additions and 59 deletions.
18 changes: 18 additions & 0 deletions alerts/google-cloud-chronicle/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Alerts for Chronicle

### Silent Forwarder

This alert policy detects the absence of data from the Chronicle collector with collector_id = 10479925-878c-11e7-9421-10604b7cb5c1 over a 1-hour window. Such an absence generally indicates an issue with the Chronicle collector and requires further investigation.

### All silent Chronicle forwarder and logtype combinations

This alert policy fires an alert every time a Chronicle forwarder goes silent for a log type. For example, if 4 forwarders are set up and each supplies 5 log types, up to 20 alerts can fire (one for each forwarder and log type combination). Similarly, if a single Chronicle forwarder that supplies 5 log types goes down, 5 alerts will be active.

### All silent Chronicle forwarder and logtype combinations except few logtypes

This alert policy is similar to the one above, except that it does not fire alerts for the excluded log types. In this template, it does not fire alerts if Chronicle forwarders stop sending logs for BIND_DNS, CS_DETECTS, or BRO_DNS.


### Forwarder buffer usage threshold

This alert policy sends out alerts when any Chronicle forwarder collecting logs from pcap has mean memory buffer usage above 1% over a 1-hour window.
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"displayName": "sample policy to detect all silent Chronicle forwarder and logtype combinations except few logtypes",
"conditions": [
{
"displayName": "chronicle forwarder and logtypes silent for 1 hour except few",
"conditionAbsent": {
"aggregations": [
{
"alignmentPeriod": "3600s",
"crossSeriesReducer": "REDUCE_MEAN",
"groupByFields": [
"resource.label.collector_id",
"resource.label.log_type"
],
"perSeriesAligner": "ALIGN_DELTA"
}
],
"duration": "3600s",
"filter": "resource.type = \"chronicle.googleapis.com/Collector\" AND resource.labels.log_type != one_of(\"BIND_DNS\", \"BRO_DNS\", \"CS_DETECTS\") AND metric.type = \"chronicle.googleapis.com/ingestion/log/record_count\"",
"trigger": {
"count": 1
}
}
}
],
"combiner": "OR",
"enabled": true
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"displayName": "sample policy to detect all silent Chronicle forwarder and logtype combinations",
"conditions": [
{
"displayName": "chronicle forwarder and logtypes silent for 1 hour",
"conditionAbsent": {
"aggregations": [
{
"alignmentPeriod": "3600s",
"crossSeriesReducer": "REDUCE_MEAN",
"groupByFields": [
"resource.label.collector_id",
"resource.label.log_type"
],
"perSeriesAligner": "ALIGN_DELTA"
}
],
"duration": "3600s",
"filter": "resource.type = \"chronicle.googleapis.com/Collector\" AND metric.type = \"chronicle.googleapis.com/ingestion/log/record_count\"",
"trigger": {
"count": 1
}
}
}
],
"combiner": "OR",
"enabled": true
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"displayName": "sample policy to detect forwarder mean buffer used is more than 1% over a 1 hour window for input type pcap and buffer type memory",
"conditions": [
{
"displayName": "forwarder mean buffer used is more than 1% over 1 hour window",
"conditionThreshold": {
"aggregations": [
{
"alignmentPeriod": "3600s",
"crossSeriesReducer": "REDUCE_MEAN",
"groupByFields": [
"resource.label.project_id"
],
"perSeriesAligner": "ALIGN_MEAN"
}
],
"comparison": "COMPARISON_GT",
"duration": "0s",
"filter": "resource.type = \"chronicle.googleapis.com/Collector\" AND metric.type = \"chronicle.googleapis.com/forwarder/buffer_used\" AND (metric.labels.input_type = \"pcap\" AND metric.labels.buffer_type = \"memory\")",
"thresholdValue": 0.01,
"trigger": {
"count": 1
}
}
}
],
"combiner": "OR",
"enabled": true
}
29 changes: 29 additions & 0 deletions alerts/google-cloud-chronicle/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
alert_policy_templates:
-
id: silent-forwarder
description: "sample policy to detect a single silent Chronicle forwarder using collector_id filter"
version: 1
related_integrations:
- id: chronicle_security
platform: GCP
-
id: forwarder-buffer-usage-more-than-threshold-with-filters
description: "sample policy to detect forwarder mean buffer used is more than 1% over a 1 hour window for input type pcap and buffer type memory"
version: 1
related_integrations:
- id: chronicle_security
platform: GCP
-
id: all-silent-forwarder-logtype-combinations-except-few-logtypes
description: "sample policy to detect all silent Chronicle forwarder and logtype combinations except few logtypes"
version: 1
related_integrations:
- id: chronicle_security
platform: GCP
-
id: all-silent-forwarder-logtype-combinations
description: "sample policy to detect all silent Chronicle forwarder and logtype combinations"
version: 1
related_integrations:
- id: chronicle_security
platform: GCP
27 changes: 27 additions & 0 deletions alerts/google-cloud-chronicle/silent-forwarder.v1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"displayName": "sample policy to detect a single silent Chronicle forwarder using collector_id filter",
"conditions": [
{
"displayName": "chronicle forwarder silent for 1 hour",
"conditionAbsent": {
"aggregations": [
{
"alignmentPeriod": "3600s",
"crossSeriesReducer": "REDUCE_MEAN",
"groupByFields": [
"resource.label.project_id"
],
"perSeriesAligner": "ALIGN_DELTA"
}
],
"duration": "3600s",
"filter": "resource.type = \"chronicle.googleapis.com/Collector\" AND resource.labels.collector_id = \"10479925-878c-11e7-9421-10604b7cb5c1\" AND metric.type = \"chronicle.googleapis.com/ingestion/log/record_count\"",
"trigger": {
"count": 1
}
}
}
],
"combiner": "OR",
"enabled": true
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
"userLabels": {
"context": "${CONTEXT}",
"resource_type": "${RESOURCE_TYPE}",
"instance_id": "${INSTANCE_NAME}"
"project_id": "${PROJECT_ID}",
"region": "${REGION}",
"instance_id": "${INSTANCE_ID}"
},
"conditions": [
{
Expand Down
4 changes: 3 additions & 1 deletion alerts/google-cloud-redis/standard-instance-failover.v1.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
"userLabels": {
"context": "${CONTEXT}",
"resource_type": "${RESOURCE_TYPE}",
"instance_id": "${INSTANCE_NAME}"
"project_id": "${PROJECT_ID}",
"region": "${REGION}",
"instance_id": "${INSTANCE_ID}"
},
"conditions": [
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
"userLabels": {
"context": "${CONTEXT}",
"resource_type": "${RESOURCE_TYPE}",
"instance_id": "${INSTANCE_NAME}"
"project_id": "${PROJECT_ID}",
"region": "${REGION}",
"instance_id": "${INSTANCE_ID}"
},
"conditions": [
{
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ sample_dashboards:
-
category: Nginx
id: nginx-ingress-prometheus
display_name: Nginx Ingress
display_name: Nginx Ingress Prometheus Overview
description: |-
This dashboard has charts displaying: Controller Request Volume, Controller Connections, Config Reloads, Ingress Request Rate, Ingress Request Volume, Network I/O Pressure, Memory Usage, and CPU Usage
related_integrations:
Expand Down
5 changes: 0 additions & 5 deletions dashboards/nvidia-gpu/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,6 @@ sample_dashboards:
display_name: NVIDIA GPU Monitoring Overview (GCE & GKE)
description: |-
Displays GPU metrics for both GKE Nodes and GCE VMs. GPU metrics for the GCE VMs require the Ops Agent to be installed.
related_integrations:
- id: nvml
platform: GCE
- id: dcgm
platform: GCE
-
category: NVIDIA GPUs
id: nvidia-dcgm
Expand Down
10 changes: 5 additions & 5 deletions dashboards/nvidia-gpu/nvidia-dcgm.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"plotType": "LINE",
"targetAxis": "Y1",
"timeSeriesQuery": {
"timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/dcgm.gpu.pipe_utilization'\n"
"timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/dcgm.gpu.profiling.pipe_utilization'\n"
}
}
],
Expand Down Expand Up @@ -46,7 +46,7 @@
"plotType": "LINE",
"targetAxis": "Y1",
"timeSeriesQuery": {
"timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/dcgm.gpu.pcie_traffic_rate'\n| cast_units(\"By/s\")"
"timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/dcgm.gpu.profiling.pcie_traffic_rate'\n| cast_units(\"By/s\")"
}
}
],
Expand Down Expand Up @@ -74,7 +74,7 @@
"plotType": "LINE",
"targetAxis": "Y1",
"timeSeriesQuery": {
"timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/dcgm.gpu.sm_utilization'\n"
"timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/dcgm.gpu.profiling.sm_utilization'\n"
}
}
],
Expand Down Expand Up @@ -122,7 +122,7 @@
"plotType": "LINE",
"targetAxis": "Y1",
"timeSeriesQuery": {
"timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/dcgm.gpu.sm_occupancy'\n"
"timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/dcgm.gpu.profiling.sm_occupancy'\n"
}
}
],
Expand Down Expand Up @@ -150,7 +150,7 @@
"plotType": "LINE",
"targetAxis": "Y1",
"timeSeriesQuery": {
"timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/dcgm.gpu.nvlink_traffic_rate'\n| cast_units(\"By/s\")"
"timeSeriesQueryLanguage": "fetch gce_instance\n| metric 'workload.googleapis.com/dcgm.gpu.profiling.nvlink_traffic_rate'\n| cast_units(\"By/s\")"
}
}
],
Expand Down
24 changes: 13 additions & 11 deletions integrations/airflow/documentation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,22 @@ app_name_short: Airflow
app_name: Apache {{app_name_short}}
app_site_name: Airflow
app_site_url: https://airflow.apache.org/
exporter_name: the Airflow exporter
exporter_pkg_name: airflow
exporter_name: StatsD
exporter_pkg_name: statsd
exporter_repo_url: https://airflow.apache.org/docs/apache-airflow/stable/logging-monitoring/metrics.html
additional_prereq_info: |
{{exporter_name}} exposes Prometheus-format metrics automatically; you do not have to
install it separately. To verify that {{exporter_name}} is emitting metrics on the expected
endpoints, set up port-forwarding with the following command:
The official {{app_name_short}} [Helm chart](https://airflow.apache.org/docs/helm-chart/){:class=external}
includes a {{exporter_name}} deployment that exposes Prometheus-format metrics automatically.
To verify that {{exporter_name}} is emitting metrics on the expected endpoints, do the following:
0. Set up port forwarding by using the following command:
<pre class="devsite-click-to-copy">
kubectl -n {{namespace_name}} port-forward deploy/airflow-statsd 9102
kubectl -n {{namespace_name}} port-forward deploy/<var>AIRFLOW_RELEASE_NAME</var>-statsd 9102
</pre>
Access the endpoint `localhost:9102/metrics` by using the browser or curl in another terminal session
to verify that the metrics are being exposed by the exporter for scraping.
0. Access the endpoint `localhost:9102/metrics` by using the browser
or the `curl` utility in another terminal session.
dashboard_available: true
multiple_dashboards: false
dashboard_display_name: {{app_name_short}} Prometheus Overview
Expand All @@ -40,9 +42,9 @@ podmonitoring_config: |
component: statsd
release: airflow
additional_podmonitoring_info: |
Ensure that the values of the `port` and `matchLabels` fields match those of the {{app_name_short}} pods you want to monitor.
The labels and values shown here are set by default when Airflow is deployed with
[Helm](https://airflow.apache.org/docs/helm-chart/){:class=external}.
Ensure that the values of the `port` and `matchLabels` fields match those of the {{exporter_name}} pods you want to monitor.
The labels and values shown here are set by default when {{app_name_short}} is
deployed with [Helm](https://airflow.apache.org/docs/helm-chart/){:class=external}.
sample_promql_query: up{job="airflow", cluster="{{cluster_name}}", namespace="{{namespace_name}}"}
alerts_config: |
apiVersion: monitoring.googleapis.com/v1
Expand Down
9 changes: 9 additions & 0 deletions integrations/cassandra/ops_agent_metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ platforms:
- name: jsonPayload.timeStopping
type: string
description: Seconds the JVM took to stop threads before garbage collection
- name: jsonPayload.level
type: string
description: Level of the log entry
- name: jsonPayload.pid
type: string
description: Process ID of the Java process logging the message
- name: jsonPayload.tid
type: string
description: Thread ID of the Java process logging the message
- name: severity
type: string
description: ''
Expand Down
23 changes: 22 additions & 1 deletion integrations/elasticsearch/ops_agent_metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ platforms:
patch: 0
metrics_minimum_supported_version:
major: 2
minor: 21
minor: 32
patch: 0
detections:
- characteristic_log:
Expand Down Expand Up @@ -298,3 +298,24 @@ platforms:
kind: GAUGE
labels:
- state
- name: workload.googleapis.com/elasticsearch.cluster.in_flight_fetch
value_type: INT64
kind: GAUGE
labels: []
- name: workload.googleapis.com/elasticsearch.cluster.pending_tasks
value_type: INT64
kind: GAUGE
labels: []
- name: workload.googleapis.com/elasticsearch.node.cache.count
value_type: INT64
kind: GAUGE
labels:
- type
- name: workload.googleapis.com/elasticsearch.node.fs.disk.free
value_type: INT64
kind: GAUGE
labels: []
- name: workload.googleapis.com/elasticsearch.node.fs.disk.total
value_type: INT64
kind: GAUGE
labels: []
Loading

0 comments on commit 5535830

Please sign in to comment.