From 5340a551fbbfe41952d55b8f2df9d07c2b31b8fe Mon Sep 17 00:00:00 2001 From: Xavier Basty Date: Thu, 24 Aug 2023 13:16:03 +0200 Subject: [PATCH] feat: add HTTP metrics to Grafana --- terraform/monitoring/dashboard.jsonnet | 44 +++++---- terraform/monitoring/dashboard.tf | 7 +- terraform/monitoring/grafonnet-lib | 2 +- .../panels/app/app_cpu_memory.libsonnet | 95 ------------------- terraform/monitoring/panels/app/cpu.libsonnet | 44 +++++++++ .../monitoring/panels/app/memory.libsonnet | 44 +++++++++ .../panels/app/nlb_target_resets.libsonnet | 33 ------- .../panels/docdb/available_memory.libsonnet | 15 +-- .../docdb/buffer_cache_hit_ratio.libsonnet | 8 +- .../panels/docdb/connections.libsonnet | 12 +-- .../monitoring/panels/docdb/cpu.libsonnet | 25 ++--- .../docdb/low_mem_op_throttled.libsonnet | 15 +-- .../monitoring/panels/docdb/volume.libsonnet | 8 +- .../panels/lb/active_connections.libsonnet | 24 +++++ .../monitoring/panels/lb/error_4xx.libsonnet | 67 +++++++++++++ .../monitoring/panels/lb/error_5xx.libsonnet | 54 +++++++++++ .../{app => lb}/healthy_hosts.libsonnet | 19 ++-- .../requests.libsonnet} | 23 ++--- terraform/monitoring/panels/panels.libsonnet | 26 +++-- 19 files changed, 337 insertions(+), 228 deletions(-) delete mode 100644 terraform/monitoring/panels/app/app_cpu_memory.libsonnet create mode 100644 terraform/monitoring/panels/app/cpu.libsonnet create mode 100644 terraform/monitoring/panels/app/memory.libsonnet delete mode 100644 terraform/monitoring/panels/app/nlb_target_resets.libsonnet create mode 100644 terraform/monitoring/panels/lb/active_connections.libsonnet create mode 100644 terraform/monitoring/panels/lb/error_4xx.libsonnet create mode 100644 terraform/monitoring/panels/lb/error_5xx.libsonnet rename terraform/monitoring/panels/{app => lb}/healthy_hosts.libsonnet (75%) rename terraform/monitoring/panels/{app/active_nlb_flows.libsonnet => lb/requests.libsonnet} (52%) diff --git a/terraform/monitoring/dashboard.jsonnet b/terraform/monitoring/dashboard.jsonnet index c1de396..a4a8ad3 100644 --- a/terraform/monitoring/dashboard.jsonnet +++ b/terraform/monitoring/dashboard.jsonnet @@ -2,6 +2,7 @@ local grafana = import 'grafonnet-lib/grafana.libsonnet'; local panels = import 'panels/panels.libsonnet'; local dashboard = grafana.dashboard; +local row = grafana.row; local ds = { prometheus: { @@ -14,10 +15,13 @@ local ds = { } }; local vars = { - notifications: std.parseJson(std.extVar('notifications')), + namespace: 'Keys', environment: std.extVar('environment'), + notifications: std.parseJson(std.extVar('notifications')), + ecs_service_name: std.extVar('ecs_service_name'), load_balancer: std.extVar('load_balancer'), + target_group: std.extVar('target_group'), docdb_cluster_id: std.extVar('docdb_cluster_id'), }; @@ -44,25 +48,25 @@ dashboard.new( }, ) ) -.addPanels( - grafana.layout.generate_grid([ - panels.app.app_cpu_memory(ds, vars) { gridPos: pos._2 }, - panels.app.healthy_hosts(ds, vars) { gridPos: pos._2 }, - panels.app.active_nlb_flows(ds, vars) { gridPos: pos._2 }, - panels.app.nlb_target_resets(ds, vars) { gridPos: pos._2 }, +.addPanels(grafana.layout.generate_grid([ + row.new('Application'), + panels.app.cpu(ds, vars) { gridPos: pos._2 }, + panels.app.memory(ds, vars) { gridPos: pos._2 }, - //////////////////////////////////////////////////////////////////////////// - grafana.panels.text( - content = '# DocumentDB', - transparent = true - ) { gridPos: pos.title }, + row.new('Load Balancer'), + panels.lb.active_connections(ds, vars) { gridPos: pos._2 }, + panels.lb.healthy_hosts(ds, vars) { gridPos: pos._2 }, - panels.docdb.cpu(ds, vars) { gridPos: pos._3 }, - panels.docdb.available_memory(ds, vars) { gridPos: pos._3 }, - panels.docdb.connections(ds, vars) { gridPos: pos._3 }, + panels.lb.requests(ds, vars) { gridPos: pos._3 }, + panels.lb.error_4xx(ds, vars) { gridPos: pos._3 }, + panels.lb.error_5xx(ds, vars) { gridPos: pos._3 }, - panels.docdb.low_mem_op_throttled(ds, vars) { gridPos: pos._3 }, - panels.docdb.volume(ds, vars) { gridPos: pos._3 }, - panels.docdb.buffer_cache_hit_ratio(ds, vars) { gridPos: pos._3 }, - ]) -) + row.new('DocumentDB'), + panels.docdb.cpu(ds, vars) { gridPos: pos._3 }, + panels.docdb.available_memory(ds, vars) { gridPos: pos._3 }, + panels.docdb.connections(ds, vars) { gridPos: pos._3 }, + + panels.docdb.low_mem_op_throttled(ds, vars) { gridPos: pos._3 }, + panels.docdb.volume(ds, vars) { gridPos: pos._3 }, + panels.docdb.buffer_cache_hit_ratio(ds, vars) { gridPos: pos._3 }, +])) diff --git a/terraform/monitoring/dashboard.tf b/terraform/monitoring/dashboard.tf index 7ab951b..11f1d5b 100644 --- a/terraform/monitoring/dashboard.tf +++ b/terraform/monitoring/dashboard.tf @@ -8,11 +8,12 @@ data "jsonnet_file" "dashboard" { prometheus_uid = grafana_data_source.prometheus.uid cloudwatch_uid = grafana_data_source.cloudwatch.uid - notifications = jsonencode(var.notification_channels) - environment = module.this.stage + environment = module.this.stage + notifications = jsonencode(var.notification_channels) + ecs_service_name = var.ecs_service_name - target_group = var.ecs_target_group_arn load_balancer = var.load_balancer_arn + target_group = var.ecs_target_group_arn docdb_cluster_id = var.keystore_cluster_id } } diff --git a/terraform/monitoring/grafonnet-lib b/terraform/monitoring/grafonnet-lib index cf551c3..343c394 160000 --- a/terraform/monitoring/grafonnet-lib +++ b/terraform/monitoring/grafonnet-lib @@ -1 +1 @@ -Subproject commit cf551c34583643cf3e52afc0549a40d8079a786e +Subproject commit 343c39494ec8f1561af5c158010dafc71fbb1351 diff --git a/terraform/monitoring/panels/app/app_cpu_memory.libsonnet b/terraform/monitoring/panels/app/app_cpu_memory.libsonnet deleted file mode 100644 index bf59af5..0000000 --- a/terraform/monitoring/panels/app/app_cpu_memory.libsonnet +++ /dev/null @@ -1,95 +0,0 @@ -local grafana = import '../../grafonnet-lib/grafana.libsonnet'; -local panels = grafana.panels; -local targets = grafana.targets; -local alert = grafana.alert; -local alertCondition = grafana.alertCondition; - -local defaults = import '../defaults.libsonnet'; - -local _configuration = defaults.configuration.timeseries_resource - .withUnit('percent') - .withSoftLimit( - axisSoftMin = 0, - axisSoftMax = 30, - ); - -local cpu_alert(vars) = alert.new( - name = "%s Keys-Server App CPU/Memory alert" % vars.environment, - message = "%s Keys-Server App CPU/Memory" % vars.environment, - period = '25m', - frequency = '1m', - conditions = [ - alertCondition.new( - evaluatorParams = [ 50 ], - evaluatorType = 'gt', - operatorType = 'or', - queryRefId = 'CPU_Avg', - queryTimeStart = '25m', - reducerType = 'max', - ), - alertCondition.new( - evaluatorParams = [ 50 ], - evaluatorType = 'gt', - operatorType = 'or', - queryRefId = 'Mem_Avg', - queryTimeStart = '25m', - reducerType = 'max', - ), - ] -); - -{ - new(ds, vars):: - panels.timeseries( - title = 'App CPU/Memory', - datasource = ds.cloudwatch, - ) - .configure(_configuration) - .setAlert(cpu_alert(vars)) - - .addTarget(targets.cloudwatch( - refId = 'CPU_Max', - alias = 'CPU (Max)', - datasource = ds.cloudwatch, - namespace = 'AWS/ECS', - metricName = 'CPUUtilization', - statistic = 'Maximum', - dimensions = { - ServiceName: vars.ecs_service_name - }, - )) - .addTarget(targets.cloudwatch( - refId = 'CPU_Avg', - alias = 'CPU (Avg)', - datasource = ds.cloudwatch, - namespace = 'AWS/ECS', - metricName = 'CPUUtilization', - statistic = 'Average', - dimensions = { - ServiceName: vars.ecs_service_name - }, - )) - - .addTarget(targets.cloudwatch( - refId = 'Mem_Max', - alias = 'Memory (Max)', - datasource = ds.cloudwatch, - namespace = 'AWS/ECS', - metricName = 'MemoryUtilization', - statistic = 'Maximum', - dimensions = { - ServiceName: vars.ecs_service_name - }, - )) - .addTarget(targets.cloudwatch( - refId = 'Mem_Avg', - alias = 'Memory (Avg)', - datasource = ds.cloudwatch, - namespace = 'AWS/ECS', - metricName = 'MemoryUtilization', - statistic = 'Average', - dimensions = { - ServiceName: vars.ecs_service_name - }, - )) -} diff --git a/terraform/monitoring/panels/app/cpu.libsonnet b/terraform/monitoring/panels/app/cpu.libsonnet new file mode 100644 index 0000000..22ae5b0 --- /dev/null +++ b/terraform/monitoring/panels/app/cpu.libsonnet @@ -0,0 +1,44 @@ +local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; + +local panels = grafana.panels; +local targets = grafana.targets; +local overrides = defaults.overrides; + +{ + new(ds, vars):: + panels.timeseries( + title = 'CPU Utilization', + datasource = ds.cloudwatch, + ) + .configure(overrides.cpu(defaults.configuration.timeseries_resource)) + .setAlert(defaults.alerts.cpu( + namespace = vars.namespace, + title = 'ECS', + env = vars.environment, + notifications = vars.notifications, + )) + + .addTarget(targets.cloudwatch( + alias = 'CPU (Max)', + datasource = ds.cloudwatch, + dimensions = { + ServiceName: vars.ecs_service_name + }, + metricName = 'CPUUtilization', + namespace = 'AWS/ECS', + statistic = 'Maximum', + refId = 'CPU_Max', + )) + .addTarget(targets.cloudwatch( + alias = 'CPU (Avg)', + datasource = ds.cloudwatch, + dimensions = { + ServiceName: vars.ecs_service_name + }, + metricName = 'CPUUtilization', + namespace = 'AWS/ECS', + statistic = 'Average', + refId = 'CPU_Avg', + )) +} diff --git a/terraform/monitoring/panels/app/memory.libsonnet b/terraform/monitoring/panels/app/memory.libsonnet new file mode 100644 index 0000000..045f0b0 --- /dev/null +++ b/terraform/monitoring/panels/app/memory.libsonnet @@ -0,0 +1,44 @@ +local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; + +local panels = grafana.panels; +local targets = grafana.targets; + +{ + new(ds, vars):: + panels.timeseries( + title = 'Memory Utilization', + datasource = ds.cloudwatch, + ) + .configure(defaults.overrides.memory(defaults.configuration.timeseries_resource)) + + .setAlert(defaults.alerts.memory( + namespace = vars.namespace, + title = 'ECS', + env = vars.environment, + notifications = vars.notifications, + )) + + .addTarget(targets.cloudwatch( + alias = 'Memory (Max)', + datasource = ds.cloudwatch, + namespace = 'AWS/ECS', + metricName = 'MemoryUtilization', + dimensions = { + ServiceName: vars.ecs_service_name + }, + statistic = 'Maximum', + refId = 'Mem_Max', + )) + .addTarget(targets.cloudwatch( + alias = 'Memory (Avg)', + datasource = ds.cloudwatch, + namespace = 'AWS/ECS', + metricName = 'MemoryUtilization', + dimensions = { + ServiceName: vars.ecs_service_name + }, + statistic = 'Average', + refId = 'Mem_Avg', + )) +} diff --git a/terraform/monitoring/panels/app/nlb_target_resets.libsonnet b/terraform/monitoring/panels/app/nlb_target_resets.libsonnet deleted file mode 100644 index 3d881b7..0000000 --- a/terraform/monitoring/panels/app/nlb_target_resets.libsonnet +++ /dev/null @@ -1,33 +0,0 @@ -local grafana = import '../../grafonnet-lib/grafana.libsonnet'; -local panels = grafana.panels; -local targets = grafana.targets; - -local defaults = import '../defaults.libsonnet'; - -local _configuration = defaults.configuration.timeseries_tr80 - .withSoftLimit( - axisSoftMin = 0, - axisSoftMax = 250, - ); - - -{ - new(ds, vars):: - panels.timeseries( - title = 'NLB Target Resets', - description = "When the NLB has connection failures to the targets then these jump. We for instance had this when we had a too low file descriptor limit.", - datasource = ds.cloudwatch, - ) - .configure(_configuration) - .addTarget(targets.cloudwatch( - alias = 'LB-0', - datasource = ds.cloudwatch, - namespace = 'AWS/NetworkELB', - metricName = 'TCP_Target_Reset_Count', - statistic = 'Sum', - dimensions = { - LoadBalancer: vars.load_balancer - }, - matchExact = true, - )) -} diff --git a/terraform/monitoring/panels/docdb/available_memory.libsonnet b/terraform/monitoring/panels/docdb/available_memory.libsonnet index ac5c7c8..3e3df16 100644 --- a/terraform/monitoring/panels/docdb/available_memory.libsonnet +++ b/terraform/monitoring/panels/docdb/available_memory.libsonnet @@ -1,11 +1,11 @@ local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; + local panels = grafana.panels; local targets = grafana.targets; local alert = grafana.alert; local alertCondition = grafana.alertCondition; -local defaults = import '../defaults.libsonnet'; - local mem_threshold = 4000000000; // 4GiB local max_memory = 16000000000; // 16GiB (AWS DocDB max on db.r6g.large) @@ -45,12 +45,13 @@ local _configuration = defaults.configuration.timeseries local mem_alert(vars) = alert.new( - name = "%s Keys-Server DocumentDB Freeable Memory Alert" % vars.environment, - message = "%s Keys-Server DocumentDB Freeable Memory" % vars.environment, - period = '5m', - frequency = '1m', + namespace = vars.namespace, + name = "%s DocumentDB Freeable Memory Alert" % vars.environment, + message = "%s DocumentDB Freeable Memory" % vars.environment, + period = '5m', + frequency = '1m', notifications = vars.notifications, - conditions = [ + conditions = [ alertCondition.new( evaluatorParams = [ mem_threshold ], evaluatorType = 'lt', diff --git a/terraform/monitoring/panels/docdb/buffer_cache_hit_ratio.libsonnet b/terraform/monitoring/panels/docdb/buffer_cache_hit_ratio.libsonnet index 3a02b42..7bfe9e0 100644 --- a/terraform/monitoring/panels/docdb/buffer_cache_hit_ratio.libsonnet +++ b/terraform/monitoring/panels/docdb/buffer_cache_hit_ratio.libsonnet @@ -1,8 +1,8 @@ -local grafana = import '../../grafonnet-lib/grafana.libsonnet'; -local panels = grafana.panels; -local targets = grafana.targets; +local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; -local defaults = import '../defaults.libsonnet'; +local panels = grafana.panels; +local targets = grafana.targets; local _configuration = defaults.configuration.timeseries .withUnit('percent') diff --git a/terraform/monitoring/panels/docdb/connections.libsonnet b/terraform/monitoring/panels/docdb/connections.libsonnet index 9cfc1e6..ecbc466 100644 --- a/terraform/monitoring/panels/docdb/connections.libsonnet +++ b/terraform/monitoring/panels/docdb/connections.libsonnet @@ -1,10 +1,8 @@ -local grafana = import '../../grafonnet-lib/grafana.libsonnet'; -local panels = grafana.panels; -local targets = grafana.targets; +local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; -local defaults = import '../defaults.libsonnet'; - -local _configuration = defaults.configuration.timeseries; +local panels = grafana.panels; +local targets = grafana.targets; { new(ds, vars):: @@ -12,7 +10,7 @@ local _configuration = defaults.configuration.timeseries; title = 'Database Connections', datasource = ds.cloudwatch, ) - .configure(_configuration) + .configure(defaults.configuration.timeseries) .addTarget(targets.cloudwatch( alias = 'Database Connections', diff --git a/terraform/monitoring/panels/docdb/cpu.libsonnet b/terraform/monitoring/panels/docdb/cpu.libsonnet index 537d004..7815cb1 100644 --- a/terraform/monitoring/panels/docdb/cpu.libsonnet +++ b/terraform/monitoring/panels/docdb/cpu.libsonnet @@ -1,26 +1,19 @@ local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; + local panels = grafana.panels; local targets = grafana.targets; local alert = grafana.alert; local alertCondition = grafana.alertCondition; -local defaults = import '../defaults.libsonnet'; - -local _configuration = defaults.configuration.timeseries_resource - .withUnit('percent') - .withSoftLimit( - axisSoftMin = 0, - axisSoftMax = 30, - ); - - local cpu_alert(vars) = alert.new( - name = "%s Keys-Server DocumentDB CPU alert" % vars.environment, - message = "%s Keys-Server DocumentDB CPU alert" % vars.environment, - period = '5m', - frequency = '1m', + namespace = vars.namespace, + name = "%s DocumentDB CPU alert" % vars.environment, + message = "%s DocumentDB CPU alert" % vars.environment, + period = '5m', + frequency = '1m', notifications = vars.notifications, - conditions = [ + conditions = [ alertCondition.new( evaluatorParams = [ 50 ], evaluatorType = 'gt', @@ -39,7 +32,7 @@ local cpu_alert(vars) = alert.new( title = 'CPU Utilization', datasource = ds.cloudwatch, ) - .configure(_configuration) + .configure(defaults.configuration.timeseries_resource) .setAlert(cpu_alert(vars)) .addTarget(targets.cloudwatch( diff --git a/terraform/monitoring/panels/docdb/low_mem_op_throttled.libsonnet b/terraform/monitoring/panels/docdb/low_mem_op_throttled.libsonnet index e7f63ed..f74e19d 100644 --- a/terraform/monitoring/panels/docdb/low_mem_op_throttled.libsonnet +++ b/terraform/monitoring/panels/docdb/low_mem_op_throttled.libsonnet @@ -1,11 +1,11 @@ local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; + local panels = grafana.panels; local targets = grafana.targets; local alert = grafana.alert; local alertCondition = grafana.alertCondition; -local defaults = import '../defaults.libsonnet'; - local ops_threshold = 2; local _configuration = defaults.configuration.timeseries @@ -31,12 +31,13 @@ local _configuration = defaults.configuration.timeseries local ops_alert(vars) = alert.new( - name = "%s Keys-Server DocumentDB LowMem Num Operations Throttled Alert" % vars.environment, - message = "%s Keys-Server DocumentDB LowMem Num Operations Throttled" % vars.environment, - period = '5m', - frequency = '1m', + namespace = vars.namespace, + name = "%s DocumentDB LowMem Num Operations Throttled Alert" % vars.environment, + message = "%s DocumentDB LowMem Num Operations Throttled" % vars.environment, + period = '5m', + frequency = '1m', notifications = vars.notifications, - conditions = [ + conditions = [ alertCondition.new( evaluatorParams = [ ops_threshold ], evaluatorType = 'gt', diff --git a/terraform/monitoring/panels/docdb/volume.libsonnet b/terraform/monitoring/panels/docdb/volume.libsonnet index 48ef2cb..cf819e1 100644 --- a/terraform/monitoring/panels/docdb/volume.libsonnet +++ b/terraform/monitoring/panels/docdb/volume.libsonnet @@ -1,8 +1,8 @@ -local grafana = import '../../grafonnet-lib/grafana.libsonnet'; -local panels = grafana.panels; -local targets = grafana.targets; +local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; -local defaults = import '../defaults.libsonnet'; +local panels = grafana.panels; +local targets = grafana.targets; local _configuration = defaults.configuration.timeseries .withUnit('decbytes') diff --git a/terraform/monitoring/panels/lb/active_connections.libsonnet b/terraform/monitoring/panels/lb/active_connections.libsonnet new file mode 100644 index 0000000..cae7e22 --- /dev/null +++ b/terraform/monitoring/panels/lb/active_connections.libsonnet @@ -0,0 +1,24 @@ +local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; + +local panels = grafana.panels; +local targets = grafana.targets; + +{ + new(ds, vars):: + panels.timeseries( + title = 'Active Connections', + datasource = ds.cloudwatch, + ) + .configure(defaults.configuration.timeseries) + + .addTarget(targets.cloudwatch( + datasource = ds.cloudwatch, + namespace = 'AWS/ApplicationELB', + metricName = 'ActiveConnectionCount', + dimensions = { + LoadBalancer: vars.load_balancer + }, + statistic = 'Average', + )) +} diff --git a/terraform/monitoring/panels/lb/error_4xx.libsonnet b/terraform/monitoring/panels/lb/error_4xx.libsonnet new file mode 100644 index 0000000..15ae59f --- /dev/null +++ b/terraform/monitoring/panels/lb/error_4xx.libsonnet @@ -0,0 +1,67 @@ +local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; + +local panels = grafana.panels; +local targets = grafana.targets; + +local threshold = 100; + +local _configuration = defaults.configuration.timeseries + .withSoftLimit( + axisSoftMin = 0, + axisSoftMax = threshold * 1.2, + ) + .withThresholdStyle(grafana.fieldConfig.thresholdStyle.dashed) + .addThreshold({ + color : defaults.values.colors.critical, + value : threshold, + }); + +{ + new(ds, vars):: + panels.timeseries( + title = '4XX', + datasource = ds.cloudwatch, + ) + .configure( + defaults.configuration.timeseries + .withSoftLimit( + axisSoftMin = 0, + axisSoftMax = threshold * 1.2, + ) + .withThresholdStyle(grafana.fieldConfig.thresholdStyle.dashed) + .addThreshold({ + color : defaults.values.colors.critical, + value : threshold, + }) + ) + .addPanelThreshold( + op = 'gt', + value = threshold, + ) + + .addTarget(targets.cloudwatch( + alias = 'ELB', + datasource = ds.cloudwatch, + namespace = 'AWS/ApplicationELB', + metricName = 'HTTPCode_ELB_4XX_Count', + dimensions = { + LoadBalancer: vars.load_balancer + }, + matchExact = true, + statistic = 'Sum', + refId = 'ELB', + )) + .addTarget(targets.cloudwatch( + alias = 'Target', + datasource = ds.cloudwatch, + namespace = 'AWS/ApplicationELB', + metricName = 'HTTPCode_Target_4XX_Count', + dimensions = { + LoadBalancer: vars.load_balancer + }, + matchExact = true, + statistic = 'Sum', + refId = 'Target', + )) +} diff --git a/terraform/monitoring/panels/lb/error_5xx.libsonnet b/terraform/monitoring/panels/lb/error_5xx.libsonnet new file mode 100644 index 0000000..594c047 --- /dev/null +++ b/terraform/monitoring/panels/lb/error_5xx.libsonnet @@ -0,0 +1,54 @@ +local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; + +local panels = grafana.panels; +local targets = grafana.targets; + +local threshold = 100; + +local _configuration = defaults.configuration.timeseries + .withSoftLimit( + axisSoftMin = 0, + axisSoftMax = threshold * 1.2, + ) + .withThresholdStyle(grafana.fieldConfig.thresholdStyle.dashed) + .addThreshold({ + color : defaults.values.colors.critical, + value : threshold, + }); + +{ + new(ds, vars):: + panels.timeseries( + title = '5XX', + datasource = ds.cloudwatch, + ) + .configure(_configuration) + .addPanelThreshold( + op = 'gt', + value = threshold, + ) + + .addTarget(targets.cloudwatch( + alias = 'ELB', + datasource = ds.cloudwatch, + namespace = 'AWS/ApplicationELB', + metricName = 'HTTPCode_ELB_5XX_Count', + dimensions = { + LoadBalancer: vars.load_balancer + }, + statistic = 'Sum', + refId = 'ELB', + )) + .addTarget(targets.cloudwatch( + alias = 'Target', + datasource = ds.cloudwatch, + namespace = 'AWS/ApplicationELB', + metricName = 'HTTPCode_Target_5XX_Count', + dimensions = { + LoadBalancer: vars.load_balancer + }, + statistic = 'Sum', + refId = 'Target', + )) +} diff --git a/terraform/monitoring/panels/app/healthy_hosts.libsonnet b/terraform/monitoring/panels/lb/healthy_hosts.libsonnet similarity index 75% rename from terraform/monitoring/panels/app/healthy_hosts.libsonnet rename to terraform/monitoring/panels/lb/healthy_hosts.libsonnet index 4763b77..67c9952 100644 --- a/terraform/monitoring/panels/app/healthy_hosts.libsonnet +++ b/terraform/monitoring/panels/lb/healthy_hosts.libsonnet @@ -1,9 +1,9 @@ local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; + local panels = grafana.panels; local targets = grafana.targets; -local defaults = import '../defaults.libsonnet'; - local _configuration = defaults.configuration.timeseries .withSoftLimit( axisSoftMin = 0, @@ -19,16 +19,18 @@ local _configuration = defaults.configuration.timeseries .configure(_configuration) .addTarget(targets.cloudwatch( - alias = 'Hosts Count', - metricQueryType = grafana.target.cloudwatch.metricQueryTypes.query, datasource = ds.cloudwatch, - namespace = 'AWS/NetworkELB', - metricName = 'HealthyHostCount', + metricQueryType = grafana.target.cloudwatch.metricQueryTypes.query, + dimensions = { + TargetGroup: vars.target_group + }, + metricName = 'HealthyHostCount', + namespace = 'AWS/ApplicationELB', sql = { from: { property: { - name: "AWS/NetworkELB", + name: "AWS/ApplicationELB", type: "string" }, type: "property" @@ -60,6 +62,7 @@ local _configuration = defaults.configuration.timeseries type: "and" } }, - sqlExpression = "SELECT MAX(HealthyHostCount) FROM \"AWS/NetworkELB\" WHERE LoadBalancer = '%s'" % [vars.load_balancer], + sqlExpression = "SELECT MAX(HealthyHostCount) FROM \"AWS/ApplicationELB\" WHERE LoadBalancer = '%s'" % [vars.load_balancer], + statistic = 'Maximum', )) } diff --git a/terraform/monitoring/panels/app/active_nlb_flows.libsonnet b/terraform/monitoring/panels/lb/requests.libsonnet similarity index 52% rename from terraform/monitoring/panels/app/active_nlb_flows.libsonnet rename to terraform/monitoring/panels/lb/requests.libsonnet index 5da4109..e5d6b66 100644 --- a/terraform/monitoring/panels/app/active_nlb_flows.libsonnet +++ b/terraform/monitoring/panels/lb/requests.libsonnet @@ -1,30 +1,27 @@ local grafana = import '../../grafonnet-lib/grafana.libsonnet'; +local defaults = import '../../grafonnet-lib/defaults.libsonnet'; + local panels = grafana.panels; local targets = grafana.targets; -local defaults = import '../defaults.libsonnet'; - -local _configuration = defaults.configuration.timeseries - .withSoftLimit( - axisSoftMin = 0 - ); - { new(ds, vars):: panels.timeseries( - title = 'Active NLB Flows', + title = 'Requests', datasource = ds.cloudwatch, ) - .configure(_configuration) + .configure(defaults.configuration.timeseries) + .addTarget(targets.cloudwatch( - alias = 'LB-0', + alias = 'Requests', datasource = ds.cloudwatch, - namespace = 'AWS/NetworkELB', - metricName = 'ActiveFlowCount_TLS', - statistic = 'Maximum', + namespace = 'AWS/ApplicationELB', + metricName = 'RequestCount', dimensions = { LoadBalancer: vars.load_balancer }, matchExact = true, + statistic = 'Sum', + refId = 'Requests', )) } diff --git a/terraform/monitoring/panels/panels.libsonnet b/terraform/monitoring/panels/panels.libsonnet index 0442b37..219a8d2 100644 --- a/terraform/monitoring/panels/panels.libsonnet +++ b/terraform/monitoring/panels/panels.libsonnet @@ -1,17 +1,23 @@ { app: { - app_cpu_memory: (import 'app/app_cpu_memory.libsonnet' ).new, - healthy_hosts: (import 'app/healthy_hosts.libsonnet' ).new, - active_nlb_flows: (import 'app/active_nlb_flows.libsonnet' ).new, - nlb_target_resets: (import 'app/nlb_target_resets.libsonnet' ).new, + cpu: (import 'app/cpu.libsonnet').new, + memory: (import 'app/memory.libsonnet').new, + }, + + lb: { + active_connections: (import 'lb/active_connections.libsonnet').new, + error_4xx: (import 'lb/error_4xx.libsonnet').new, + error_5xx: (import 'lb/error_5xx.libsonnet').new, + healthy_hosts: (import 'lb/healthy_hosts.libsonnet').new, + requests: (import 'lb/requests.libsonnet').new, }, docdb: { - buffer_cache_hit_ratio: (import 'docdb/buffer_cache_hit_ratio.libsonnet' ).new, - cpu: (import 'docdb/cpu.libsonnet' ).new, - volume: (import 'docdb/volume.libsonnet' ).new, - available_memory: (import 'docdb/available_memory.libsonnet' ).new, - connections: (import 'docdb/connections.libsonnet' ).new, - low_mem_op_throttled: (import 'docdb/low_mem_op_throttled.libsonnet' ).new, + buffer_cache_hit_ratio: (import 'docdb/buffer_cache_hit_ratio.libsonnet').new, + cpu: (import 'docdb/cpu.libsonnet').new, + volume: (import 'docdb/volume.libsonnet').new, + available_memory: (import 'docdb/available_memory.libsonnet').new, + connections: (import 'docdb/connections.libsonnet').new, + low_mem_op_throttled: (import 'docdb/low_mem_op_throttled.libsonnet').new, }, }