Skip to content

Commit

Permalink
Merge pull request #163 from PermanentOrg/per-9861_add_ec2_outage_notifications
Browse files Browse the repository at this point in the history

Add status check failure alerts to EC2 instances
  • Loading branch information
liam-lloyd authored Oct 7, 2024
2 parents 4029ae8 + 5165223 commit 22423c5
Show file tree
Hide file tree
Showing 3 changed files with 206 additions and 0 deletions.
68 changes: 68 additions & 0 deletions instances/dev/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,23 @@ resource "aws_instance" "api" {
}
}

# CloudWatch alarm on the EC2 status checks of the API instance.
# The same SNS topic is notified when the alarm fires and when it recovers.
resource "aws_cloudwatch_metric_alarm" "api_outage_alarm" {
  alarm_name      = "${var.perm_env.name}-api-instance-outage-alarm"
  actions_enabled = true

  # Metric under observation, scoped to the single API instance.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"
  dimensions = {
    InstanceId = aws_instance.api.id
  }

  # Alarm when the average over one 300-second period is >= 0.99.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 0.99

  # Notification targets for the ALARM and OK state transitions.
  alarm_actions = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
  ok_actions    = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
}

resource "aws_instance" "taskrunner" {
ami = module.perm_env_data.taskrunner_ami
instance_type = "c4.large"
Expand All @@ -56,6 +73,23 @@ resource "aws_instance" "taskrunner" {
}
}

# CloudWatch alarm on the EC2 status checks of the first taskrunner instance.
# NOTE(review): only aws_instance.taskrunner[0] is referenced here; any
# additional taskrunner instances in this environment would not be covered.
resource "aws_cloudwatch_metric_alarm" "taskrunner_outage_alarm" {
  alarm_name      = "${var.perm_env.name}-taskrunner-instance-outage-alarm"
  actions_enabled = true

  # Metric under observation, scoped to taskrunner instance 0.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"
  dimensions = {
    InstanceId = aws_instance.taskrunner[0].id
  }

  # Alarm when the average over one 300-second period is >= 0.99.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 0.99

  # Notification targets for the ALARM and OK state transitions.
  alarm_actions = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
  ok_actions    = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
}

resource "aws_instance" "cron" {
ami = module.perm_env_data.cron_ami
instance_type = "t2.micro"
Expand All @@ -68,6 +102,23 @@ resource "aws_instance" "cron" {
}
}

# CloudWatch alarm on the EC2 status checks of the cron instance.
# The same SNS topic is notified when the alarm fires and when it recovers.
resource "aws_cloudwatch_metric_alarm" "cron_outage_alarm" {
  alarm_name      = "${var.perm_env.name}-cron-instance-outage-alarm"
  actions_enabled = true

  # Metric under observation, scoped to the single cron instance.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"
  dimensions = {
    InstanceId = aws_instance.cron.id
  }

  # Alarm when the average over one 300-second period is >= 0.99.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 0.99

  # Notification targets for the ALARM and OK state transitions.
  alarm_actions = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
  ok_actions    = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
}

resource "aws_instance" "sftp" {
ami = module.perm_env_data.sftp_ami
instance_type = "c4.large"
Expand All @@ -81,6 +132,23 @@ resource "aws_instance" "sftp" {
}
}

# CloudWatch alarm on the EC2 status checks of the SFTP instance.
# The same SNS topic is notified when the alarm fires and when it recovers.
resource "aws_cloudwatch_metric_alarm" "sftp_outage_alarm" {
  alarm_name      = "${var.perm_env.name}-sftp-instance-outage-alarm"
  actions_enabled = true

  # Metric under observation, scoped to the single SFTP instance.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"
  dimensions = {
    InstanceId = aws_instance.sftp.id
  }

  # Alarm when the average over one 300-second period is >= 0.99.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 0.99

  # Notification targets for the ALARM and OK state transitions.
  alarm_actions = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
  ok_actions    = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
}

module "perm_env_data" {
source = "../modules/get-data"
perm_env = var.perm_env
Expand Down
69 changes: 69 additions & 0 deletions instances/production/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,23 @@ resource "aws_instance" "api" {
}
}

# CloudWatch alarm on the EC2 status checks of the API instance.
# The same SNS topic is notified when the alarm fires and when it recovers.
resource "aws_cloudwatch_metric_alarm" "api_outage_alarm" {
  alarm_name      = "${var.perm_env.name}-api-instance-outage-alarm"
  actions_enabled = true

  # Metric under observation, scoped to the single API instance.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"
  dimensions = {
    InstanceId = aws_instance.api.id
  }

  # Alarm when the average over one 300-second period is >= 0.99.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 0.99

  # Notification targets for the ALARM and OK state transitions.
  alarm_actions = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
  ok_actions    = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
}

resource "aws_instance" "taskrunner" {
ami = module.perm_env_data.taskrunner_ami
instance_type = "c4.xlarge"
Expand All @@ -56,6 +73,24 @@ resource "aws_instance" "taskrunner" {
}
}

# One CloudWatch status-check alarm per taskrunner instance.
# The alarm count is derived from the taskrunner instance list instead of a
# hard-coded number, so the two cannot drift apart when the fleet is resized.
# Alarm names embed the instance index (taskrunner0, taskrunner1, ...).
resource "aws_cloudwatch_metric_alarm" "taskrunner_outage_alarm" {
  count = length(aws_instance.taskrunner)

  alarm_name      = "${var.perm_env.name}-taskrunner${count.index}-instance-outage-alarm"
  actions_enabled = true

  # Metric under observation, scoped to the taskrunner with the same index.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"
  dimensions = {
    InstanceId = aws_instance.taskrunner[count.index].id
  }

  # Alarm when the average over one 300-second period is >= 0.99.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 0.99

  # Notification targets for the ALARM and OK state transitions.
  alarm_actions = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
  ok_actions    = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
}

resource "aws_instance" "cron" {
ami = module.perm_env_data.cron_ami
instance_type = "t2.micro"
Expand All @@ -68,6 +103,23 @@ resource "aws_instance" "cron" {
}
}

# CloudWatch alarm on the EC2 status checks of the cron instance.
# The same SNS topic is notified when the alarm fires and when it recovers.
resource "aws_cloudwatch_metric_alarm" "cron_outage_alarm" {
  alarm_name      = "${var.perm_env.name}-cron-instance-outage-alarm"
  actions_enabled = true

  # Metric under observation, scoped to the single cron instance.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"
  dimensions = {
    InstanceId = aws_instance.cron.id
  }

  # Alarm when the average over one 300-second period is >= 0.99.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 0.99

  # Notification targets for the ALARM and OK state transitions.
  alarm_actions = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
  ok_actions    = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
}

resource "aws_instance" "sftp" {
ami = module.perm_env_data.sftp_ami
instance_type = "m4.large"
Expand All @@ -81,6 +133,23 @@ resource "aws_instance" "sftp" {
}
}

# CloudWatch alarm on the EC2 status checks of the SFTP instance.
# The same SNS topic is notified when the alarm fires and when it recovers.
resource "aws_cloudwatch_metric_alarm" "sftp_outage_alarm" {
  alarm_name      = "${var.perm_env.name}-sftp-instance-outage-alarm"
  actions_enabled = true

  # Metric under observation, scoped to the single SFTP instance.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"
  dimensions = {
    InstanceId = aws_instance.sftp.id
  }

  # Alarm when the average over one 300-second period is >= 0.99.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 0.99

  # Notification targets for the ALARM and OK state transitions.
  alarm_actions = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
  ok_actions    = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
}

module "perm_env_data" {
source = "../modules/get-data"
perm_env = var.perm_env
Expand Down
69 changes: 69 additions & 0 deletions instances/staging/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,23 @@ resource "aws_instance" "api" {
}
}

# CloudWatch alarm on the EC2 status checks of the API instance.
# The same SNS topic is notified when the alarm fires and when it recovers.
resource "aws_cloudwatch_metric_alarm" "api_outage_alarm" {
  alarm_name      = "${var.perm_env.name}-api-instance-outage-alarm"
  actions_enabled = true

  # Metric under observation, scoped to the single API instance.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"
  dimensions = {
    InstanceId = aws_instance.api.id
  }

  # Alarm when the average over one 300-second period is >= 0.99.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 0.99

  # Notification targets for the ALARM and OK state transitions.
  alarm_actions = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
  ok_actions    = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
}

resource "aws_instance" "taskrunner" {
ami = module.perm_env_data.taskrunner_ami
instance_type = "c4.large"
Expand All @@ -56,6 +73,24 @@ resource "aws_instance" "taskrunner" {
}
}

# One CloudWatch status-check alarm per taskrunner instance.
# The alarm count is derived from the taskrunner instance list instead of a
# hard-coded number, so the two cannot drift apart when the fleet is resized.
# Alarm names embed the instance index (taskrunner0, taskrunner1, ...).
resource "aws_cloudwatch_metric_alarm" "taskrunner_outage_alarm" {
  count = length(aws_instance.taskrunner)

  alarm_name      = "${var.perm_env.name}-taskrunner${count.index}-instance-outage-alarm"
  actions_enabled = true

  # Metric under observation, scoped to the taskrunner with the same index.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"
  dimensions = {
    InstanceId = aws_instance.taskrunner[count.index].id
  }

  # Alarm when the average over one 300-second period is >= 0.99.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 0.99

  # Notification targets for the ALARM and OK state transitions.
  alarm_actions = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
  ok_actions    = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
}

resource "aws_instance" "cron" {
ami = module.perm_env_data.cron_ami
instance_type = "t2.micro"
Expand All @@ -68,6 +103,23 @@ resource "aws_instance" "cron" {
}
}

# CloudWatch alarm on the EC2 status checks of the cron instance.
# The same SNS topic is notified when the alarm fires and when it recovers.
resource "aws_cloudwatch_metric_alarm" "cron_outage_alarm" {
  alarm_name      = "${var.perm_env.name}-cron-instance-outage-alarm"
  actions_enabled = true

  # Metric under observation, scoped to the single cron instance.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"
  dimensions = {
    InstanceId = aws_instance.cron.id
  }

  # Alarm when the average over one 300-second period is >= 0.99.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 0.99

  # Notification targets for the ALARM and OK state transitions.
  alarm_actions = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
  ok_actions    = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
}

resource "aws_instance" "sftp" {
ami = module.perm_env_data.sftp_ami
instance_type = "m4.large"
Expand All @@ -81,6 +133,23 @@ resource "aws_instance" "sftp" {
}
}

# CloudWatch alarm on the EC2 status checks of the SFTP instance.
# The same SNS topic is notified when the alarm fires and when it recovers.
resource "aws_cloudwatch_metric_alarm" "sftp_outage_alarm" {
  alarm_name      = "${var.perm_env.name}-sftp-instance-outage-alarm"
  actions_enabled = true

  # Metric under observation, scoped to the single SFTP instance.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed"
  dimensions = {
    InstanceId = aws_instance.sftp.id
  }

  # Alarm when the average over one 300-second period is >= 0.99.
  statistic           = "Average"
  period              = 300
  evaluation_periods  = 1
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 0.99

  # Notification targets for the ALARM and OK state transitions.
  alarm_actions = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
  ok_actions    = ["arn:aws:sns:us-west-2:364159549467:ec2-outage-notifications"]
}

module "perm_env_data" {
source = "../modules/get-data"
perm_env = var.perm_env
Expand Down

0 comments on commit 22423c5

Please sign in to comment.