Skip to content

Commit

Permalink
BFD-3348: Alarm on SFTP Outbound Timeouts (#2260)
Browse files Browse the repository at this point in the history
  • Loading branch information
mjburling authored Mar 26, 2024
1 parent e572614 commit c94a999
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# `bft_eft_outbound_o11y` Submodule
# `bft_eft_outbound_o11y` Module

This Submodule contains the resources and Lambda source code for several Alarms and a Lambda that sends BFD EFT Outbound status notifications to a Slack channel configured in `base` configuration.
This module contains the resources and Lambda source code for several Alarms and a Lambda that sends BFD EFT Outbound status notifications to a Slack channel configured in the `base` configuration.

<!-- BEGIN_TF_DOCS -->
<!-- GENERATED WITH `terraform-docs .`
Expand Down Expand Up @@ -59,6 +59,7 @@ No outputs.
| Name | Type |
|------|------|
| [aws_cloudwatch_metric_alarm.lambda_errors](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.lambda_timeouts](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.sns_failures](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_iam_policy.slack_notifier_logs](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
| [aws_iam_role.slack_notifier](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
Expand All @@ -68,6 +69,7 @@ No outputs.
| [aws_sns_topic_subscription.sns_to_slack_notifier](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic_subscription) | resource |
| [archive_file.slack_notifier_src](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source |
| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
| [aws_lambda_function.outbound_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/lambda_function) | data source |
| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source |
| [aws_sns_topic.breach_topics](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/sns_topic) | data source |
| [aws_sns_topic.ok_topics](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/sns_topic) | data source |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ data "aws_sqs_queue" "outbound_lambda_dlq" {
name = var.outbound_lambda_dlq_name
}

# Looks up the outbound Lambda by name (var.outbound_lambda_name) so that its
# attributes can be referenced elsewhere in this module.
data "aws_lambda_function" "outbound_lambda" {
  function_name = var.outbound_lambda_name
}

data "aws_ssm_parameter" "slack_webhook" {
name = local.slack_webhook_ssm_path
with_decryption = true
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Alarm on timeouts of the outbound Lambda. It watches the maximum `Duration`
# of the function and goes into ALARM when that maximum reaches the function's
# configured timeout. The alarm name prefix, notification topics and the
# log-group/DLQ links are shared with the `lambda_errors` alarm configuration.
resource "aws_cloudwatch_metric_alarm" "lambda_timeouts" {
  alarm_name = "${local.alarms_config.lambda_errors.alarm_name}-timeout"

  alarm_description = join("", [
    "The ${var.outbound_lambda_name} has timed out in ${local.env}. View the ",
    "linked CloudWatch Log Group for more details on the failure, and inspect the failing event ",
    "in the linked DLQ",
    "\n",
    "\n* CloudWatch Log Group: <${local.alarms_config.lambda_errors.log_group_url}|${local.alarms_config.lambda_errors.log_group_name}>",
    "\n* Dead Letter Queue: <${local.alarms_config.lambda_errors.queue_url}|${var.outbound_lambda_dlq_name}>",
  ])

  # Metric under observation: Duration of the outbound Lambda.
  namespace   = local.lambda_metrics_namespace
  metric_name = "Duration"
  dimensions = {
    FunctionName = var.outbound_lambda_name
  }

  # `period` is expressed in seconds: 60 * 15 is a single 15-minute window, and
  # one breaching datapoint in that window is enough to raise the alarm.
  period              = 60 * 15
  evaluation_periods  = 1
  datapoints_to_alarm = 1
  statistic           = "Maximum"
  comparison_operator = "GreaterThanOrEqualToThreshold"

  # `threshold` is expressed in milliseconds; the data source's `timeout` is
  # multiplied by 1000 to convert it from seconds.
  threshold = data.aws_lambda_function.outbound_lambda.timeout * 1000

  # No datapoints in the window (no invocations) is not treated as a breach.
  treat_missing_data = "notBreaching"

  # Only attach notification actions when the corresponding topic is configured.
  alarm_actions = local.alarms_config.lambda_errors.breach_topic_arn != null ? [local.alarms_config.lambda_errors.breach_topic_arn] : null
  ok_actions    = local.alarms_config.lambda_errors.ok_topic_arn != null ? [local.alarms_config.lambda_errors.ok_topic_arn] : null
}

0 comments on commit c94a999

Please sign in to comment.