From abf99db95dcf03d91bf8a053033171f48c2de8ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 13 Jun 2024 16:40:17 +0100 Subject: [PATCH] Remove Grafana monitoring service (#5750) --- README.md | 2 - images/dockerfiles/nginx/grafana.nginx.conf | 35 ----- images/terraform/ecr.tf | 13 -- monitoring/README.md | 9 -- monitoring/terraform/efs.tf | 51 -------- monitoring/terraform/main.tf | 20 --- monitoring/terraform/stack/dns.tf | 18 --- monitoring/terraform/stack/ecs.tf | 3 - monitoring/terraform/stack/iam_role_policy.tf | 28 ---- monitoring/terraform/stack/load_balancer.tf | 65 ---------- monitoring/terraform/stack/provider.tf | 8 -- monitoring/terraform/stack/secrets.tf | 27 ---- monitoring/terraform/stack/security_groups.tf | 73 ----------- monitoring/terraform/stack/service.tf | 121 ------------------ monitoring/terraform/stack/variables.tf | 34 ----- 15 files changed, 507 deletions(-) delete mode 100644 images/dockerfiles/nginx/grafana.nginx.conf delete mode 100644 monitoring/terraform/efs.tf delete mode 100644 monitoring/terraform/main.tf delete mode 100644 monitoring/terraform/stack/dns.tf delete mode 100644 monitoring/terraform/stack/ecs.tf delete mode 100644 monitoring/terraform/stack/iam_role_policy.tf delete mode 100644 monitoring/terraform/stack/load_balancer.tf delete mode 100644 monitoring/terraform/stack/provider.tf delete mode 100644 monitoring/terraform/stack/secrets.tf delete mode 100644 monitoring/terraform/stack/security_groups.tf delete mode 100644 monitoring/terraform/stack/service.tf delete mode 100644 monitoring/terraform/stack/variables.tf diff --git a/README.md b/README.md index 73a9aa94..39f86e4c 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,6 @@ Wellcome Collection common infrastructure. - [cloudfront](cloudfront/README.md): Managing the infrastructure for Wellcome Collection's CloudFront distributions & DNS. -- [monitoring](monitoring/README.md): Grafana platform monitoring stack. - - **photography_backups**: Backup storage for photography (needs cleanup?) ## No longer in this repo diff --git a/images/dockerfiles/nginx/grafana.nginx.conf b/images/dockerfiles/nginx/grafana.nginx.conf deleted file mode 100644 index 567151fe..00000000 --- a/images/dockerfiles/nginx/grafana.nginx.conf +++ /dev/null @@ -1,35 +0,0 @@ -worker_processes 1; - -events { worker_connections 1024; } - -http { - server { - listen 9000; - - # Ensure that requests sent over HTTP are redirected to HTTPS. - # http://scottwb.com/blog/2013/10/28/always-on-https-with-nginx-behind-an-elb/ - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header Host $http_host; - proxy_redirect off; - proxy_next_upstream error; - - if ($http_x_forwarded_proto != "https") { - set $https_redirect "1"; - } - - if ($http_user_agent != "ELB-HealthChecker/2.0") { - set $https_redirect "${https_redirect}1"; - } - - if ($https_redirect = "11") { - rewrite ^(.*)$ https://monitoring.wellcomecollection.org$1 permanent; - } - - add_header Strict-Transport-Security "max-age=31536000; includeSubDomains;"; - - location / { - proxy_pass http://app:3000; - } - } -} diff --git a/images/terraform/ecr.tf b/images/terraform/ecr.tf index f1964d81..4ef72bc0 100644 --- a/images/terraform/ecr.tf +++ b/images/terraform/ecr.tf @@ -49,19 +49,6 @@ module "ecr_nginx_frontend_identity" { } } -module "ecr_nginx_grafana" { - source = "./repo_pair" - - namespace = local.namespace - repo_name = "nginx_grafana" - - description = "An nginx image for reverse proxying Grafana" - - providers = { - aws.ecr_public = aws.ecr_public - } -} - module "ecr_nginx_apigw" { source = "./repo_pair" diff --git a/monitoring/README.md b/monitoring/README.md index 6506cbb1..567553d5 100644 --- a/monitoring/README.md +++ b/monitoring/README.md @@ -1,13 +1,4 @@ # Monitoring - -## Grafana dashboard - -We have a [Grafana][grafana] dashboard for monitoring load tests, queue sizes, and our AWS bill, among other things. - -It can be viewed at (note this is only accessible from within the Wellcome IP range). - -[grafana]: https://grafana.com/ - ## Slack alarms We have Slack alerts for certain failures across the platform, which are published into the [#wc-platform-alerts channel][slack]. diff --git a/monitoring/terraform/efs.tf b/monitoring/terraform/efs.tf deleted file mode 100644 index 8a8dc962..00000000 --- a/monitoring/terraform/efs.tf +++ /dev/null @@ -1,51 +0,0 @@ -resource "aws_efs_file_system" "efs" { - creation_token = "grafana_efs" - performance_mode = "generalPurpose" -} - -resource "aws_efs_mount_target" "mount_target" { - count = length(local.private_subnets) - file_system_id = aws_efs_file_system.efs.id - subnet_id = local.private_subnets[count.index] - security_groups = [aws_security_group.efs_mnt.id] -} - -resource "aws_security_group" "efs_mnt" { - description = "security groupt for efs mounts" - vpc_id = local.vpc_id - name = "grafana_efs_sg" - - ingress { - protocol = "tcp" - from_port = 2049 - to_port = 2049 - - security_groups = [ - aws_security_group.efs_security_group.id, - ] - } -} - -resource "aws_security_group" "efs_security_group" { - name = "${local.namespace}_efs_security_group" - description = "Allow traffic between services and efs" - vpc_id = local.vpc_id - - ingress { - from_port = 0 - to_port = 0 - protocol = "-1" - self = true - } - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } - - tags = { - Name = "${local.namespace}-efs" - } -} diff --git a/monitoring/terraform/main.tf b/monitoring/terraform/main.tf deleted file mode 100644 index fdd89cc1..00000000 --- a/monitoring/terraform/main.tf +++ /dev/null @@ -1,20 +0,0 @@ -module "monitoring-271118" { - source = "./stack" - - grafana_version = "9.5.2" - namespace = "monitoring-271118" - - efs_id = aws_efs_file_system.efs.id - efs_security_group_id = aws_security_group.efs_security_group.id - ec_privatelink_security_group_id = local.ec_privatelink_sg_id - - domain = "monitoring.wellcomecollection.org" - - vpc_id = local.vpc_id - public_subnets = local.public_subnets - private_subnets = local.private_subnets - - providers = { - aws.dns = aws.dns - } -} diff --git a/monitoring/terraform/stack/dns.tf b/monitoring/terraform/stack/dns.tf deleted file mode 100644 index 4e76378a..00000000 --- a/monitoring/terraform/stack/dns.tf +++ /dev/null @@ -1,18 +0,0 @@ -resource "aws_route53_record" "monitoring_wc_org" { - provider = aws.dns - - zone_id = data.aws_route53_zone.dotorg.zone_id - name = var.domain - type = "A" - - alias { - name = aws_alb.alb.dns_name - zone_id = aws_alb.alb.zone_id - evaluate_target_health = true - } -} - -data "aws_route53_zone" "dotorg" { - provider = aws.dns - name = "wellcomecollection.org." -} diff --git a/monitoring/terraform/stack/ecs.tf b/monitoring/terraform/stack/ecs.tf deleted file mode 100644 index b72dbeff..00000000 --- a/monitoring/terraform/stack/ecs.tf +++ /dev/null @@ -1,3 +0,0 @@ -resource "aws_ecs_cluster" "cluster" { - name = var.namespace -} diff --git a/monitoring/terraform/stack/iam_role_policy.tf b/monitoring/terraform/stack/iam_role_policy.tf deleted file mode 100644 index 367a1c23..00000000 --- a/monitoring/terraform/stack/iam_role_policy.tf +++ /dev/null @@ -1,28 +0,0 @@ -# Grafana - -resource "aws_iam_role_policy" "ecs_grafana_task_cloudwatch_read" { - role = module.task_definition.task_role_name - policy = data.aws_iam_policy_document.read_cloudwatch_metrics.json -} - -data "aws_iam_policy_document" "read_cloudwatch_metrics" { - statement { - actions = [ - "cloudwatch:DescribeAlarmHistory", - "cloudwatch:DescribeAlarms", - "cloudwatch:DescribeAlarmsForMetric", - "cloudwatch:GetMetricData", - "cloudwatch:GetMetricStatistics", - "cloudwatch:ListMetrics", - ] - - resources = [ - "*", - ] - } - - statement { - actions = ["oam:ListSinks", "oam:ListAttachedLinks"] - resources = ["*"] - } -} diff --git a/monitoring/terraform/stack/load_balancer.tf b/monitoring/terraform/stack/load_balancer.tf deleted file mode 100644 index c5d818c2..00000000 --- a/monitoring/terraform/stack/load_balancer.tf +++ /dev/null @@ -1,65 +0,0 @@ -resource "aws_alb" "alb" { - # This name can only contain alphanumerics and hyphens - name = "${replace(var.namespace, "_", "-")}-grafana" - - subnets = var.public_subnets - - security_groups = [ - aws_security_group.service_lb_security_group.id, - aws_security_group.external_lb_security_group.id, - ] -} - -resource "aws_alb_target_group" "grafana_ecs_service" { - name = "monitoring-grafana" - - target_type = "ip" - protocol = "HTTP" - port = local.container_port - vpc_id = var.vpc_id - - health_check { - protocol = "HTTP" - path = "/api/health" - matcher = "200" - } -} - -resource "aws_alb_listener" "https" { - load_balancer_arn = aws_alb.alb.id - port = "443" - protocol = "HTTPS" - ssl_policy = "ELBSecurityPolicy-2015-05" - certificate_arn = module.cert.arn - - default_action { - target_group_arn = aws_alb_target_group.grafana_ecs_service.arn - type = "forward" - } -} - -resource "aws_alb_listener" "http" { - load_balancer_arn = aws_alb.alb.id - port = "80" - protocol = "HTTP" - - default_action { - type = "redirect" - redirect { - port = "443" - protocol = "HTTPS" - status_code = "HTTP_301" - } - } -} - -module "cert" { - source = "github.com/wellcomecollection/terraform-aws-acm-certificate?ref=v2.0.0" - - domain_name = var.domain - zone_id = data.aws_route53_zone.dotorg.id - - providers = { - aws.dns = aws.dns - } -} diff --git a/monitoring/terraform/stack/provider.tf b/monitoring/terraform/stack/provider.tf deleted file mode 100644 index c47d4300..00000000 --- a/monitoring/terraform/stack/provider.tf +++ /dev/null @@ -1,8 +0,0 @@ -terraform { - required_providers { - aws = { - source = "hashicorp/aws" - configuration_aliases = [aws.dns] - } - } -} diff --git a/monitoring/terraform/stack/secrets.tf b/monitoring/terraform/stack/secrets.tf deleted file mode 100644 index bc1867f1..00000000 --- a/monitoring/terraform/stack/secrets.tf +++ /dev/null @@ -1,27 +0,0 @@ -resource "aws_secretsmanager_secret" "grafana_admin_password" { - name = "monitoring/${var.namespace}/grafana/admin_password" -} - -resource "aws_secretsmanager_secret_version" "grafana_admin_password" { - secret_id = aws_secretsmanager_secret.grafana_admin_password.id - secret_string = random_password.grafana_admin_password.result -} - -resource "random_password" "grafana_admin_password" { - length = 20 -} - -locals { - # These are configured outside of terraform - external_grafana_secrets = toset([ - "azure_application_id", - "azure_client_secret", - "azure_auth_url", - "azure_token_url", - ]) -} - -resource "aws_secretsmanager_secret" "external_grafana_secrets" { - for_each = local.external_grafana_secrets - name = "monitoring/${var.namespace}/grafana/${each.key}" -} diff --git a/monitoring/terraform/stack/security_groups.tf b/monitoring/terraform/stack/security_groups.tf deleted file mode 100644 index 4b583c86..00000000 --- a/monitoring/terraform/stack/security_groups.tf +++ /dev/null @@ -1,73 +0,0 @@ -resource "aws_security_group" "service_egress_security_group" { - name = "${var.namespace}-grafana_service_egress_security_group" - description = "Allow the service to make network requests" - vpc_id = var.vpc_id - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } - - tags = { - Name = "${var.namespace}-grafana-egress" - } -} - -resource "aws_security_group" "service_lb_security_group" { - name = "${var.namespace}-grafana_service_lb_security_group" - description = "Allow traffic between services and load balancer" - vpc_id = var.vpc_id - - ingress { - protocol = "tcp" - from_port = 3000 - to_port = 3000 - self = true - } - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } - - tags = { - Name = "${var.namespace}-grafana-service-lb" - } -} - -resource "aws_security_group" "external_lb_security_group" { - name = "${var.namespace}-grafana_external_lb_security_group" - description = "Allow traffic between load balancer and internet" - vpc_id = var.vpc_id - - ingress { - protocol = "tcp" - from_port = 443 - to_port = 443 - - cidr_blocks = ["0.0.0.0/0"] - } - - ingress { - protocol = "tcp" - from_port = 80 - to_port = 80 - - cidr_blocks = ["0.0.0.0/0"] - } - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } - - tags = { - Name = "${var.namespace}-grafana-external-lb" - } -} diff --git a/monitoring/terraform/stack/service.tf b/monitoring/terraform/stack/service.tf deleted file mode 100644 index c48dbfd4..00000000 --- a/monitoring/terraform/stack/service.tf +++ /dev/null @@ -1,121 +0,0 @@ -locals { - container_port = 3000 - container_name = "app" - efs_volume_name = "efs" - - grafana_env = { - GF_SERVER_DOMAIN = var.domain - GF_SERVER_ROOT_URL = "https://${var.domain}/" - GF_SECURITY_ADMIN_USER = "admin" - GF_USERS_AUTO_ASSIGN_ORG_ROLE = "Editor" - # This is used to fetch the current pipeline date - # https://grafana.github.io/grafana-json-datasource/ - GF_INSTALL_PLUGINS = "marcusolsson-json-datasource" - # See https://grafana.com/docs/grafana/v9.3/setup-grafana/configure-security/configure-authentication/azuread/#enable-azure-ad-oauth-in-grafana - GF_AUTH_AZUREAD_NAME = "Azure AD" - GF_AUTH_AZUREAD_SCOPES = "openid email profile offline_access" - GF_AUTH_AZUREAD_ENABLED = true - GF_AUTH_AZUREAD_ALLOW_SIGN_UP = true - GF_AUTH_AZUREAD_AUTO_LOGIN = false - GF_AUTH_AZUREAD_ROLE_ATTRIBUTE_STRICT = false - GF_AUTH_AZUREAD_ALLOW_ASSIGN_GRAFANA_ADMIN = false - GF_AUTH_AZUREAD_USE_PKCE = true - } - grafana_secrets = { - GF_SECURITY_ADMIN_PASSWORD = "monitoring/${var.namespace}/grafana/admin_password" - GF_AUTH_AZUREAD_CLIENT_ID = "monitoring/${var.namespace}/grafana/azure_application_id" - GF_AUTH_AZUREAD_CLIENT_SECRET = "monitoring/${var.namespace}/grafana/azure_client_secret" - GF_AUTH_AZUREAD_AUTH_URL = "monitoring/${var.namespace}/grafana/azure_auth_url" - GF_AUTH_AZUREAD_TOKEN_URL = "monitoring/${var.namespace}/grafana/azure_token_url" - } -} - -module "log_router_container" { - source = "github.com/wellcomecollection/terraform-aws-ecs-service//modules/firelens?ref=v3.13.2" - - namespace = "${var.namespace}-grafana" - use_privatelink_endpoint = true -} - -module "log_router_container_secrets_permissions" { - source = "github.com/wellcomecollection/terraform-aws-ecs-service//modules/secrets?ref=v3.13.2" - secrets = module.log_router_container.shared_secrets_logging - role_name = module.task_definition.task_execution_role_name -} - -module "grafana_app_container" { - source = "github.com/wellcomecollection/terraform-aws-ecs-service//modules/container_definition?ref=v3.13.2" - - name = "app" - image = "grafana/grafana-oss:${var.grafana_version}" - - mount_points = [{ - containerPath = "/var/lib/grafana" - sourceVolume = local.efs_volume_name - }] - - environment = local.grafana_env - secrets = local.grafana_secrets - - port_mappings = [{ - containerPort = local.container_port - hostPort = local.container_port - protocol = "tcp" - }] - - log_configuration = module.log_router_container.container_log_configuration -} - -module "app_permissions" { - source = "git::github.com/wellcomecollection/terraform-aws-ecs-service.git//modules/secrets?ref=v3.13.2" - secrets = local.grafana_secrets - role_name = module.task_definition.task_execution_role_name -} - -module "task_definition" { - source = "github.com/wellcomecollection/terraform-aws-ecs-service//modules/task_definition?ref=v3.13.2" - - task_name = "${var.namespace}-grafana" - launch_types = ["FARGATE"] - - // Grafana previously ran on a t2.small (1vCPU/2GB) - // That was probably too large but its CPU usage is very bursty - // and unfortunately Fargate is not burstable (t2s are), so - // we're trying this as a compromise between latency and cost - cpu = 512 - memory = 1024 - - container_definitions = [ - module.grafana_app_container.container_definition, - module.log_router_container.container_definition - ] - - efs_volumes = [{ - name = local.efs_volume_name - file_system_id = var.efs_id - root_directory = "/grafana" - }] -} - -module "service" { - source = "github.com/wellcomecollection/terraform-aws-ecs-service//modules/service?ref=v3.13.2" - - service_name = "${var.namespace}-grafana" - cluster_arn = aws_ecs_cluster.cluster.arn - task_definition_arn = module.task_definition.arn - - // This service doesn't need to be up all the time, trying this to keep things frugal - use_fargate_spot = true - - container_name = local.container_name - container_port = local.container_port - - target_group_arn = aws_alb_target_group.grafana_ecs_service.arn - subnets = var.private_subnets - security_group_ids = [ - aws_security_group.service_lb_security_group.id, - aws_security_group.service_egress_security_group.id, - var.efs_security_group_id, - var.ec_privatelink_security_group_id - ] -} diff --git a/monitoring/terraform/stack/variables.tf b/monitoring/terraform/stack/variables.tf deleted file mode 100644 index f658742e..00000000 --- a/monitoring/terraform/stack/variables.tf +++ /dev/null @@ -1,34 +0,0 @@ -variable "namespace" { - type = string -} - -variable "vpc_id" { - type = string -} - -variable "efs_id" { - type = string -} -variable "efs_security_group_id" { - type = string -} -variable "ec_privatelink_security_group_id" { - type = string -} - -variable "domain" { - type = string -} - -variable "public_subnets" { - type = list(string) -} - -variable "private_subnets" { - type = list(string) -} - -# Grafana -variable "grafana_version" { - type = string -}