From e760ce4ac5ef2a64d93c89045c401d7100fb30d3 Mon Sep 17 00:00:00 2001
From: Robert Fairburn <8029478+rfairburn@users.noreply.github.com>
Date: Wed, 22 Feb 2023 21:25:25 -0600
Subject: [PATCH] Separate monitoring notifications per alert (#10032)
---
terraform/addons/monitoring/.header.md | 2 +-
terraform/addons/monitoring/README.md | 5 +--
terraform/addons/monitoring/main.tf | 46 ++++++++++++------------
terraform/addons/monitoring/variables.tf | 10 ++++--
4 files changed, 35 insertions(+), 28 deletions(-)
diff --git a/terraform/addons/monitoring/.header.md b/terraform/addons/monitoring/.header.md
index 59c57b5b74..5de045a3a5 100644
--- a/terraform/addons/monitoring/.header.md
+++ b/terraform/addons/monitoring/.header.md
@@ -30,7 +30,7 @@ module "monitoring" {
alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0]
alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0]
alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix
- sns_topic_arn = var.sns_topic_arn
+ default_sns_topic_arns = [var.sns_topic_arn]
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
acm_certificate_arn = module.acm.acm_certificate_arn
diff --git a/terraform/addons/monitoring/README.md b/terraform/addons/monitoring/README.md
index a775e392a5..fbf3365fdb 100644
--- a/terraform/addons/monitoring/README.md
+++ b/terraform/addons/monitoring/README.md
@@ -30,7 +30,7 @@ module "monitoring" {
alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0]
alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0]
alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix
- sns_topic_arn = var.sns_topic_arn
+ default_sns_topic_arns = [var.sns_topic_arn]
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
acm_certificate_arn = module.acm.acm_certificate_arn
@@ -77,11 +77,12 @@ No modules.
| [alb\_target\_group\_arn\_suffix](#input\_alb\_target\_group\_arn\_suffix) | n/a | `string` | `null` | no |
| [alb\_target\_group\_name](#input\_alb\_target\_group\_name) | n/a | `string` | `null` | no |
| [customer\_prefix](#input\_customer\_prefix) | n/a | `string` | `"fleet"` | no |
+| [default\_sns\_topic\_arns](#input\_default\_sns\_topic\_arns) | n/a | `list(string)` | `[]` | no |
| [fleet\_ecs\_service\_name](#input\_fleet\_ecs\_service\_name) | n/a | `string` | `null` | no |
| [fleet\_min\_containers](#input\_fleet\_min\_containers) | n/a | `number` | `1` | no |
| [mysql\_cluster\_members](#input\_mysql\_cluster\_members) | n/a | `list(string)` | `[]` | no |
| [redis\_cluster\_members](#input\_redis\_cluster\_members) | n/a | `list(string)` | `[]` | no |
-| [sns\_topic\_arn](#input\_sns\_topic\_arn) | n/a | `string` | n/a | yes |
+| [sns\_topic\_arns\_map](#input\_sns\_topic\_arns\_map) | n/a | `map(list(string))` | `{}` | no |
## Outputs
diff --git a/terraform/addons/monitoring/main.tf b/terraform/addons/monitoring/main.tf
index 835b59dc46..2911a290eb 100644
--- a/terraform/addons/monitoring/main.tf
+++ b/terraform/addons/monitoring/main.tf
@@ -10,17 +10,17 @@ resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" {
statistic = "Average"
threshold = 80
alarm_description = "Average database CPU utilization over last 5 minutes too high"
- alarm_actions = [var.sns_topic_arn]
- ok_actions = [var.sns_topic_arn]
+ alarm_actions = lookup(var.sns_topic_arns_map, "rds_cpu_utilization_too_high", lookup(var.sns_topic_arns_map, "rds_cpu_untilizaton_too_high", var.default_sns_topic_arns)) # correctly spelled key wins; the misspelled key is still honored, then the default list
+ ok_actions = lookup(var.sns_topic_arns_map, "rds_cpu_utilization_too_high", lookup(var.sns_topic_arns_map, "rds_cpu_untilizaton_too_high", var.default_sns_topic_arns)) # same resolution order as alarm_actions
dimensions = {
DBInstanceIdentifier = each.key
}
}
resource "aws_db_event_subscription" "default" {
- count = var.mysql_cluster_members == [] ? 0 : 1
+ count = length(var.mysql_cluster_members) == 0 || length(concat(lookup(var.sns_topic_arns_map, "rds_db_event_subscription", []), var.default_sns_topic_arns)) == 0 ? 0 : 1 # create only when there are DB members AND sns_topic below can resolve at least one ARN (an empty override list with no defaults must not yield count = 1)
name = "rds-event-sub-${var.customer_prefix}"
- sns_topic = var.sns_topic_arn
+ sns_topic = try(var.sns_topic_arns_map.rds_db_event_subscription[0], var.default_sns_topic_arns[0])
source_type = "db-instance"
source_ids = var.mysql_cluster_members
@@ -49,8 +49,8 @@ resource "aws_cloudwatch_metric_alarm" "alb_healthyhosts" {
threshold = var.fleet_min_containers
alarm_description = "This alarm indicates the number of Healthy Fleet hosts is lower than expected. Please investigate the load balancer \"${var.alb_name}\" or the target group \"${var.alb_target_group_name}\" and the fleet backend service \"${var.fleet_ecs_service_name}\""
actions_enabled = "true"
- alarm_actions = [var.sns_topic_arn]
- ok_actions = [var.sns_topic_arn]
+ alarm_actions = lookup(var.sns_topic_arns_map, "alb_healthyhosts", lookup(var.sns_topic_arns_map, "alb_helthyhosts", var.default_sns_topic_arns)) # correctly spelled key wins; the misspelled key is still honored, then the default list
+ ok_actions = lookup(var.sns_topic_arns_map, "alb_healthyhosts", lookup(var.sns_topic_arns_map, "alb_helthyhosts", var.default_sns_topic_arns)) # same resolution order as alarm_actions
dimensions = {
TargetGroup = var.alb_target_group_arn_suffix
LoadBalancer = var.alb_arn_suffix
@@ -65,8 +65,8 @@ resource "aws_cloudwatch_metric_alarm" "target_response_time" {
evaluation_periods = "2"
threshold_metric_id = "e1"
alarm_description = "This alarm indicates the Fleet server response time is greater than it usually is. Please investigate the ecs service \"${var.fleet_ecs_service_name}\" because the backend might need to be scaled up."
- alarm_actions = [var.sns_topic_arn]
- ok_actions = [var.sns_topic_arn]
+ alarm_actions = lookup(var.sns_topic_arns_map, "backend_response_time", var.default_sns_topic_arns)
+ ok_actions = lookup(var.sns_topic_arns_map, "backend_response_time", var.default_sns_topic_arns)
insufficient_data_actions = []
metric_query {
@@ -105,8 +105,8 @@ resource "aws_cloudwatch_metric_alarm" "lb" {
statistic = "Sum"
threshold = "0"
alarm_description = "This alarm indicates there are an abnormal amount of 5XX responses. Either the lb cannot talk with the Fleet backend target or Fleet is returning an error."
- alarm_actions = [var.sns_topic_arn]
- ok_actions = [var.sns_topic_arn]
+ alarm_actions = lookup(var.sns_topic_arns_map, "alb_httpcode_5xx", var.default_sns_topic_arns)
+ ok_actions = lookup(var.sns_topic_arns_map, "alb_httpcode_5xx", var.default_sns_topic_arns)
treat_missing_data = "notBreaching"
dimensions = {
LoadBalancer = var.alb_arn_suffix
@@ -125,8 +125,8 @@ resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
namespace = "AWS/ElastiCache"
period = "300"
statistic = "Average"
- alarm_actions = [var.sns_topic_arn]
- ok_actions = [var.sns_topic_arn]
+ alarm_actions = lookup(var.sns_topic_arns_map, "redis_cpu_utilization", var.default_sns_topic_arns)
+ ok_actions = lookup(var.sns_topic_arns_map, "redis_cpu_utilization", var.default_sns_topic_arns)
threshold = "70"
@@ -146,8 +146,8 @@ resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" {
namespace = "AWS/ElastiCache"
period = "300"
statistic = "Average"
- alarm_actions = [var.sns_topic_arn]
- ok_actions = [var.sns_topic_arn]
+ alarm_actions = lookup(var.sns_topic_arns_map, "redis_cpu_engine_utilization", var.default_sns_topic_arns)
+ ok_actions = lookup(var.sns_topic_arns_map, "redis_cpu_engine_utilization", var.default_sns_topic_arns)
threshold = "25"
@@ -167,8 +167,8 @@ resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" {
namespace = "AWS/ElastiCache"
period = "300"
statistic = "Average"
- alarm_actions = [var.sns_topic_arn]
- ok_actions = [var.sns_topic_arn]
+ alarm_actions = lookup(var.sns_topic_arns_map, "redis_database_memory_percentage", var.default_sns_topic_arns)
+ ok_actions = lookup(var.sns_topic_arns_map, "redis_database_memory_percentage", var.default_sns_topic_arns)
threshold = "80"
@@ -185,8 +185,8 @@ resource "aws_cloudwatch_metric_alarm" "redis-current-connections" {
comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold"
evaluation_periods = "5"
threshold_metric_id = "e1"
- alarm_actions = [var.sns_topic_arn]
- ok_actions = [var.sns_topic_arn]
+ alarm_actions = lookup(var.sns_topic_arns_map, "redis_current_connections", var.default_sns_topic_arns)
+ ok_actions = lookup(var.sns_topic_arns_map, "redis_current_connections", var.default_sns_topic_arns)
insufficient_data_actions = []
metric_query {
@@ -215,13 +215,13 @@ resource "aws_cloudwatch_metric_alarm" "redis-current-connections" {
resource "aws_cloudwatch_metric_alarm" "redis-replication-lag" {
for_each = toset(var.redis_cluster_members)
- alarm_name = "redis-replication-lag-${var.customer_prefix}"
+ alarm_name = "redis-replication-lag-${each.key}-${var.customer_prefix}"
alarm_description = "This metric is only applicable for a node running as a read replica. It represents how far behind, in seconds, the replica is in applying changes from the primary node. For Redis engine version 5.0.6 onwards, the lag can be measured in milliseconds."
comparison_operator = "GreaterThanUpperThreshold"
evaluation_periods = "3"
threshold_metric_id = "e1"
- alarm_actions = [var.sns_topic_arn]
- ok_actions = [var.sns_topic_arn]
+ alarm_actions = lookup(var.sns_topic_arns_map, "redis_replication_lag", var.default_sns_topic_arns)
+ ok_actions = lookup(var.sns_topic_arns_map, "redis_replication_lag", var.default_sns_topic_arns)
insufficient_data_actions = []
metric_query {
@@ -260,8 +260,8 @@ resource "aws_cloudwatch_metric_alarm" "acm_certificate_expired" {
metric_name = "DaysToExpiry"
actions_enabled = "true"
alarm_description = "ACM Certificate will expire soon"
- alarm_actions = [var.sns_topic_arn]
- ok_actions = [var.sns_topic_arn]
+ alarm_actions = lookup(var.sns_topic_arns_map, "acm_certificate_expired", var.default_sns_topic_arns)
+ ok_actions = lookup(var.sns_topic_arns_map, "acm_certificate_expired", var.default_sns_topic_arns)
dimensions = {
CertificateArn = var.acm_certificate_arn
diff --git a/terraform/addons/monitoring/variables.tf b/terraform/addons/monitoring/variables.tf
index 1b1838dbde..5b2efdcbc4 100644
--- a/terraform/addons/monitoring/variables.tf
+++ b/terraform/addons/monitoring/variables.tf
@@ -33,8 +33,14 @@ variable "alb_arn_suffix" {
default = null
}
-variable "sns_topic_arn" {
- type = string
+variable "default_sns_topic_arns" { # fallback SNS topic ARNs: used by every alarm whose key is absent from sns_topic_arns_map
+ type = list(string)
+ default = []
+}
+# Per-alarm overrides: maps an alarm key (e.g. "redis_cpu_utilization", "acm_certificate_expired") to the SNS topic ARNs used for that alarm's alarm/ok actions.
+variable "sns_topic_arns_map" {
+ type = map(list(string))
+ default = {}
 }
variable "mysql_cluster_members" {