Separate monitoring notifications per alert (#10032)

This commit is contained in:
Robert Fairburn 2023-02-22 21:25:25 -06:00 committed by GitHub
parent c1a7b565f1
commit e760ce4ac5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 35 additions and 28 deletions

View file

@ -30,7 +30,7 @@ module "monitoring" {
alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0]
alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0]
alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix
sns_topic_arn = var.sns_topic_arn
default_sns_topic_arns = [var.sns_topic_arn]
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
acm_certificate_arn = module.acm.acm_certificate_arn

View file

@ -30,7 +30,7 @@ module "monitoring" {
alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0]
alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0]
alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix
sns_topic_arn = var.sns_topic_arn
default_sns_topic_arns = [var.sns_topic_arn]
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
acm_certificate_arn = module.acm.acm_certificate_arn
@ -77,11 +77,12 @@ No modules.
| <a name="input_alb_target_group_arn_suffix"></a> [alb\_target\_group\_arn\_suffix](#input\_alb\_target\_group\_arn\_suffix) | n/a | `string` | `null` | no |
| <a name="input_alb_target_group_name"></a> [alb\_target\_group\_name](#input\_alb\_target\_group\_name) | n/a | `string` | `null` | no |
| <a name="input_customer_prefix"></a> [customer\_prefix](#input\_customer\_prefix) | n/a | `string` | `"fleet"` | no |
| <a name="input_default_sns_topic_arns"></a> [default\_sns\_topic\_arns](#input\_default\_sns\_topic\_arns) | n/a | `list(string)` | `[]` | no |
| <a name="input_fleet_ecs_service_name"></a> [fleet\_ecs\_service\_name](#input\_fleet\_ecs\_service\_name) | n/a | `string` | `null` | no |
| <a name="input_fleet_min_containers"></a> [fleet\_min\_containers](#input\_fleet\_min\_containers) | n/a | `number` | `1` | no |
| <a name="input_mysql_cluster_members"></a> [mysql\_cluster\_members](#input\_mysql\_cluster\_members) | n/a | `list(string)` | `[]` | no |
| <a name="input_redis_cluster_members"></a> [redis\_cluster\_members](#input\_redis\_cluster\_members) | n/a | `list(string)` | `[]` | no |
| <a name="input_sns_topic_arn"></a> [sns\_topic\_arn](#input\_sns\_topic\_arn) | n/a | `string` | n/a | yes |
| <a name="input_sns_topic_arns_map"></a> [sns\_topic\_arns\_map](#input\_sns\_topic\_arns\_map) | n/a | `map(list(string))` | `{}` | no |
## Outputs

View file

@ -10,17 +10,17 @@ resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" {
statistic = "Average"
threshold = 80
alarm_description = "Average database CPU utilization over last 5 minutes too high"
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
alarm_actions = lookup(var.sns_topic_arns_map, "rds_cpu_untilizaton_too_high", var.default_sns_topic_arns)
ok_actions = lookup(var.sns_topic_arns_map, "rds_cpu_untilizaton_too_high", var.default_sns_topic_arns)
dimensions = {
DBInstanceIdentifier = each.key
}
}
resource "aws_db_event_subscription" "default" {
count = var.mysql_cluster_members == [] ? 0 : 1
count = length(var.mysql_cluster_members) == 0 || (contains(keys(var.sns_topic_arns_map), "rds_db_event_subscription") == false && length(var.default_sns_topic_arns) == 0) ? 0 : 1
name = "rds-event-sub-${var.customer_prefix}"
sns_topic = var.sns_topic_arn
sns_topic = try(var.sns_topic_arns_map.rds_db_event_subscription[0], var.default_sns_topic_arns[0])
source_type = "db-instance"
source_ids = var.mysql_cluster_members
@ -49,8 +49,8 @@ resource "aws_cloudwatch_metric_alarm" "alb_healthyhosts" {
threshold = var.fleet_min_containers
alarm_description = "This alarm indicates the number of Healthy Fleet hosts is lower than expected. Please investigate the load balancer \"${var.alb_name}\" or the target group \"${var.alb_target_group_name}\" and the fleet backend service \"${var.fleet_ecs_service_name}\""
actions_enabled = "true"
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
alarm_actions = lookup(var.sns_topic_arns_map, "alb_helthyhosts", var.default_sns_topic_arns)
ok_actions = lookup(var.sns_topic_arns_map, "alb_helthyhosts", var.default_sns_topic_arns)
dimensions = {
TargetGroup = var.alb_target_group_arn_suffix
LoadBalancer = var.alb_arn_suffix
@ -65,8 +65,8 @@ resource "aws_cloudwatch_metric_alarm" "target_response_time" {
evaluation_periods = "2"
threshold_metric_id = "e1"
alarm_description = "This alarm indicates the Fleet server response time is greater than it usually is. Please investigate the ecs service \"${var.fleet_ecs_service_name}\" because the backend might need to be scaled up."
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
alarm_actions = lookup(var.sns_topic_arns_map, "backend_response_time", var.default_sns_topic_arns)
ok_actions = lookup(var.sns_topic_arns_map, "backend_response_time", var.default_sns_topic_arns)
insufficient_data_actions = []
metric_query {
@ -105,8 +105,8 @@ resource "aws_cloudwatch_metric_alarm" "lb" {
statistic = "Sum"
threshold = "0"
alarm_description = "This alarm indicates there are an abnormal amount of 5XX responses. Either the lb cannot talk with the Fleet backend target or Fleet is returning an error."
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
alarm_actions = lookup(var.sns_topic_arns_map, "alb_httpcode_5xx", var.default_sns_topic_arns)
ok_actions = lookup(var.sns_topic_arns_map, "alb_httpcode_5xx", var.default_sns_topic_arns)
treat_missing_data = "notBreaching"
dimensions = {
LoadBalancer = var.alb_arn_suffix
@ -125,8 +125,8 @@ resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
namespace = "AWS/ElastiCache"
period = "300"
statistic = "Average"
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
alarm_actions = lookup(var.sns_topic_arns_map, "redis_cpu_utilization", var.default_sns_topic_arns)
ok_actions = lookup(var.sns_topic_arns_map, "redis_cpu_utilization", var.default_sns_topic_arns)
threshold = "70"
@ -146,8 +146,8 @@ resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" {
namespace = "AWS/ElastiCache"
period = "300"
statistic = "Average"
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
alarm_actions = lookup(var.sns_topic_arns_map, "redis_cpu_engine_utilization", var.default_sns_topic_arns)
ok_actions = lookup(var.sns_topic_arns_map, "redis_cpu_engine_utilization", var.default_sns_topic_arns)
threshold = "25"
@ -167,8 +167,8 @@ resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" {
namespace = "AWS/ElastiCache"
period = "300"
statistic = "Average"
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
alarm_actions = lookup(var.sns_topic_arns_map, "redis_database_memory_percentage", var.default_sns_topic_arns)
ok_actions = lookup(var.sns_topic_arns_map, "redis_database_memory_percentage", var.default_sns_topic_arns)
threshold = "80"
@ -185,8 +185,8 @@ resource "aws_cloudwatch_metric_alarm" "redis-current-connections" {
comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold"
evaluation_periods = "5"
threshold_metric_id = "e1"
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
alarm_actions = lookup(var.sns_topic_arns_map, "redis_current_connections", var.default_sns_topic_arns)
ok_actions = lookup(var.sns_topic_arns_map, "redis_current_connections", var.default_sns_topic_arns)
insufficient_data_actions = []
metric_query {
@ -215,13 +215,13 @@ resource "aws_cloudwatch_metric_alarm" "redis-current-connections" {
resource "aws_cloudwatch_metric_alarm" "redis-replication-lag" {
for_each = toset(var.redis_cluster_members)
alarm_name = "redis-replication-lag-${var.customer_prefix}"
alarm_name = "redis-replication-lag-${each.key}-${var.customer_prefix}"
alarm_description = "This metric is only applicable for a node running as a read replica. It represents how far behind, in seconds, the replica is in applying changes from the primary node. For Redis engine version 5.0.6 onwards, the lag can be measured in milliseconds."
comparison_operator = "GreaterThanUpperThreshold"
evaluation_periods = "3"
threshold_metric_id = "e1"
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
alarm_actions = lookup(var.sns_topic_arns_map, "redis_replication_lag", var.default_sns_topic_arns)
ok_actions = lookup(var.sns_topic_arns_map, "redis_replication_lag", var.default_sns_topic_arns)
insufficient_data_actions = []
metric_query {
@ -260,8 +260,8 @@ resource "aws_cloudwatch_metric_alarm" "acm_certificate_expired" {
metric_name = "DaysToExpiry"
actions_enabled = "true"
alarm_description = "ACM Certificate will expire soon"
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
alarm_actions = lookup(var.sns_topic_arns_map, "acm_certificate_expired", var.default_sns_topic_arns)
ok_actions = lookup(var.sns_topic_arns_map, "acm_certificate_expired", var.default_sns_topic_arns)
dimensions = {
CertificateArn = var.acm_certificate_arn

View file

@ -33,8 +33,14 @@ variable "alb_arn_suffix" {
default = null
}
variable "sns_topic_arn" {
type = string
# Fallback list of SNS topic ARNs for alarm notifications.
# The alarms in this module set alarm_actions / ok_actions via
# lookup(var.sns_topic_arns_map, "<alarm key>", var.default_sns_topic_arns),
# so this list is used whenever the map has no entry for that alarm's key.
# The RDS event subscription falls back to only the first element of this
# list, and is not created when this list is empty and the map has no
# "rds_db_event_subscription" key.
variable "default_sns_topic_arns" {
type = list(string)
default = []
}
# Optional per-alarm override: maps an alarm key to the list of SNS topic
# ARNs that alarm should notify. Alarms whose key is absent fall back to
# var.default_sns_topic_arns.
# Keys are matched verbatim against the strings used in the lookup() calls,
# e.g. "alb_httpcode_5xx", "backend_response_time", "redis_cpu_utilization",
# "redis_replication_lag", "acm_certificate_expired", "rds_db_event_subscription".
# NOTE(review): two keys are spelled "rds_cpu_untilizaton_too_high" and
# "alb_helthyhosts" in the module; callers must use those exact spellings.
variable "sns_topic_arns_map" {
type = map(list(string))
default = {}
}
variable "mysql_cluster_members" {