Terraform monitoring alert thresholds update (#18790)

This allows customizing the alert thresholds for the HTTP ELB and target
5xx error alarms, so that a single 5xx response no longer has to trigger
an alert if we decide the thresholds should be higher. The defaults (a
120-second period with a threshold of 0) are used if nothing is passed in,
but alternatives can be specified per ALB by setting a threshold and a
period for each alert.

closes #18776

---------

Co-authored-by: Luke Heath <luke@fleetdm.com>
This commit is contained in:
Robert Fairburn 2024-05-07 11:17:01 -05:00 committed by GitHub
parent 434239e5f9
commit ef0414d1ec
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 28 additions and 5 deletions

View file

@ -138,7 +138,7 @@ No modules.
| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_acm_certificate_arn"></a> [acm\_certificate\_arn](#input\_acm\_certificate\_arn) | n/a | `string` | `null` | no |
| <a name="input_albs"></a> [albs](#input\_albs) | n/a | <pre>list(object({<br> name = string<br> arn_suffix = string<br> target_group_name = string<br> target_group_arn_suffix = string<br> min_containers = optional(string, 1)<br> ecs_service_name = string<br> }))</pre> | `[]` | no |
| <a name="input_albs"></a> [albs](#input\_albs) | n/a | <pre>list(object({<br> name = string<br> arn_suffix = string<br> target_group_name = string<br> target_group_arn_suffix = string<br> min_containers = optional(string, 1)<br> ecs_service_name = string<br> alert_thresholds = optional(<br> object({<br> HTTPCode_ELB_5XX_Count = object({<br> period = number<br> threshold = number<br> })<br> HTTPCode_Target_5XX_Count = object({<br> period = number<br> threshold = number<br> })<br> }),<br> {<br> HTTPCode_ELB_5XX_Count = {<br> period = 120<br> threshold = 0<br> },<br> HTTPCode_Target_5XX_Count = {<br> period = 120<br> threshold = 0<br> }<br> }<br> )<br> }))</pre> | `[]` | no |
| <a name="input_cron_monitoring"></a> [cron\_monitoring](#input\_cron\_monitoring) | n/a | <pre>object({<br> mysql_host = string<br> mysql_database = string<br> mysql_user = string<br> mysql_password_secret_name = string<br> vpc_id = string<br> subnet_ids = list(string)<br> rds_security_group_id = string<br> delay_tolerance = string<br> run_interval = string<br> log_retention_in_days = optional(number, 7)<br> })</pre> | `null` | no |
| <a name="input_customer_prefix"></a> [customer\_prefix](#input\_customer\_prefix) | n/a | `string` | `"fleet"` | no |
| <a name="input_default_sns_topic_arns"></a> [default\_sns\_topic\_arns](#input\_default\_sns\_topic\_arns) | n/a | `list(string)` | `[]` | no |

View file

@ -37,7 +37,7 @@ resource "aws_db_event_subscription" "default" {
}
locals {
alb_map = {for k, v in var.albs: k => v}
alb_map = { for k, v in var.albs : k => v }
}
@ -102,7 +102,7 @@ resource "aws_cloudwatch_metric_alarm" "target_response_time" {
locals {
http_5xx_alert_names = ["HTTPCode_ELB_5XX_Count", "HTTPCode_Target_5XX_Count"]
http_5xx_alerts_list = flatten([for alert in local.http_5xx_alert_names : [for alb in var.albs : merge(alb, { "alert" : alert })]])
http_5xx_alerts = {for k, v in local.http_5xx_alerts_list : k => v}
http_5xx_alerts = { for k, v in local.http_5xx_alerts_list : k => v }
}
@ -113,9 +113,9 @@ resource "aws_cloudwatch_metric_alarm" "lb" {
evaluation_periods = "1"
metric_name = each.value.alert
namespace = "AWS/ApplicationELB"
period = "120"
period = each.value.alert_thresholds[each.value.alert].period
statistic = "Sum"
threshold = "0"
threshold = each.value.alert_thresholds[each.value.alert].threshold
alarm_description = "This alarm indicates there are an abnormal amount of 5XX responses. Either the lb cannot talk with the Fleet backend target or Fleet is returning an error."
alarm_actions = lookup(var.sns_topic_arns_map, "alb_httpcode_5xx", var.default_sns_topic_arns)
ok_actions = lookup(var.sns_topic_arns_map, "alb_httpcode_5xx", var.default_sns_topic_arns)

View file

@ -16,10 +16,33 @@ variable "albs" {
target_group_arn_suffix = string
min_containers = optional(string, 1)
ecs_service_name = string
alert_thresholds = optional(
object({
HTTPCode_ELB_5XX_Count = object({
period = number
threshold = number
})
HTTPCode_Target_5XX_Count = object({
period = number
threshold = number
})
}),
{
HTTPCode_ELB_5XX_Count = {
period = 120
threshold = 0
},
HTTPCode_Target_5XX_Count = {
period = 120
threshold = 0
}
}
)
}))
default = []
}
variable "default_sns_topic_arns" {
type = list(string)
default = []