Initial terraform monitoring addon module (#9864)

This commit is contained in:
Robert Fairburn 2023-02-16 14:30:08 -06:00 committed by GitHub
parent b757e447bc
commit 78fd5f2971
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 460 additions and 0 deletions

View file

@ -0,0 +1,39 @@
# Monitoring addon
This addon enables CloudWatch monitoring for Fleet.
This includes:
- 5XX Errors on ALB
- ECS Service Monitoring
- RDS Monitoring
- Redis Monitoring
- ACM Certificate Monitoring
# Preparation
Some of the `for_each` and `count` expressions in this module cannot be evaluated until the `main` fleet module has been applied.
You will need to run `terraform apply -target module.main` prior to applying monitoring, assuming the use of a configuration matching the example at https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf.
# Example Configuration
This assumes your fleet module is named `main` and is configured as in its default documentation.
See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for details.
```
module "monitoring" {
source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=main"
fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name
fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count
alb_name = module.main.byo-vpc.byo-db.alb.lb_dns_name
alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0]
alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0]
alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix
sns_topic_arn = var.sns_topic_arn
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
acm_certificate_arn = module.acm.acm_certificate_arn
}
```

View file

@ -0,0 +1 @@
# NOTE(review): presumably the terraform-docs configuration for this module;
# header-from appears to name the file used as the header of the generated
# README (the README in this change starts with the same text as .header.md) - confirm.
header-from: .header.md

View file

@ -0,0 +1,88 @@
# Monitoring addon
This addon enables CloudWatch monitoring for Fleet.
This includes:
- 5XX Errors on ALB
- ECS Service Monitoring
- RDS Monitoring
- Redis Monitoring
- ACM Certificate Monitoring
# Preparation
Some of the `for_each` and `count` expressions in this module cannot be evaluated until the `main` fleet module has been applied.
You will need to run `terraform apply -target module.main` prior to applying monitoring, assuming the use of a configuration matching the example at https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf.
# Example Configuration
This assumes your fleet module is named `main` and is configured as in its default documentation.
See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for details.
```
module "monitoring" {
source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=main"
fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name
fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count
alb_name = module.main.byo-vpc.byo-db.alb.lb_dns_name
alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0]
alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0]
alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix
sns_topic_arn = var.sns_topic_arn
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
acm_certificate_arn = module.acm.acm_certificate_arn
}
```
## Requirements
No requirements.
## Providers
| Name | Version |
|------|---------|
| <a name="provider_aws"></a> [aws](#provider\_aws) | n/a |
## Modules
No modules.
## Resources
| Name | Type |
|------|------|
| [aws_cloudwatch_metric_alarm.acm_certificate_expired](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.alb_healthyhosts](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.cpu_utilization_too_high](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.lb](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.redis-current-connections](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.redis-database-memory-percentage](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.redis-replication-lag](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.redis_cpu](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.redis_cpu_engine_utilization](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.target_response_time](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_db_event_subscription.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/db_event_subscription) | resource |
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_acm_certificate_arn"></a> [acm\_certificate\_arn](#input\_acm\_certificate\_arn) | n/a | `string` | `null` | no |
| <a name="input_alb_arn_suffix"></a> [alb\_arn\_suffix](#input\_alb\_arn\_suffix) | n/a | `string` | `null` | no |
| <a name="input_alb_name"></a> [alb\_name](#input\_alb\_name) | n/a | `string` | `null` | no |
| <a name="input_alb_target_group_arn_suffix"></a> [alb\_target\_group\_arn\_suffix](#input\_alb\_target\_group\_arn\_suffix) | n/a | `string` | `null` | no |
| <a name="input_alb_target_group_name"></a> [alb\_target\_group\_name](#input\_alb\_target\_group\_name) | n/a | `string` | `null` | no |
| <a name="input_customer_prefix"></a> [customer\_prefix](#input\_customer\_prefix) | n/a | `string` | `"fleet"` | no |
| <a name="input_fleet_ecs_service_name"></a> [fleet\_ecs\_service\_name](#input\_fleet\_ecs\_service\_name) | n/a | `string` | `null` | no |
| <a name="input_fleet_min_containers"></a> [fleet\_min\_containers](#input\_fleet\_min\_containers) | n/a | `number` | `1` | no |
| <a name="input_mysql_cluster_members"></a> [mysql\_cluster\_members](#input\_mysql\_cluster\_members) | n/a | `list(string)` | `[]` | no |
| <a name="input_redis_cluster_members"></a> [redis\_cluster\_members](#input\_redis\_cluster\_members) | n/a | `list(string)` | `[]` | no |
| <a name="input_sns_topic_arn"></a> [sns\_topic\_arn](#input\_sns\_topic\_arn) | n/a | `string` | n/a | yes |
## Outputs
No outputs.

View file

@ -0,0 +1,269 @@
// Database alarms

// One CPU alarm per entry in var.mysql_cluster_members. Each alarm watches the
// AWS/RDS CPUUtilization metric for that DBInstanceIdentifier and fires when
// the average over a single 300-second period is greater than 80.
resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" {
  for_each = toset(var.mysql_cluster_members)

  alarm_name        = "rds_cpu_utilization_too_high-${var.customer_prefix}-${each.key}"
  alarm_description = "Average database CPU utilization over last 5 minutes too high"

  // What is measured.
  namespace   = "AWS/RDS"
  metric_name = "CPUUtilization"
  dimensions = {
    DBInstanceIdentifier = each.key
  }

  // How it is evaluated.
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = 80

  // The same topic is notified when the alarm fires and when it recovers.
  alarm_actions = [var.sns_topic_arn]
  ok_actions    = [var.sns_topic_arn]
}
// Subscribes var.sns_topic_arn to db-instance events (failover, failure,
// low storage, maintenance, notification, recovery) for every instance listed
// in var.mysql_cluster_members.
//
// The subscription is skipped when no members are given. The emptiness check
// uses length() on purpose: `var.mysql_cluster_members == []` compares a
// list(string) with an empty tuple literal, and Terraform's == only returns
// true for operands of the same type, so that form never matched and the
// subscription was always created.
resource "aws_db_event_subscription" "default" {
  count = length(var.mysql_cluster_members) == 0 ? 0 : 1

  name        = "rds-event-sub-${var.customer_prefix}"
  sns_topic   = var.sns_topic_arn
  source_type = "db-instance"
  source_ids  = var.mysql_cluster_members

  event_categories = [
    "failover",
    "failure",
    "low storage",
    "maintenance",
    "notification",
    "recovery",
  ]
}
// ECS Alarms

// Healthy-host alarm for the Fleet backend behind the load balancer.
// Created only when both the target group and load balancer ARN suffixes are
// provided. Fires when the minimum HealthyHostCount over a single 60-second
// period drops below var.fleet_min_containers.
resource "aws_cloudwatch_metric_alarm" "alb_healthyhosts" {
  count = var.alb_target_group_arn_suffix == null || var.alb_arn_suffix == null ? 0 : 1

  alarm_name        = "backend-healthyhosts-${var.customer_prefix}"
  alarm_description = "This alarm indicates the number of Healthy Fleet hosts is lower than expected. Please investigate the load balancer \"${var.alb_name}\" or the target group \"${var.alb_target_group_name}\" and the fleet backend service \"${var.fleet_ecs_service_name}\""

  // What is measured.
  namespace   = "AWS/ApplicationELB"
  metric_name = "HealthyHostCount"
  dimensions = {
    TargetGroup  = var.alb_target_group_arn_suffix
    LoadBalancer = var.alb_arn_suffix
  }

  // How it is evaluated.
  statistic           = "Minimum"
  period              = "60"
  evaluation_periods  = "1"
  comparison_operator = "LessThanThreshold"
  threshold           = var.fleet_min_containers

  // Notifications go to the same topic on alarm and on recovery.
  actions_enabled = "true"
  alarm_actions   = [var.sns_topic_arn]
  ok_actions      = [var.sns_topic_arn]
}
// alarm for target response time (anomaly detection)
//
// Created only when both the target group and load balancer ARN suffixes are
// provided. The alarm compares the p99 of TargetResponseTime (120-second
// periods) with the upper edge of the anomaly detection band built from the
// same metric, and fires after 2 evaluation periods above the band.
resource "aws_cloudwatch_metric_alarm" "target_response_time" {
  count = var.alb_target_group_arn_suffix == null || var.alb_arn_suffix == null ? 0 : 1

  alarm_name          = "backend-target-response-time-${var.customer_prefix}"
  alarm_description   = "This alarm indicates the Fleet server response time is greater than it usually is. Please investigate the ecs service \"${var.fleet_ecs_service_name}\" because the backend might need to be scaled up."
  comparison_operator = "GreaterThanUpperThreshold"
  evaluation_periods  = "2"
  threshold_metric_id = "e1" // the band produced by the "e1" query is the threshold

  alarm_actions             = [var.sns_topic_arn]
  ok_actions                = [var.sns_topic_arn]
  insufficient_data_actions = []

  // Expected range for m1.
  metric_query {
    id          = "e1"
    expression  = "ANOMALY_DETECTION_BAND(m1)"
    label       = "TargetResponseTime (Expected)"
    return_data = "true"
  }

  // Observed response time.
  metric_query {
    id          = "m1"
    return_data = "true"

    metric {
      metric_name = "TargetResponseTime"
      namespace   = "AWS/ApplicationELB"
      period      = "120"
      stat        = "p99"
      // No `unit` filter here on purpose. TargetResponseTime is reported in
      // seconds; the previous `unit = "Count"` did not match the published
      // unit, which makes CloudWatch return no datapoints and leaves the
      // alarm stuck in INSUFFICIENT_DATA. AWS recommends omitting the unit.

      dimensions = {
        TargetGroup  = var.alb_target_group_arn_suffix
        LoadBalancer = var.alb_arn_suffix
      }
    }
  }
}
// 5XX alarms for the load balancer: one alarm for HTTPCode_ELB_5XX_Count and
// one for HTTPCode_Target_5XX_Count. Each fires when the sum over a single
// 120-second period is greater than 0; periods without data are treated as
// not breaching.
//
// The alarms are skipped unless both ALB suffixes are set. The only dimension
// used is var.alb_arn_suffix, so it has to be part of the guard (previously
// only the target group suffix was checked, which allowed a null LoadBalancer
// dimension). The target group check is kept so the guard matches the other
// ALB alarms in this module and no new alarms appear for existing callers.
resource "aws_cloudwatch_metric_alarm" "lb" {
  for_each = var.alb_target_group_arn_suffix == null || var.alb_arn_suffix == null ? toset([]) : toset(["HTTPCode_ELB_5XX_Count", "HTTPCode_Target_5XX_Count"])

  alarm_name          = "${var.customer_prefix}-lb-${each.key}"
  alarm_description   = "This alarm indicates there are an abnormal amount of 5XX responses. Either the lb cannot talk with the Fleet backend target or Fleet is returning an error."
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "1"
  metric_name         = each.key
  namespace           = "AWS/ApplicationELB"
  period              = "120"
  statistic           = "Sum"
  threshold           = "0"
  treat_missing_data  = "notBreaching"

  alarm_actions = [var.sns_topic_arn]
  ok_actions    = [var.sns_topic_arn]

  dimensions = {
    LoadBalancer = var.alb_arn_suffix
  }
}
// Elasticache (redis) alerts https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/CacheMetrics.WhichShouldIMonitor.html

// Host CPU alarm, one per entry in var.redis_cluster_members. Fires when the
// average CPUUtilization of that CacheClusterId over a single 300-second
// period is greater than 70.
resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
  for_each = toset(var.redis_cluster_members)

  alarm_name        = "redis-cpu-utilization-${each.key}-${var.customer_prefix}"
  alarm_description = "Redis cluster CPU utilization node ${each.key}"

  // What is measured.
  namespace   = "AWS/ElastiCache"
  metric_name = "CPUUtilization"
  dimensions = {
    CacheClusterId = each.key
  }

  // How it is evaluated.
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "70"

  alarm_actions = [var.sns_topic_arn]
  ok_actions    = [var.sns_topic_arn]
}
// Engine CPU alarm, one per entry in var.redis_cluster_members. Fires when the
// average EngineCPUUtilization of that CacheClusterId over a single
// 300-second period is greater than 25.
resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" {
  for_each = toset(var.redis_cluster_members)

  alarm_name        = "redis-cpu-engine-utilization-${each.key}-${var.customer_prefix}"
  alarm_description = "Redis cluster CPU Engine utilization node ${each.key}"

  // What is measured.
  namespace   = "AWS/ElastiCache"
  metric_name = "EngineCPUUtilization"
  dimensions = {
    CacheClusterId = each.key
  }

  // How it is evaluated.
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "25"

  alarm_actions = [var.sns_topic_arn]
  ok_actions    = [var.sns_topic_arn]
}
// Memory usage alarm, one per entry in var.redis_cluster_members. Fires when
// the average DatabaseMemoryUsagePercentage of that CacheClusterId over a
// single 300-second period is greater than 80.
resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" {
  for_each = toset(var.redis_cluster_members)

  alarm_name        = "redis-database-memory-percentage-${each.key}-${var.customer_prefix}"
  alarm_description = "Percentage of the memory available for the cluster that is in use. This is calculated using used_memory/maxmemory."

  // What is measured.
  namespace   = "AWS/ElastiCache"
  metric_name = "DatabaseMemoryUsagePercentage"
  dimensions = {
    CacheClusterId = each.key
  }

  // How it is evaluated.
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "80"

  alarm_actions = [var.sns_topic_arn]
  ok_actions    = [var.sns_topic_arn]
}
// Connection-count anomaly alarm, one per entry in var.redis_cluster_members.
// The average CurrConnections (600-second periods) is compared with an
// anomaly detection band of width 20 built from the same metric; the alarm
// fires after 5 evaluation periods outside the band in either direction.
resource "aws_cloudwatch_metric_alarm" "redis-current-connections" {
  for_each = toset(var.redis_cluster_members)

  alarm_name        = "redis-current-connections-${each.key}-${var.customer_prefix}"
  alarm_description = "Redis current connections for node ${each.key}"

  threshold_metric_id = "e1" // the band produced by the "e1" query is the threshold
  comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold"
  evaluation_periods  = "5"

  ok_actions                = [var.sns_topic_arn]
  alarm_actions             = [var.sns_topic_arn]
  insufficient_data_actions = []

  // Expected range for m1.
  metric_query {
    id          = "e1"
    label       = "Current Connections (Expected)"
    expression  = "ANOMALY_DETECTION_BAND(m1,20)"
    return_data = "true"
  }

  // Observed connection count.
  metric_query {
    id          = "m1"
    return_data = "true"

    metric {
      namespace   = "AWS/ElastiCache"
      metric_name = "CurrConnections"
      stat        = "Average"
      unit        = "Count"
      period      = "600"

      dimensions = {
        CacheClusterId = each.key
      }
    }
  }
}
// Replication-lag anomaly alarm, one per entry in var.redis_cluster_members.
// The p90 of ReplicationLag (300-second periods) is compared with the upper
// edge of the anomaly detection band built from the same metric; the alarm
// fires after 3 evaluation periods above the band.
//
// The alarm name includes each.key, like the other Redis alarms in this
// module. Without it every for_each instance shared one alarm name, so with
// more than one member the instances overwrote each other.
resource "aws_cloudwatch_metric_alarm" "redis-replication-lag" {
  for_each = toset(var.redis_cluster_members)

  alarm_name          = "redis-replication-lag-${each.key}-${var.customer_prefix}"
  alarm_description   = "This metric is only applicable for a node running as a read replica. It represents how far behind, in seconds, the replica is in applying changes from the primary node. For Redis engine version 5.0.6 onwards, the lag can be measured in milliseconds."
  comparison_operator = "GreaterThanUpperThreshold"
  evaluation_periods  = "3"
  threshold_metric_id = "e1" // the band produced by the "e1" query is the threshold

  alarm_actions             = [var.sns_topic_arn]
  ok_actions                = [var.sns_topic_arn]
  insufficient_data_actions = []

  // Expected range for m1.
  metric_query {
    id          = "e1"
    expression  = "ANOMALY_DETECTION_BAND(m1)"
    label       = "ReplicationLag (expected)"
    return_data = "true"
  }

  // Observed replication lag.
  metric_query {
    id          = "m1"
    return_data = "true"

    metric {
      metric_name = "ReplicationLag"
      namespace   = "AWS/ElastiCache"
      period      = "300"
      stat        = "p90"

      dimensions = {
        CacheClusterId = each.key
      }
    }
  }
}
// ACM Certificate Manager

// Certificate expiry alarm. Created only when var.acm_certificate_arn is set.
// Fires when the daily average of DaysToExpiry for that certificate is less
// than 30.
resource "aws_cloudwatch_metric_alarm" "acm_certificate_expired" {
  count = var.acm_certificate_arn == null ? 0 : 1

  alarm_name        = "acm-cert-expiry-${var.customer_prefix}"
  alarm_description = "ACM Certificate will expire soon"

  // What is measured.
  namespace   = "AWS/CertificateManager"
  metric_name = "DaysToExpiry"
  dimensions = {
    CertificateArn = var.acm_certificate_arn
  }

  // How it is evaluated.
  statistic           = "Average"
  period              = "86400" // 1 day in seconds
  evaluation_periods  = "1"
  comparison_operator = "LessThanThreshold"
  threshold           = 30 // days

  // Notifications go to the same topic on alarm and on recovery.
  actions_enabled = "true"
  alarm_actions   = [var.sns_topic_arn]
  ok_actions      = [var.sns_topic_arn]
}

View file

@ -0,0 +1,55 @@
variable "customer_prefix" {
  description = "Prefix embedded in the names of the CloudWatch alarms and the RDS event subscription created by this module."
  type        = string
  default     = "fleet"
}
variable "fleet_ecs_service_name" {
  description = "Name of the Fleet ECS service. Only interpolated into alarm descriptions to point responders at the service."
  type        = string
  default     = null
}
variable "fleet_min_containers" {
  description = "Threshold for the healthy-host alarm: it fires when the ALB HealthyHostCount drops below this number."
  type        = number
  default     = 1
}
variable "alb_name" {
  description = "Identifier of the load balancer. Only interpolated into the healthy-host alarm description."
  type        = string
  default     = null
}
variable "alb_target_group_name" {
  description = "Name of the ALB target group. Only interpolated into the healthy-host alarm description."
  type        = string
  default     = null
}
variable "alb_target_group_arn_suffix" {
  description = "ARN suffix of the ALB target group, used as the TargetGroup dimension of the ALB alarms. When null, none of the ALB alarms are created."
  type        = string
  default     = null
}
variable "alb_arn_suffix" {
  description = "ARN suffix of the load balancer, used as the LoadBalancer dimension of the ALB alarms. When null, the healthy-host and target-response-time alarms are not created."
  type        = string
  default     = null
}
variable "sns_topic_arn" {
  description = "ARN of the SNS topic that receives alarm and OK notifications from every alarm in this module, as well as the RDS events. Required."
  type        = string
}
variable "mysql_cluster_members" {
  description = "RDS DB instance identifiers to monitor. Each entry gets a CPU utilization alarm, and all entries are used as source IDs of the RDS event subscription."
  type        = list(string)
  default     = []
}
variable "redis_cluster_members" {
  description = "ElastiCache cache cluster IDs to monitor. Each entry gets CPU, engine CPU, memory usage, current connections and replication lag alarms."
  type        = list(string)
  default     = []
}
variable "acm_certificate_arn" {
  description = "ARN of the ACM certificate to watch for expiry. When null, the certificate expiry alarm is not created."
  type        = string
  default     = null
}

View file

@ -1,3 +1,11 @@
// Re-exports the entire byo-db child module so callers can reach its outputs.
output "byo-db" {
  value = module.byo-db
}
// Re-exports the entire rds child module so callers can reach its outputs.
output "rds" {
  value = module.rds
}
// Re-exports the entire redis child module so callers can reach its outputs.
output "redis" {
  value = module.redis
}