mirror of
https://github.com/fleetdm/fleet
synced 2026-05-23 17:08:53 +00:00
Initial terraform monitoring addon module (#9864)
This commit is contained in:
parent
b757e447bc
commit
78fd5f2971
6 changed files with 460 additions and 0 deletions
39
terraform/addons/monitoring/.header.md
Normal file
39
terraform/addons/monitoring/.header.md
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
# Monitoring addon
|
||||
This addon enables CloudWatch monitoring for Fleet.
|
||||
|
||||
This includes:
|
||||
|
||||
- 5XX Errors on ALB
|
||||
- ECS Service Monitoring
|
||||
- RDS Monitoring
|
||||
- Redis Monitoring
|
||||
- ACM Certificate Monitoring
|
||||
|
||||
# Preparation
|
||||
|
||||
Some of the `for_each` and `count` values in this module cannot be determined until the `main` fleet module has been applied.
|
||||
|
||||
You will need to run `terraform apply -target module.main` before applying the monitoring addon, assuming the use of a configuration matching the example at https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf.
|
||||
|
||||
# Example Configuration
|
||||
|
||||
This assumes your fleet module is `main` and is configured according to its default documentation.
|
||||
|
||||
See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for details.
|
||||
|
||||
```
|
||||
module "monitoring" {
|
||||
source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=main"
|
||||
fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name
|
||||
fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count
|
||||
alb_name = module.main.byo-vpc.byo-db.alb.lb_dns_name
|
||||
alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0]
|
||||
alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0]
|
||||
alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix
|
||||
sns_topic_arn = var.sns_topic_arn
|
||||
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
|
||||
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
|
||||
acm_certificate_arn = module.acm.acm_certificate_arn
|
||||
}
|
||||
```
|
||||
|
||||
1
terraform/addons/monitoring/.terraform-docs.yml
Normal file
1
terraform/addons/monitoring/.terraform-docs.yml
Normal file
|
|
@ -0,0 +1 @@
|
|||
header-from: .header.md
|
||||
88
terraform/addons/monitoring/README.md
Normal file
88
terraform/addons/monitoring/README.md
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
# Monitoring addon
|
||||
This addon enables CloudWatch monitoring for Fleet.
|
||||
|
||||
This includes:
|
||||
|
||||
- 5XX Errors on ALB
|
||||
- ECS Service Monitoring
|
||||
- RDS Monitoring
|
||||
- Redis Monitoring
|
||||
- ACM Certificate Monitoring
|
||||
|
||||
# Preparation
|
||||
|
||||
Some of the `for_each` and `count` values in this module cannot be determined until the `main` fleet module has been applied.
|
||||
|
||||
You will need to run `terraform apply -target module.main` before applying the monitoring addon, assuming the use of a configuration matching the example at https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf.
|
||||
|
||||
# Example Configuration
|
||||
|
||||
This assumes your fleet module is `main` and is configured according to its default documentation.
|
||||
|
||||
See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for details.
|
||||
|
||||
```
|
||||
module "monitoring" {
|
||||
source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=main"
|
||||
fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name
|
||||
fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count
|
||||
alb_name = module.main.byo-vpc.byo-db.alb.lb_dns_name
|
||||
alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0]
|
||||
alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0]
|
||||
alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix
|
||||
sns_topic_arn = var.sns_topic_arn
|
||||
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
|
||||
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
|
||||
acm_certificate_arn = module.acm.acm_certificate_arn
|
||||
}
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
No requirements.
|
||||
|
||||
## Providers
|
||||
|
||||
| Name | Version |
|
||||
|------|---------|
|
||||
| <a name="provider_aws"></a> [aws](#provider\_aws) | n/a |
|
||||
|
||||
## Modules
|
||||
|
||||
No modules.
|
||||
|
||||
## Resources
|
||||
|
||||
| Name | Type |
|
||||
|------|------|
|
||||
| [aws_cloudwatch_metric_alarm.acm_certificate_expired](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
|
||||
| [aws_cloudwatch_metric_alarm.alb_healthyhosts](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
|
||||
| [aws_cloudwatch_metric_alarm.cpu_utilization_too_high](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
|
||||
| [aws_cloudwatch_metric_alarm.lb](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
|
||||
| [aws_cloudwatch_metric_alarm.redis-current-connections](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
|
||||
| [aws_cloudwatch_metric_alarm.redis-database-memory-percentage](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
|
||||
| [aws_cloudwatch_metric_alarm.redis-replication-lag](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
|
||||
| [aws_cloudwatch_metric_alarm.redis_cpu](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
|
||||
| [aws_cloudwatch_metric_alarm.redis_cpu_engine_utilization](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
|
||||
| [aws_cloudwatch_metric_alarm.target_response_time](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
|
||||
| [aws_db_event_subscription.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/db_event_subscription) | resource |
|
||||
|
||||
## Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|------|---------|:--------:|
|
||||
| <a name="input_acm_certificate_arn"></a> [acm\_certificate\_arn](#input\_acm\_certificate\_arn) | n/a | `string` | `null` | no |
|
||||
| <a name="input_alb_arn_suffix"></a> [alb\_arn\_suffix](#input\_alb\_arn\_suffix) | n/a | `string` | `null` | no |
|
||||
| <a name="input_alb_name"></a> [alb\_name](#input\_alb\_name) | n/a | `string` | `null` | no |
|
||||
| <a name="input_alb_target_group_arn_suffix"></a> [alb\_target\_group\_arn\_suffix](#input\_alb\_target\_group\_arn\_suffix) | n/a | `string` | `null` | no |
|
||||
| <a name="input_alb_target_group_name"></a> [alb\_target\_group\_name](#input\_alb\_target\_group\_name) | n/a | `string` | `null` | no |
|
||||
| <a name="input_customer_prefix"></a> [customer\_prefix](#input\_customer\_prefix) | n/a | `string` | `"fleet"` | no |
|
||||
| <a name="input_fleet_ecs_service_name"></a> [fleet\_ecs\_service\_name](#input\_fleet\_ecs\_service\_name) | n/a | `string` | `null` | no |
|
||||
| <a name="input_fleet_min_containers"></a> [fleet\_min\_containers](#input\_fleet\_min\_containers) | n/a | `number` | `1` | no |
|
||||
| <a name="input_mysql_cluster_members"></a> [mysql\_cluster\_members](#input\_mysql\_cluster\_members) | n/a | `list(string)` | `[]` | no |
|
||||
| <a name="input_redis_cluster_members"></a> [redis\_cluster\_members](#input\_redis\_cluster\_members) | n/a | `list(string)` | `[]` | no |
|
||||
| <a name="input_sns_topic_arn"></a> [sns\_topic\_arn](#input\_sns\_topic\_arn) | n/a | `string` | n/a | yes |
|
||||
|
||||
## Outputs
|
||||
|
||||
No outputs.
|
||||
269
terraform/addons/monitoring/main.tf
Normal file
269
terraform/addons/monitoring/main.tf
Normal file
|
|
@ -0,0 +1,269 @@
|
|||
// Database alarms
|
||||
// One CPU alarm per RDS instance listed in var.mysql_cluster_members.
// Fires when the 5-minute average of CPUUtilization rises above 80.
resource "aws_cloudwatch_metric_alarm" "cpu_utilization_too_high" {
  for_each = toset(var.mysql_cluster_members)

  alarm_name        = "rds_cpu_utilization_too_high-${var.customer_prefix}-${each.key}"
  alarm_description = "Average database CPU utilization over last 5 minutes too high"

  // Metric being watched, scoped to the current instance.
  namespace   = "AWS/RDS"
  metric_name = "CPUUtilization"
  dimensions = {
    DBInstanceIdentifier = each.key
  }

  // Evaluation window and trigger condition.
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = 80

  // The same topic is notified when the alarm fires and when it recovers.
  alarm_actions = [var.sns_topic_arn]
  ok_actions    = [var.sns_topic_arn]
}
|
||||
|
||||
// Sends RDS instance events for the monitored members to var.sns_topic_arn.
//
// The subscription is only created when at least one member is supplied.
// The emptiness check uses length() on purpose: `var.mysql_cluster_members == []`
// compares a list(string) with an empty tuple, and Terraform's == returns false
// for values of different types, so that form never matched and the
// subscription was created even when the list was empty.
resource "aws_db_event_subscription" "default" {
  count     = length(var.mysql_cluster_members) == 0 ? 0 : 1
  name      = "rds-event-sub-${var.customer_prefix}"
  sns_topic = var.sns_topic_arn

  // Only events emitted by the listed DB instances are forwarded.
  source_type = "db-instance"
  source_ids  = var.mysql_cluster_members

  event_categories = [
    "failover",
    "failure",
    "low storage",
    "maintenance",
    "notification",
    "recovery",
  ]
}
|
||||
|
||||
// ECS Alarms
|
||||
// Healthy-host alarm for the Fleet target group.
// Fires when the per-minute minimum of HealthyHostCount falls below
// var.fleet_min_containers. Created only when both ALB suffixes are provided,
// since both are needed as metric dimensions.
resource "aws_cloudwatch_metric_alarm" "alb_healthyhosts" {
  count = (var.alb_target_group_arn_suffix != null && var.alb_arn_suffix != null) ? 1 : 0

  alarm_name        = "backend-healthyhosts-${var.customer_prefix}"
  alarm_description = "This alarm indicates the number of Healthy Fleet hosts is lower than expected. Please investigate the load balancer \"${var.alb_name}\" or the target group \"${var.alb_target_group_name}\" and the fleet backend service \"${var.fleet_ecs_service_name}\""

  // Metric being watched.
  namespace   = "AWS/ApplicationELB"
  metric_name = "HealthyHostCount"
  dimensions = {
    TargetGroup  = var.alb_target_group_arn_suffix
    LoadBalancer = var.alb_arn_suffix
  }

  // Evaluation window and trigger condition.
  statistic           = "Minimum"
  period              = "60"
  evaluation_periods  = "1"
  comparison_operator = "LessThanThreshold"
  threshold           = var.fleet_min_containers

  // Notifications go to the same topic on alarm and on recovery.
  actions_enabled = "true"
  alarm_actions   = [var.sns_topic_arn]
  ok_actions      = [var.sns_topic_arn]
}
|
||||
|
||||
// alarm for target response time (anomaly detection)
|
||||
// Anomaly-detection alarm on the p99 target response time of the Fleet backend.
// Fires when the metric (m1) stays above the upper edge of the expected band
// (e1) for 2 consecutive evaluation periods. Created only when both ALB
// suffixes are provided, since both are needed as metric dimensions.
resource "aws_cloudwatch_metric_alarm" "target_response_time" {
  count                     = var.alb_target_group_arn_suffix == null || var.alb_arn_suffix == null ? 0 : 1
  alarm_name                = "backend-target-response-time-${var.customer_prefix}"
  comparison_operator       = "GreaterThanUpperThreshold"
  evaluation_periods        = "2"
  threshold_metric_id       = "e1"
  alarm_description         = "This alarm indicates the Fleet server response time is greater than it usually is. Please investigate the ecs service \"${var.fleet_ecs_service_name}\" because the backend might need to be scaled up."
  alarm_actions             = [var.sns_topic_arn]
  ok_actions                = [var.sns_topic_arn]
  insufficient_data_actions = []

  // Expected band computed from m1; referenced by threshold_metric_id above.
  metric_query {
    id          = "e1"
    expression  = "ANOMALY_DETECTION_BAND(m1)"
    label       = "TargetResponseTime (Expected)"
    return_data = "true"
  }

  // Raw metric that is compared against the band.
  metric_query {
    id          = "m1"
    return_data = "true"
    metric {
      metric_name = "TargetResponseTime"
      namespace   = "AWS/ApplicationELB"
      period      = "120"
      stat        = "p99"
      // No `unit` filter: TargetResponseTime is published in seconds, and a
      // unit that does not match the published data (previously "Count")
      // makes CloudWatch return no datapoints, leaving the alarm in
      // INSUFFICIENT_DATA.

      dimensions = {
        TargetGroup  = var.alb_target_group_arn_suffix
        LoadBalancer = var.alb_arn_suffix
      }
    }
  }
}
|
||||
|
||||
// 5XX alarms on the load balancer: one alarm per metric name in the set below.
// Each fires when the 2-minute sum of the metric is greater than 0; periods
// with no data are treated as healthy.
resource "aws_cloudwatch_metric_alarm" "lb" {
  // Gate on alb_arn_suffix because it is the only input used in this alarm's
  // dimensions. The previous guard tested alb_target_group_arn_suffix, which
  // this resource never reads, so the alarm could be created with a null
  // LoadBalancer dimension or skipped although the load balancer was known.
  for_each            = var.alb_arn_suffix == null ? toset([]) : toset(["HTTPCode_ELB_5XX_Count", "HTTPCode_Target_5XX_Count"])
  alarm_name          = "${var.customer_prefix}-lb-${each.key}"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "1"
  metric_name         = each.key
  namespace           = "AWS/ApplicationELB"
  period              = "120"
  statistic           = "Sum"
  threshold           = "0"
  alarm_description   = "This alarm indicates there are an abnormal amount of 5XX responses. Either the lb cannot talk with the Fleet backend target or Fleet is returning an error."
  alarm_actions       = [var.sns_topic_arn]
  ok_actions          = [var.sns_topic_arn]
  treat_missing_data  = "notBreaching"
  dimensions = {
    LoadBalancer = var.alb_arn_suffix
  }
}
|
||||
|
||||
|
||||
// Elasticache (redis) alerts https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/CacheMetrics.WhichShouldIMonitor.html
|
||||
// Host-level CPU alarm, one per entry in var.redis_cluster_members.
// Fires when the 5-minute average of CPUUtilization rises above 70.
resource "aws_cloudwatch_metric_alarm" "redis_cpu" {
  for_each = toset(var.redis_cluster_members)

  alarm_name        = "redis-cpu-utilization-${each.key}-${var.customer_prefix}"
  alarm_description = "Redis cluster CPU utilization node ${each.key}"

  // Metric being watched, scoped to the current cache cluster.
  namespace   = "AWS/ElastiCache"
  metric_name = "CPUUtilization"
  dimensions = {
    CacheClusterId = each.key
  }

  // Evaluation window and trigger condition.
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "70"

  // The same topic is notified when the alarm fires and when it recovers.
  alarm_actions = [var.sns_topic_arn]
  ok_actions    = [var.sns_topic_arn]
}
|
||||
|
||||
// Engine CPU alarm, one per entry in var.redis_cluster_members.
// Fires when the 5-minute average of EngineCPUUtilization rises above 25.
resource "aws_cloudwatch_metric_alarm" "redis_cpu_engine_utilization" {
  for_each = toset(var.redis_cluster_members)

  alarm_name        = "redis-cpu-engine-utilization-${each.key}-${var.customer_prefix}"
  alarm_description = "Redis cluster CPU Engine utilization node ${each.key}"

  // Metric being watched, scoped to the current cache cluster.
  namespace   = "AWS/ElastiCache"
  metric_name = "EngineCPUUtilization"
  dimensions = {
    CacheClusterId = each.key
  }

  // Evaluation window and trigger condition.
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "25"

  // The same topic is notified when the alarm fires and when it recovers.
  alarm_actions = [var.sns_topic_arn]
  ok_actions    = [var.sns_topic_arn]
}
|
||||
|
||||
// Memory usage alarm, one per entry in var.redis_cluster_members.
// Fires when the 5-minute average of DatabaseMemoryUsagePercentage rises
// above 80.
resource "aws_cloudwatch_metric_alarm" "redis-database-memory-percentage" {
  for_each = toset(var.redis_cluster_members)

  alarm_name        = "redis-database-memory-percentage-${each.key}-${var.customer_prefix}"
  alarm_description = "Percentage of the memory available for the cluster that is in use. This is calculated using used_memory/maxmemory."

  // Metric being watched, scoped to the current cache cluster.
  namespace   = "AWS/ElastiCache"
  metric_name = "DatabaseMemoryUsagePercentage"
  dimensions = {
    CacheClusterId = each.key
  }

  // Evaluation window and trigger condition.
  statistic           = "Average"
  period              = "300"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "80"

  // The same topic is notified when the alarm fires and when it recovers.
  alarm_actions = [var.sns_topic_arn]
  ok_actions    = [var.sns_topic_arn]
}
|
||||
|
||||
// Anomaly-detection alarm on the connection count, one per entry in
// var.redis_cluster_members. Fires when the 10-minute average of
// CurrConnections (m1) stays outside the expected band (e1), in either
// direction, for 5 consecutive evaluation periods.
resource "aws_cloudwatch_metric_alarm" "redis-current-connections" {
  for_each = toset(var.redis_cluster_members)

  alarm_name        = "redis-current-connections-${each.key}-${var.customer_prefix}"
  alarm_description = "Redis current connections for node ${each.key}"

  // Compare m1 against the band produced by the e1 expression.
  threshold_metric_id = "e1"
  comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold"
  evaluation_periods  = "5"

  // Notify on alarm and recovery; nothing is sent for insufficient data.
  alarm_actions             = [var.sns_topic_arn]
  ok_actions                = [var.sns_topic_arn]
  insufficient_data_actions = []

  // Expected band; the second argument (20) is passed to
  // ANOMALY_DETECTION_BAND as written in the expression.
  metric_query {
    id          = "e1"
    label       = "Current Connections (Expected)"
    expression  = "ANOMALY_DETECTION_BAND(m1,20)"
    return_data = "true"
  }

  // Raw metric for the current cache cluster.
  metric_query {
    id          = "m1"
    return_data = "true"

    metric {
      namespace   = "AWS/ElastiCache"
      metric_name = "CurrConnections"
      stat        = "Average"
      period      = "600"
      unit        = "Count"

      dimensions = {
        CacheClusterId = each.key
      }
    }
  }
}
|
||||
|
||||
// Anomaly-detection alarm on replication lag, one per entry in
// var.redis_cluster_members. Fires when the 5-minute p90 of ReplicationLag
// (m1) stays above the upper edge of the expected band (e1) for 3 consecutive
// evaluation periods.
resource "aws_cloudwatch_metric_alarm" "redis-replication-lag" {
  for_each = toset(var.redis_cluster_members)
  // each.key is part of the name so every member gets its own alarm. Without
  // it all for_each instances shared one alarm name and collided. This follows
  // the "<alarm>-<member>-<prefix>" pattern of the other Redis alarms.
  alarm_name                = "redis-replication-lag-${each.key}-${var.customer_prefix}"
  alarm_description         = "This metric is only applicable for a node running as a read replica. It represents how far behind, in seconds, the replica is in applying changes from the primary node. For Redis engine version 5.0.6 onwards, the lag can be measured in milliseconds."
  comparison_operator       = "GreaterThanUpperThreshold"
  evaluation_periods        = "3"
  threshold_metric_id       = "e1"
  alarm_actions             = [var.sns_topic_arn]
  ok_actions                = [var.sns_topic_arn]
  insufficient_data_actions = []

  // Expected band computed from m1; referenced by threshold_metric_id above.
  metric_query {
    id          = "e1"
    expression  = "ANOMALY_DETECTION_BAND(m1)"
    label       = "ReplicationLag (expected)"
    return_data = "true"
  }

  // Raw metric for the current cache cluster.
  metric_query {
    id          = "m1"
    return_data = "true"
    metric {
      metric_name = "ReplicationLag"
      namespace   = "AWS/ElastiCache"
      period      = "300"
      stat        = "p90"

      dimensions = {
        CacheClusterId = each.key
      }
    }
  }
}
|
||||
|
||||
// ACM Certificate Manager
|
||||
// Certificate expiry alarm. Fires when the daily average of DaysToExpiry for
// var.acm_certificate_arn drops below 30. Skipped when no certificate ARN is
// supplied.
resource "aws_cloudwatch_metric_alarm" "acm_certificate_expired" {
  count = var.acm_certificate_arn != null ? 1 : 0

  alarm_name        = "acm-cert-expiry-${var.customer_prefix}"
  alarm_description = "ACM Certificate will expire soon"

  // Metric being watched, scoped to the given certificate.
  namespace   = "AWS/CertificateManager"
  metric_name = "DaysToExpiry"
  dimensions = {
    CertificateArn = var.acm_certificate_arn
  }

  // Evaluation window and trigger condition.
  statistic           = "Average"
  period              = "86400" // 1 day in seconds
  evaluation_periods  = "1"
  comparison_operator = "LessThanThreshold"
  threshold           = 30 // days

  // Notifications go to the same topic on alarm and on recovery.
  actions_enabled = "true"
  alarm_actions   = [var.sns_topic_arn]
  ok_actions      = [var.sns_topic_arn]
}
|
||||
55
terraform/addons/monitoring/variables.tf
Normal file
55
terraform/addons/monitoring/variables.tf
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
variable "customer_prefix" {
  description = "Identifier interpolated into the names of the alarms and the RDS event subscription created by this module."
  type        = string
  default     = "fleet"
}
|
||||
|
||||
variable "fleet_ecs_service_name" {
  description = "Name of the Fleet ECS service. Only used in alarm description text to point responders at the service."
  type        = string
  default     = null
}
|
||||
|
||||
variable "fleet_min_containers" {
  description = "Threshold for the healthy-host alarm: the alarm fires when the target group's HealthyHostCount drops below this number."
  type        = number
  default     = 1
}
|
||||
|
||||
variable "alb_name" {
  description = "Load balancer identifier shown in the healthy-host alarm description. Not used as a metric dimension."
  type        = string
  default     = null
}
|
||||
|
||||
variable "alb_target_group_name" {
  description = "Target group name shown in the healthy-host alarm description. Not used as a metric dimension."
  type        = string
  default     = null
}
|
||||
|
||||
variable "alb_target_group_arn_suffix" {
  description = "ARN suffix of the ALB target group, used as the TargetGroup dimension of the healthy-host and target-response-time alarms. Those alarms are not created when this is null."
  type        = string
  default     = null
}
|
||||
|
||||
variable "alb_arn_suffix" {
  description = "ARN suffix of the ALB, used as the LoadBalancer dimension of the ALB alarms. The healthy-host and target-response-time alarms are not created when this is null."
  type        = string
  default     = null
}
|
||||
|
||||
variable "sns_topic_arn" {
  description = "ARN of the SNS topic that receives alarm and OK notifications from every alarm in this module, as well as the RDS event subscription messages. Required."
  type        = string
}
|
||||
|
||||
variable "mysql_cluster_members" {
  description = "RDS DB instance identifiers to monitor. One CPU alarm is created per entry, and the list is passed as the source_ids of the RDS event subscription."
  type        = list(string)
  default     = []
}
|
||||
|
||||
variable "redis_cluster_members" {
  description = "ElastiCache cache cluster IDs to monitor. Each entry gets its own set of Redis alarms, keyed by the CacheClusterId dimension."
  type        = list(string)
  default     = []
}
|
||||
|
||||
variable "acm_certificate_arn" {
  description = "ARN of the ACM certificate to watch for upcoming expiry. The expiry alarm is not created when this is null."
  type        = string
  default     = null
}
|
||||
|
||||
|
||||
|
|
@ -1,3 +1,11 @@
|
|||
output "byo-db" {
  description = "All outputs of the nested byo-db module, exposed as a single object."
  value       = module.byo-db
}
|
||||
|
||||
output "rds" {
  description = "All outputs of the rds module, exposed as a single object (the monitoring addon example reads cluster_members from it)."
  value       = module.rds
}
|
||||
|
||||
output "redis" {
  description = "All outputs of the redis module, exposed as a single object (the monitoring addon example reads member_clusters from it)."
  value       = module.redis
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue