initial fleet-cron monitoring script (#14579)

This commit is contained in:
Robert Fairburn 2023-11-01 12:01:02 -05:00 committed by GitHub
parent 0aa953b602
commit b74f2c059c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 643 additions and 20 deletions

View file

@ -1,6 +1,25 @@
# This file is maintained automatically by "terraform init".
# Manual edits may be lost in future updates.
provider "registry.terraform.io/hashicorp/archive" {
version = "2.4.0"
hashes = [
"h1:cJokkjeH1jfpG4QEHdRx0t2j8rr52H33A7C/oX73Ok4=",
"zh:18e408596dd53048f7fc8229098d0e3ad940b92036a24287eff63e2caec72594",
"zh:392d4216ecd1a1fd933d23f4486b642a8480f934c13e2cae3c13b6b6a7e34a7b",
"zh:655dd1fa5ca753a4ace21d0de3792d96fff429445717f2ce31c125d19c38f3ff",
"zh:70dae36c176aa2b258331ad366a471176417a94dd3b4985a911b8be9ff842b00",
"zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
"zh:7d8c8e3925f1e21daf73f85983894fbe8868e326910e6df3720265bc657b9c9c",
"zh:a032ec0f0aee27a789726e348e8ad20778c3a1c9190ef25e7cff602c8d175f44",
"zh:b8e50de62ba185745b0fe9713755079ad0e9f7ac8638d204de6762cc36870410",
"zh:c8ad0c7697a3d444df21ff97f3473a8604c8639be64afe3f31b8ec7ad7571e18",
"zh:df736c5a2a7c3a82c5493665f659437a22f0baf8c2d157e45f4dd7ca40e739fc",
"zh:e8ffbf578a0977074f6d08aa8734e36c726e53dc79894cfc4f25fadc4f45f1df",
"zh:efea57ff23b141551f92b2699024d356c7ffd1a4ad62931da7ed7a386aef7f1f",
]
}
provider "registry.terraform.io/hashicorp/aws" {
version = "4.57.0"
constraints = ">= 2.67.0, >= 3.0.0, >= 3.73.0, >= 4.6.0, >= 4.8.0, >= 4.9.0, >= 4.18.0, >= 4.27.0, >= 4.30.0, >= 4.40.0, 4.57.0"

View file

@ -264,7 +264,7 @@ module "osquery-carve" {
}
module "monitoring" {
source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.0.0"
source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.1.0"
customer_prefix = local.customer
fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name
fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count
@ -274,11 +274,25 @@ module "monitoring" {
alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix
sns_topic_arns_map = {
alb_httpcode_5xx = [module.notify_slack.slack_topic_arn]
cron_monitoring = [module.notify_slack.slack_topic_arn]
}
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
# The cloudposse module seems to have a nested list here.
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
acm_certificate_arn = module.acm.acm_certificate_arn
cron_monitoring = {
mysql_host = module.main.byo-vpc.rds.cluster_reader_endpoint
mysql_database = module.main.byo-vpc.rds.cluster_database_name
mysql_user = module.main.byo-vpc.rds.cluster_master_username
mysql_password_secret_name = module.main.byo-vpc.secrets.secret_ids["${local.customer}-database-password"]
rds_security_group_id = module.main.byo-vpc.rds.security_group_id
subnet_ids = module.main.vpc.private_subnets
vpc_id = module.main.vpc.vpc_id
# Format of https://pkg.go.dev/time#ParseDuration
delay_tolerance = "2h"
# Interval format for: https://docs.aws.amazon.com/scheduler/latest/UserGuide/schedule-types.html#rate-based
run_interval = "1 hour"
}
}
module "logging_alb" {

View file

@ -8,6 +8,7 @@ This includes:
- RDS Monitoring
- Redis Monitoring
- ACM Certificate Monitoring
- A custom Lambda to check the Fleet DB for Cron runs
# Preparation
@ -15,25 +16,63 @@ Some of the for_each and counts in this module cannot pre-determine the numbers
You will need to `terraform apply -target module.main` prior to applying monitoring, assuming the use of a configuration matching the example at https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf.
# Example Configuration
# Example configuration
This assumes your fleet module is `main` and is configured with its default documentation.
See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for details.
See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for details.
```
module "monitoring" {
source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=main"
source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.1.0"
customer_prefix = local.customer
fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name
fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count
alb_name = module.main.byo-vpc.byo-db.alb.lb_dns_name
alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0]
alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0]
alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix
default_sns_topic_arns = [var.sns_topic_arn]
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
acm_certificate_arn = module.acm.acm_certificate_arn
# Only publish alerts for items in this map
sns_topic_arns_map = {
alb_httpcode_5xx = [var.sns_topic_arn]
cron_monitoring = [var.sns_topic_arn]
}
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
# The cloudposse module seems to have a nested list here.
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
acm_certificate_arn = module.acm.acm_certificate_arn
cron_monitoring = {
mysql_host = module.main.byo-vpc.rds.cluster_reader_endpoint
mysql_database = module.main.byo-vpc.rds.cluster_database_name
mysql_user = module.main.byo-vpc.rds.cluster_master_username
mysql_password_secret_name = module.main.byo-vpc.secrets.secret_ids["${local.customer}-database-password"]
rds_security_group_id = module.main.byo-vpc.rds.security_group_id
subnet_ids = module.main.vpc.private_subnets
vpc_id = module.main.vpc.vpc_id
# Format of https://pkg.go.dev/time#ParseDuration
delay_tolerance = "2h"
# Interval format for: https://docs.aws.amazon.com/scheduler/latest/UserGuide/schedule-types.html#rate-based
run_interval = "1 hour"
}
}
```
```
# SNS topic ARNs map
Valid targets for `sns_topic_arns_map`:
- acm_certificate_expired
- alb_helthyhosts
- alb_httpcode_5xx
- backend_response_time
- cron_monitoring
- rds_cpu_untilizaton_too_high
- rds_db_event_subscription
- redis_cpu_engine_utilization
- redis_cpu_utilization
- redis_current_connections
- redis_database_memory_percentage
- redis_replication_lag
If you want to publish to all, use `default_sns_topic_arns` instead and include your notification ARNs there.

View file

@ -8,6 +8,7 @@ This includes:
- RDS Monitoring
- Redis Monitoring
- ACM Certificate Monitoring
- A custom Lambda to check the Fleet DB for Cron runs
# Preparation
@ -15,7 +16,7 @@ Some of the for\_each and counts in this module cannot pre-determine the numbers
You will need to `terraform apply -target module.main` prior to applying monitoring, assuming the use of a configuration matching the example at https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf.
# Example Configuration
# Example configuration
This assumes your fleet module is `main` and is configured with its default documentation.
@ -23,20 +24,58 @@ See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for det
```
module "monitoring" {
source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=main"
source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.1.0"
customer_prefix = local.customer
fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name
fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count
alb_name = module.main.byo-vpc.byo-db.alb.lb_dns_name
alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0]
alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0]
alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix
default_sns_topic_arns = [var.sns_topic_arn]
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
acm_certificate_arn = module.acm.acm_certificate_arn
# Only publish alerts for items in this map
sns_topic_arns_map = {
alb_httpcode_5xx = [var.sns_topic_arn]
cron_monitoring = [var.sns_topic_arn]
}
mysql_cluster_members = module.main.byo-vpc.rds.cluster_members
# The cloudposse module seems to have a nested list here.
redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0]
acm_certificate_arn = module.acm.acm_certificate_arn
cron_monitoring = {
mysql_host = module.main.byo-vpc.rds.cluster_reader_endpoint
mysql_database = module.main.byo-vpc.rds.cluster_database_name
mysql_user = module.main.byo-vpc.rds.cluster_master_username
mysql_password_secret_name = module.main.byo-vpc.secrets.secret_ids["${local.customer}-database-password"]
rds_security_group_id = module.main.byo-vpc.rds.security_group_id
subnet_ids = module.main.vpc.private_subnets
vpc_id = module.main.vpc.vpc_id
# Format of https://pkg.go.dev/time#ParseDuration
delay_tolerance = "2h"
# Interval format for: https://docs.aws.amazon.com/scheduler/latest/UserGuide/schedule-types.html#rate-based
run_interval = "1 hour"
}
}
```
# SNS topic ARNs map
Valid targets for `sns_topic_arns_map`:
- acm\_certificate\_expired
- alb\_helthyhosts
- alb\_httpcode\_5xx
- backend\_response\_time
- cron\_monitoring
- rds\_cpu\_untilizaton\_too\_high
- rds\_db\_event\_subscription
- redis\_cpu\_engine\_utilization
- redis\_cpu\_utilization
- redis\_current\_connections
- redis\_database\_memory\_percentage
- redis\_replication\_lag
If you want to publish to all, use `default_sns_topic_arns` instead and include your notification ARNs there.
## Requirements
No requirements.
@ -45,7 +84,9 @@ No requirements.
| Name | Version |
|------|---------|
| <a name="provider_aws"></a> [aws](#provider\_aws) | n/a |
| <a name="provider_archive"></a> [archive](#provider\_archive) | 2.4.0 |
| <a name="provider_aws"></a> [aws](#provider\_aws) | 5.22.0 |
| <a name="provider_null"></a> [null](#provider\_null) | 3.2.1 |
## Modules
@ -55,6 +96,9 @@ No modules.
| Name | Type |
|------|------|
| [aws_cloudwatch_event_rule.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource |
| [aws_cloudwatch_event_target.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource |
| [aws_cloudwatch_log_group.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
| [aws_cloudwatch_metric_alarm.acm_certificate_expired](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.alb_healthyhosts](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.cpu_utilization_too_high](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
@ -66,6 +110,21 @@ No modules.
| [aws_cloudwatch_metric_alarm.redis_cpu_engine_utilization](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_cloudwatch_metric_alarm.target_response_time](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource |
| [aws_db_event_subscription.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/db_event_subscription) | resource |
| [aws_iam_policy.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
| [aws_iam_role.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
| [aws_iam_role_policy_attachment.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
| [aws_iam_role_policy_attachment.cron_monitoring_lambda_managed](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
| [aws_lambda_function.cron_monitoring](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource |
| [aws_lambda_permission.cron_monitoring_cloudwatch](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource |
| [aws_security_group.cron_monitoring](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource |
| [aws_security_group_rule.cron_monitoring_to_rds](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource |
| [null_resource.cron_monitoring_build](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
| [archive_file.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source |
| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
| [aws_iam_policy_document.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
| [aws_iam_policy_document.cron_monitoring_lambda_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source |
| [aws_secretsmanager_secret.mysql_database_password](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret) | data source |
## Inputs
@ -76,6 +135,7 @@ No modules.
| <a name="input_alb_name"></a> [alb\_name](#input\_alb\_name) | n/a | `string` | `null` | no |
| <a name="input_alb_target_group_arn_suffix"></a> [alb\_target\_group\_arn\_suffix](#input\_alb\_target\_group\_arn\_suffix) | n/a | `string` | `null` | no |
| <a name="input_alb_target_group_name"></a> [alb\_target\_group\_name](#input\_alb\_target\_group\_name) | n/a | `string` | `null` | no |
| <a name="input_cron_monitoring"></a> [cron\_monitoring](#input\_cron\_monitoring) | n/a | <pre>object({<br> mysql_host = string<br> mysql_database = string<br> mysql_user = string<br> mysql_password_secret_name = string<br> vpc_id = string<br> subnet_ids = list(string)<br> rds_security_group_id = string<br> delay_tolerance = string<br> run_interval = string <br> })</pre> | `null` | no |
| <a name="input_customer_prefix"></a> [customer\_prefix](#input\_customer\_prefix) | n/a | `string` | `"fleet"` | no |
| <a name="input_default_sns_topic_arns"></a> [default\_sns\_topic\_arns](#input\_default\_sns\_topic\_arns) | n/a | `list(string)` | `[]` | no |
| <a name="input_fleet_ecs_service_name"></a> [fleet\_ecs\_service\_name](#input\_fleet\_ecs\_service\_name) | n/a | `string` | `null` | no |

View file

@ -0,0 +1,3 @@
exports.sh
.lambda.zip
bootstrap

View file

@ -0,0 +1,16 @@
module github.com/fleetdm/fleet/terraform/addons/monitoring/lambda
go 1.21.1
require (
github.com/aws/aws-lambda-go v1.41.0
github.com/aws/aws-sdk-go v1.45.25
github.com/aws/aws-secretsmanager-caching-go v1.1.2
github.com/go-sql-driver/mysql v1.7.1
github.com/jessevdk/go-flags v1.5.0
)
require (
github.com/jmespath/go-jmespath v0.4.0 // indirect
golang.org/x/sys v0.9.0 // indirect
)

View file

@ -0,0 +1,73 @@
github.com/aws/aws-lambda-go v1.41.0 h1:l/5fyVb6Ud9uYd411xdHZzSf2n86TakxzpvIoz7l+3Y=
github.com/aws/aws-lambda-go v1.41.0/go.mod h1:jwFe2KmMsHmffA1X2R09hH6lFzJQxzI8qK17ewzbQMM=
github.com/aws/aws-sdk-go v1.44.287/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
github.com/aws/aws-sdk-go v1.45.25 h1:c4fLlh5sLdK2DCRTY1z0hyuJZU4ygxX8m1FswL6/nF4=
github.com/aws/aws-sdk-go v1.45.25/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
github.com/aws/aws-secretsmanager-caching-go v1.1.2 h1:tY3pRhAkaohm75KFpGHoqjWrnRpznqrc8iX/wTLVpH0=
github.com/aws/aws-secretsmanager-caching-go v1.1.2/go.mod h1:s3Or+O0O8obPyDJz6875Rg1WApAbQ64L0WTBwYNnKLo=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI=
github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI=
github.com/jessevdk/go-flags v1.5.0 h1:1jKYvbxEjfUl0fmqTCOfonvskHHXMjBySTLW4y9LFvc=
github.com/jessevdk/go-flags v1.5.0/go.mod h1:Fw0T6WPc1dYxT4mKEZRfG5kJhaTDP9pj1c2EWnYs/m4=
github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.7.2 h1:4jaiDzPyXQvSd7D0EjG45355tLlV3VOECpq10pLC+8s=
github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s=
golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.9.0/go.mod h1:M6DEAAIenWoTxdKrOltXcmDY3rSplQUkrvaDU5FcQyo=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View file

@ -0,0 +1,191 @@
/*
This script is intended to be used with AWS Lambda to monitor the various
crons that live inside of Fleet.
We will check to see if there are recent updates from the crons in the
following table:
- cron_stats
If we have an old/incomplete run in cron_stats or if we are missing a
cron entry entirely, throw an alert to an SNS topic.
Currently tested crons:
- cleanups_then_aggregation
- vulnerabilities
*/
package main
import (
"context"
"database/sql"
"fmt"
"log"
"strings"
"time"
"github.com/aws/aws-lambda-go/lambda"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/secretsmanager"
"github.com/aws/aws-sdk-go/service/sns"
"github.com/aws/aws-secretsmanager-caching-go/secretcache"
"github.com/go-sql-driver/mysql"
flags "github.com/jessevdk/go-flags"
)
// NullEvent is the (empty) event payload type accepted by handler; no fields
// of the incoming Lambda event are read.
type NullEvent struct{}
// OptionsStruct holds the runtime configuration. It is populated by go-flags
// in main from command-line flags (`long` tag) or environment variables
// (`env` tag); fields tagged required:"true" must be supplied by one of the two.
type OptionsStruct struct {
// Non-empty when running inside Lambda; main uses it to choose between
// lambda.Start and a single local invocation.
LambdaRuntimeAPI string `long:"lambda-runtime-api" env:"AWS_LAMBDA_RUNTIME_API"`
// Comma-separated list of SNS topic ARNs (split on "," in sendSNSMessage).
SNSTopicArns string `long:"sns-topic-arn" env:"SNS_TOPIC_ARNS" required:"true"`
// MySQL connection settings used to build the DSN in checkDB.
MySQLHost string `long:"mysql-host" env:"MYSQL_HOST" required:"true"`
MySQLUser string `long:"mysql-user" env:"MYSQL_USER" required:"true"`
// Secrets Manager secret identifier whose string value is used as the MySQL password.
MySQLSMSecret string `long:"mysql-secretsmanager-secret" env:"MYSQL_SECRETSMANAGER_SECRET" required:"true"`
MySQLDatabase string `long:"mysql-database" env:"MYSQL_DATABASE" required:"true"`
// Label included in every SNS message as "Environment: <value>".
FleetEnv string `long:"fleet-environment" env:"FLEET_ENV" required:"true"`
// AWS region used when creating the SDK session in handler.
AWSRegion string `long:"aws-region" env:"AWS_REGION" required:"true"`
// Maximum age of the last completed cron run before alerting; parsed with
// time.ParseDuration in checkDB.
CronDelayTolerance string `long:"cron-delay-tolerance" env:"CRON_DELAY_TOLERANCE" default:"2h"`
}
// options is the process-wide configuration, filled in by main before any
// handler invocation.
var options = OptionsStruct{}
// sendSNSMessage publishes msg, prefixed with the configured Fleet
// environment, to every topic ARN in the comma-separated options.SNSTopicArns.
//
// Publishing is best-effort: a failure for one topic is logged and the
// remaining topics are still attempted. Blank entries (for example from a
// trailing comma) are skipped. Nothing is reported back to the caller.
func sendSNSMessage(msg string, sess *session.Session) {
	log.Printf("Sending SNS Message")
	fullMsg := fmt.Sprintf("Environment: %s\nMessage: %s", options.FleetEnv, msg)
	svc := sns.New(sess)
	for _, rawArn := range strings.Split(options.SNSTopicArns, ",") {
		SNSTopicArn := strings.TrimSpace(rawArn)
		if SNSTopicArn == "" {
			continue
		}
		log.Printf("Sending '%s' to '%s'", fullMsg, SNSTopicArn)
		result, err := svc.Publish(&sns.PublishInput{
			Message:  &fullMsg,
			TopicArn: &SNSTopicArn,
		})
		if err != nil {
			// log.Print, not log.Printf: the error text is data, not a format string.
			log.Print(err.Error())
			continue
		}
		log.Print(result.GoString())
	}
}
// checkDB connects to the Fleet MySQL database and verifies that each
// monitored cron ("vulnerabilities" and "cleanups_then_aggregation") has a
// completed run in cron_stats that is newer than options.CronDelayTolerance.
//
// The database password is read from the Secrets Manager secret named by
// options.MySQLSMSecret. On the first stale or missing cron an SNS alert is
// sent and the function returns nil (one alert per run is enough to prompt a
// look at all crons). Any infrastructure failure (secret lookup, connection,
// query, row scan, bad tolerance value) also sends an SNS alert and is
// returned as err.
func checkDB(sess *session.Session) (err error) {
	secretCache, err := secretcache.New()
	if err != nil {
		log.Print(err.Error())
		sendSNSMessage("Unable to initialise SecretsManager helper. Cron status is unknown.", sess)
		return err
	}
	secretCache.Client = secretsmanager.New(sess)
	MySQLPassword, err := secretCache.GetSecretString(options.MySQLSMSecret)
	if err != nil {
		log.Print(err.Error())
		sendSNSMessage("Unable to retrieve SecretsManager secret. Cron status is unknown.", sess)
		return err
	}

	cfg := mysql.Config{
		User:                 options.MySQLUser,
		Passwd:               MySQLPassword,
		Net:                  "tcp",
		Addr:                 options.MySQLHost,
		DBName:               options.MySQLDatabase,
		AllowNativePasswords: true,
		ParseTime:            true,
	}

	db, err := sql.Open("mysql", cfg.FormatDSN())
	if err != nil {
		log.Print(err.Error())
		sendSNSMessage("Unable to connect to database. Cron status unknown.", sess)
		return err
	}
	// Only defer Close once we know db is non-nil.
	defer db.Close()
	if err = db.Ping(); err != nil {
		log.Print(err.Error())
		sendSNSMessage("Unable to connect to database. Cron status unknown.", sess)
		return err
	}

	log.Printf("Connected to database!")

	type cronStatsRow struct {
		name      string
		status    string
		updatedAt time.Time
	}

	// For each expected cron name, fetch the most recent *completed* run; a
	// cron with no completed run yields status 'missing cron' and the Unix epoch.
	rows, err := db.Query("SELECT b.name,IFNULL(status, 'missing cron'),IFNULL(updated_at, FROM_UNIXTIME(0)) AS updated_at FROM (SELECT 'vulnerabilities' AS name UNION ALL SELECT 'cleanups_then_aggregation') b LEFT JOIN (SELECT name, status, updated_at FROM cron_stats WHERE id IN (SELECT MAX(id) FROM cron_stats WHERE status = 'completed' GROUP BY name)) a ON a.name = b.name;")
	if err != nil {
		log.Print(err.Error())
		sendSNSMessage("Unable to SELECT cron_stats table. Unable to continue.", sess)
		return err
	}
	// rows is nil when Query fails, so the deferred Close must come after the check.
	defer rows.Close()

	cronDelayDuration, err := time.ParseDuration(options.CronDelayTolerance)
	if err != nil {
		log.Print(err.Error())
		sendSNSMessage("Unable to parse cron-delay-tolerance. Check lambda settings.", sess)
		return err
	}
	cronAlertTimestamp := time.Now().Add(-1 * cronDelayDuration)

	for rows.Next() {
		var row cronStatsRow
		if err := rows.Scan(&row.name, &row.status, &row.updatedAt); err != nil {
			log.Print(err.Error())
			sendSNSMessage("Error scanning row in cron_stats table. Unable to continue.", sess)
			return err
		}
		log.Printf("Row %s last updated at %s", row.name, row.updatedAt.String())
		if row.updatedAt.Before(cronAlertTimestamp) {
			log.Printf("*** %s hasn't updated in more than %s, alerting! (status %s)", row.name, options.CronDelayTolerance, row.status)
			// Fire on the first match and return. We only need to alert that the crons need looked at, not each cron.
			sendSNSMessage(fmt.Sprintf("Fleet cron '%s' hasn't updated in more than %s. Last status was '%s' at %s.", row.name, options.CronDelayTolerance, row.status, row.updatedAt.String()), sess)
			return nil
		}
	}
	// rows.Next returning false can mean "done" or "iteration failed"; tell them apart.
	if err = rows.Err(); err != nil {
		log.Print(err.Error())
		sendSNSMessage("Error reading rows from cron_stats table. Unable to continue.", sess)
		return err
	}

	return nil
}
// handler is the Lambda entry point. It builds an AWS SDK session for
// options.AWSRegion (with shared config enabled) and runs a single cron check.
// It always returns nil.
func handler(ctx context.Context, name NullEvent) error {
	awsCfg := aws.Config{
		Region: &options.AWSRegion,
	}
	sessOpts := session.Options{
		SharedConfigState: session.SharedConfigEnable,
		Config:            awsCfg,
	}
	sess := session.Must(session.NewSessionWithOptions(sessOpts))

	// NOTE(review): the error from checkDB is discarded, so the invocation is
	// reported as successful even when the check itself failed. checkDB logs
	// and alerts on its own failures; confirm this is intentional.
	_ = checkDB(sess)
	return nil
}
// main parses configuration from flags/environment into options and then
// either starts the Lambda runtime loop or, when no Lambda runtime API is
// configured, runs the handler once locally.
func main() {
	log.SetFlags(log.LstdFlags | log.Lshortfile)

	// Get config from environment
	parser := flags.NewParser(&options, flags.Default)
	if _, parseErr := parser.Parse(); parseErr != nil {
		flagsErr, isFlagsErr := parseErr.(*flags.Error)
		if isFlagsErr && flagsErr.Type == flags.ErrHelp {
			// Help was requested; exit quietly.
			return
		}
		log.Fatal(parseErr)
	}

	// When running from Lambda, this should be read from the environment.
	if options.LambdaRuntimeAPI == "" {
		log.Printf("Lambda execution environment not found. Falling back to local execution.")
		if runErr := handler(context.Background(), NullEvent{}); runErr != nil {
			log.Fatal(runErr)
		}
		return
	}

	log.Printf("Starting Lambda handler.")
	lambda.Start(handler)
}

View file

@ -239,7 +239,7 @@ resource "aws_cloudwatch_metric_alarm" "redis-replication-lag" {
namespace = "AWS/ElastiCache"
period = "300"
stat = "p90"
dimensions = {
CacheClusterId = each.key
}
@ -267,3 +267,198 @@ resource "aws_cloudwatch_metric_alarm" "acm_certificate_expired" {
CertificateArn = var.acm_certificate_arn
}
}
// Cron Monitoring
# Builds the cron-monitoring Lambda binary (`bootstrap`) locally with the Go
# toolchain. Re-runs whenever main.go, go.mod or go.sum change.
resource "null_resource" "cron_monitoring_build" {
  count = var.cron_monitoring == null ? 0 : 1

  triggers = {
    main_go_changes = filesha256("${path.module}/lambda/main.go")
    go_mod_changes  = filesha256("${path.module}/lambda/go.mod")
    go_sum_changes  = filesha256("${path.module}/lambda/go.sum")
  }

  provisioner "local-exec" {
    working_dir = "${path.module}/lambda"
    # `set -e` makes a failing `go get` abort the build instead of being masked
    # by the exit status of the following command.
    # CGO_ENABLED=0 produces a statically linked binary so it does not depend
    # on the glibc version of the build host.
    command = <<-EOT
      set -e
      go get
      CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -tags lambda.norpc -o bootstrap main.go
    EOT
  }
}
# Zips the locally built `bootstrap` binary into the Lambda deployment package.
data "archive_file" "cron_monitoring_lambda" {
  count = var.cron_monitoring == null ? 0 : 1

  type        = "zip"
  source_file = "${path.module}/lambda/bootstrap"
  output_path = "${path.module}/lambda/.lambda.zip"

  # The binary must exist before it can be archived.
  depends_on = [null_resource.cron_monitoring_build[0]]
}
# Looks up the Secrets Manager secret that holds the MySQL password, by name.
data "aws_secretsmanager_secret" "mysql_database_password" {
  name  = var.cron_monitoring.mysql_password_secret_name
  count = var.cron_monitoring == null ? 0 : 1
}
# Security group attached to the cron-monitoring Lambda. It has no ingress
# rules of its own and allows all outbound traffic (IPv4 and IPv6).
resource "aws_security_group" "cron_monitoring" {
  count = var.cron_monitoring == null ? 0 : 1

  vpc_id      = var.cron_monitoring.vpc_id
  name        = "${var.customer_prefix}_cron_monitoring"
  description = "Security group for cron monitoring lambda (used by RDS to allow access in)"

  egress {
    protocol         = "-1"
    from_port        = 0
    to_port          = 0
    cidr_blocks      = ["0.0.0.0/0"]
    ipv6_cidr_blocks = ["::/0"]
  }
}
# Adds an ingress rule to the RDS security group allowing MySQL (tcp/3306)
# from the cron-monitoring Lambda's security group.
resource "aws_security_group_rule" "cron_monitoring_to_rds" {
  count = var.cron_monitoring == null ? 0 : 1

  security_group_id        = var.cron_monitoring.rds_security_group_id
  source_security_group_id = aws_security_group.cron_monitoring[0].id

  type      = "ingress"
  protocol  = "tcp"
  from_port = 3306
  to_port   = 3306
}
# The cron-monitoring Lambda itself: a custom-runtime (provided.al2) function
# deployed from the zipped `bootstrap` binary and attached to the given VPC
# subnets so it can reach the database.
resource "aws_lambda_function" "cron_monitoring" {
  count = var.cron_monitoring == null ? 0 : 1

  function_name = "${var.customer_prefix}_cron_monitoring"
  description   = "This function has the ability to log into a production database and validate that the Fleet crons are running properly"
  role          = aws_iam_role.cron_monitoring_lambda[0].arn

  # Deployment package
  package_type     = "Zip"
  filename         = data.archive_file.cron_monitoring_lambda[0].output_path
  source_code_hash = data.archive_file.cron_monitoring_lambda[0].output_base64sha256
  runtime          = "provided.al2"
  handler          = "bootstrap"

  # Sizing / limits; at most one execution at a time.
  memory_size                    = 256
  timeout                        = 300
  reserved_concurrent_executions = 1

  vpc_config {
    subnet_ids         = var.cron_monitoring.subnet_ids
    security_group_ids = [aws_security_group.cron_monitoring[0].id]
  }

  tracing_config {
    mode = "Active"
  }

  # Configuration read by the Go program through its `env:"..."` option tags.
  environment {
    variables = {
      MYSQL_HOST                  = var.cron_monitoring.mysql_host
      MYSQL_DATABASE              = var.cron_monitoring.mysql_database
      MYSQL_USER                  = var.cron_monitoring.mysql_user
      MYSQL_SECRETSMANAGER_SECRET = data.aws_secretsmanager_secret.mysql_database_password[0].name
      SNS_TOPIC_ARNS              = join(",", lookup(var.sns_topic_arns_map, "cron_monitoring", var.default_sns_topic_arns))
      FLEET_ENV                   = var.customer_prefix
      CRON_DELAY_TOLERANCE        = var.cron_monitoring.delay_tolerance
    }
  }

  depends_on = [
    null_resource.cron_monitoring_build[0],
    data.archive_file.cron_monitoring_lambda[0]
  ]
}
// Lambda IAM
# Trust policy: lets the Lambda service assume the cron-monitoring role.
data "aws_iam_policy_document" "cron_monitoring_lambda_assume_role" {
  statement {
    principals {
      identifiers = ["lambda.amazonaws.com"]
      type        = "Service"
    }

    actions = ["sts:AssumeRole"]
  }
}
# Attaches the module-defined cron-monitoring policy to the Lambda role.
resource "aws_iam_role_policy_attachment" "cron_monitoring_lambda" {
  count = var.cron_monitoring == null ? 0 : 1

  policy_arn = aws_iam_policy.cron_monitoring_lambda[0].arn
  role       = aws_iam_role.cron_monitoring_lambda[0].id
}
# Attaches the AWS-managed AWSLambdaVPCAccessExecutionRole policy to the
# Lambda role.
resource "aws_iam_role_policy_attachment" "cron_monitoring_lambda_managed" {
  count = var.cron_monitoring == null ? 0 : 1

  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
  role       = aws_iam_role.cron_monitoring_lambda[0].id
}
# IAM policy rendered from the cron_monitoring_lambda policy document.
resource "aws_iam_policy" "cron_monitoring_lambda" {
  count = var.cron_monitoring == null ? 0 : 1

  policy = data.aws_iam_policy_document.cron_monitoring_lambda.json
  name   = "${var.customer_prefix}-cron-monitoring"
}
# Execution role assumed by the cron-monitoring Lambda.
resource "aws_iam_role" "cron_monitoring_lambda" {
  count = var.cron_monitoring == null ? 0 : 1

  # IAM role names are unique per AWS account, so the name carries the
  # customer prefix to let several deployments share one account.
  # NOTE(review): changing the name replaces the role on existing deployments.
  name               = "${var.customer_prefix}-cron-monitoring-lambda"
  assume_role_policy = data.aws_iam_policy_document.cron_monitoring_lambda_assume_role.json
}
# Region and account of the calling credentials.
# NOTE(review): neither data source is referenced by the blocks visible in
# this change; confirm they are used elsewhere in the module.
data "aws_region" "current" {
}

data "aws_caller_identity" "current" {
}
# Permissions for the cron-monitoring Lambda: read the MySQL password secret
# and publish to the configured SNS topics.
data "aws_iam_policy_document" "cron_monitoring_lambda" {
  # This data source has no `count`, so it is evaluated even when
  # var.cron_monitoring is null and the secret lookup has zero instances.
  # Iterating over the splat (instead of indexing [0]) yields no statement in
  # that case rather than failing with "Invalid index".
  dynamic "statement" {
    for_each = data.aws_secretsmanager_secret.mysql_database_password[*].arn

    content {
      sid = "SSMGetParameterPolicy"
      actions = [
        "secretsmanager:GetResourcePolicy",
        "secretsmanager:DescribeSecret",
        "secretsmanager:GetSecretValue"
      ]
      resources = [statement.value]
      effect    = "Allow"
    }
  }

  statement {
    sid = "SNSPublish"
    actions = [
      "sns:Publish"
    ]
    # Topics registered under the "cron_monitoring" key, else the default topics.
    resources = lookup(var.sns_topic_arns_map, "cron_monitoring", var.default_sns_topic_arns)
    effect    = "Allow"
  }
}
# Log group for the cron-monitoring Lambda, with a 7-day retention.
resource "aws_cloudwatch_log_group" "cron_monitoring_lambda" {
  count = var.cron_monitoring == null ? 0 : 1

  # Lambda writes to /aws/lambda/<function_name>; the function is named
  # "${var.customer_prefix}_cron_monitoring" (underscores), so the log group
  # must use the same spelling for the retention setting to apply.
  name              = "/aws/lambda/${var.customer_prefix}_cron_monitoring"
  retention_in_days = 7
}
# Schedule that triggers the check every `run_interval` (an EventBridge rate
# value such as "1 hour").
resource "aws_cloudwatch_event_rule" "cron_monitoring_lambda" {
  count = var.cron_monitoring == null ? 0 : 1

  name                = "${var.customer_prefix}-cron-monitoring"
  is_enabled          = true
  schedule_expression = "rate(${var.cron_monitoring.run_interval})"
}
# Points the schedule rule at the cron-monitoring Lambda.
resource "aws_cloudwatch_event_target" "cron_monitoring_lambda" {
  count = var.cron_monitoring == null ? 0 : 1

  arn  = aws_lambda_function.cron_monitoring[0].arn
  rule = aws_cloudwatch_event_rule.cron_monitoring_lambda[0].name
}
# Allows the schedule rule (and only that rule) to invoke the Lambda.
resource "aws_lambda_permission" "cron_monitoring_cloudwatch" {
  count = var.cron_monitoring == null ? 0 : 1

  function_name = aws_lambda_function.cron_monitoring[0].id
  action        = "lambda:InvokeFunction"
  source_arn    = aws_cloudwatch_event_rule.cron_monitoring_lambda[0].arn
  principal     = "events.amazonaws.com"
}

View file

@ -19,12 +19,12 @@ variable "alb_name" {
}
variable "alb_target_group_name" {
type = string
type = string
default = null
}
variable "alb_target_group_arn_suffix" {
type = string
type = string
default = null
}
@ -58,4 +58,17 @@ variable "acm_certificate_arn" {
default = null
}
variable "cron_monitoring" {
  description = "Settings for the Lambda that checks the Fleet database for recent cron runs. Leave as null to disable cron monitoring. delay_tolerance uses Go time.ParseDuration syntax (e.g. \"2h\"); run_interval is an EventBridge rate value (e.g. \"1 hour\")."
  type = object({
    mysql_host                 = string
    mysql_database             = string
    mysql_user                 = string
    mysql_password_secret_name = string
    vpc_id                     = string
    subnet_ids                 = list(string)
    rds_security_group_id      = string
    delay_tolerance            = string
    run_interval               = string
  })
  default = null
}