From b74f2c059c788e7afc6b85122e866ca9c4c3723c Mon Sep 17 00:00:00 2001 From: Robert Fairburn <8029478+rfairburn@users.noreply.github.com> Date: Wed, 1 Nov 2023 12:01:02 -0500 Subject: [PATCH] initial fleet-cron monitoring script (#14579) --- .../aws-tf-module/.terraform.lock.hcl | 19 ++ .../dogfood/terraform/aws-tf-module/main.tf | 16 +- terraform/addons/monitoring/.header.md | 55 ++++- terraform/addons/monitoring/README.md | 74 ++++++- terraform/addons/monitoring/lambda/.gitignore | 3 + terraform/addons/monitoring/lambda/go.mod | 16 ++ terraform/addons/monitoring/lambda/go.sum | 73 +++++++ terraform/addons/monitoring/lambda/main.go | 191 +++++++++++++++++ terraform/addons/monitoring/main.tf | 197 +++++++++++++++++- terraform/addons/monitoring/variables.tf | 19 +- 10 files changed, 643 insertions(+), 20 deletions(-) create mode 100644 terraform/addons/monitoring/lambda/.gitignore create mode 100644 terraform/addons/monitoring/lambda/go.mod create mode 100644 terraform/addons/monitoring/lambda/go.sum create mode 100644 terraform/addons/monitoring/lambda/main.go diff --git a/infrastructure/dogfood/terraform/aws-tf-module/.terraform.lock.hcl b/infrastructure/dogfood/terraform/aws-tf-module/.terraform.lock.hcl index f63e59a9d1..3f16e4c1b7 100644 --- a/infrastructure/dogfood/terraform/aws-tf-module/.terraform.lock.hcl +++ b/infrastructure/dogfood/terraform/aws-tf-module/.terraform.lock.hcl @@ -1,6 +1,25 @@ # This file is maintained automatically by "terraform init". # Manual edits may be lost in future updates. 
+provider "registry.terraform.io/hashicorp/archive" { + version = "2.4.0" + hashes = [ + "h1:cJokkjeH1jfpG4QEHdRx0t2j8rr52H33A7C/oX73Ok4=", + "zh:18e408596dd53048f7fc8229098d0e3ad940b92036a24287eff63e2caec72594", + "zh:392d4216ecd1a1fd933d23f4486b642a8480f934c13e2cae3c13b6b6a7e34a7b", + "zh:655dd1fa5ca753a4ace21d0de3792d96fff429445717f2ce31c125d19c38f3ff", + "zh:70dae36c176aa2b258331ad366a471176417a94dd3b4985a911b8be9ff842b00", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:7d8c8e3925f1e21daf73f85983894fbe8868e326910e6df3720265bc657b9c9c", + "zh:a032ec0f0aee27a789726e348e8ad20778c3a1c9190ef25e7cff602c8d175f44", + "zh:b8e50de62ba185745b0fe9713755079ad0e9f7ac8638d204de6762cc36870410", + "zh:c8ad0c7697a3d444df21ff97f3473a8604c8639be64afe3f31b8ec7ad7571e18", + "zh:df736c5a2a7c3a82c5493665f659437a22f0baf8c2d157e45f4dd7ca40e739fc", + "zh:e8ffbf578a0977074f6d08aa8734e36c726e53dc79894cfc4f25fadc4f45f1df", + "zh:efea57ff23b141551f92b2699024d356c7ffd1a4ad62931da7ed7a386aef7f1f", + ] +} + provider "registry.terraform.io/hashicorp/aws" { version = "4.57.0" constraints = ">= 2.67.0, >= 3.0.0, >= 3.73.0, >= 4.6.0, >= 4.8.0, >= 4.9.0, >= 4.18.0, >= 4.27.0, >= 4.30.0, >= 4.40.0, 4.57.0" diff --git a/infrastructure/dogfood/terraform/aws-tf-module/main.tf b/infrastructure/dogfood/terraform/aws-tf-module/main.tf index 7b5dbac087..5286f577f6 100644 --- a/infrastructure/dogfood/terraform/aws-tf-module/main.tf +++ b/infrastructure/dogfood/terraform/aws-tf-module/main.tf @@ -264,7 +264,7 @@ module "osquery-carve" { } module "monitoring" { - source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.0.0" + source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.1.0" customer_prefix = local.customer fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count @@ -274,11 +274,25 @@ module 
"monitoring" { alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix sns_topic_arns_map = { alb_httpcode_5xx = [module.notify_slack.slack_topic_arn] + cron_monitoring = [module.notify_slack.slack_topic_arn] } mysql_cluster_members = module.main.byo-vpc.rds.cluster_members # The cloudposse module seems to have a nested list here. redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0] acm_certificate_arn = module.acm.acm_certificate_arn + cron_monitoring = { + mysql_host = module.main.byo-vpc.rds.cluster_reader_endpoint + mysql_database = module.main.byo-vpc.rds.cluster_database_name + mysql_user = module.main.byo-vpc.rds.cluster_master_username + mysql_password_secret_name = module.main.byo-vpc.secrets.secret_ids["${local.customer}-database-password"] + rds_security_group_id = module.main.byo-vpc.rds.security_group_id + subnet_ids = module.main.vpc.private_subnets + vpc_id = module.main.vpc.vpc_id + # Format of https://pkg.go.dev/time#ParseDuration + delay_tolerance = "2h" + # Interval format for: https://docs.aws.amazon.com/scheduler/latest/UserGuide/schedule-types.html#rate-based + run_interval = "1 hour" + } } module "logging_alb" { diff --git a/terraform/addons/monitoring/.header.md b/terraform/addons/monitoring/.header.md index 5de045a3a5..8c25ad69a7 100644 --- a/terraform/addons/monitoring/.header.md +++ b/terraform/addons/monitoring/.header.md @@ -8,6 +8,7 @@ This includes: - RDS Monitoring - Redis Monitoring - ACM Certificate Monitoring +- A custom Lambda to check the Fleet DB for Cron runs # Preparation @@ -15,25 +16,63 @@ Some of the for_each and counts in this module cannot pre-determine the numbers You will need to `terraform apply -target module.main` prior applying monitoring assuming the use of a configuration matching the example at https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf. 
-# Example Configuration +# Example configuration This assumes your fleet module is `main` and is configured with it's default documentation. -See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for details. +See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for details. + ``` module "monitoring" { - source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=main" + source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.1.0" + customer_prefix = local.customer fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count alb_name = module.main.byo-vpc.byo-db.alb.lb_dns_name alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0] alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0] alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix - default_sns_topic_arns = [var.sns_topic_arn] - mysql_cluster_members = module.main.byo-vpc.rds.cluster_members - redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0] - acm_certificate_arn = module.acm.acm_certificate_arn + # Only publish alerts for items in this map + sns_topic_arns_map = { + alb_httpcode_5xx = [var.sns_topic_arn] + cron_monitoring = [var.sns_topic_arn] + } + mysql_cluster_members = module.main.byo-vpc.rds.cluster_members + # The cloudposse module seems to have a nested list here. 
+ redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0] + acm_certificate_arn = module.acm.acm_certificate_arn + cron_monitoring = { + mysql_host = module.main.byo-vpc.rds.cluster_reader_endpoint + mysql_database = module.main.byo-vpc.rds.cluster_database_name + mysql_user = module.main.byo-vpc.rds.cluster_master_username + mysql_password_secret_name = module.main.byo-vpc.secrets.secret_ids["${local.customer}-database-password"] + rds_security_group_id = module.main.byo-vpc.rds.security_group_id + subnet_ids = module.main.vpc.private_subnets + vpc_id = module.main.vpc.vpc_id + # Format of https://pkg.go.dev/time#ParseDuration + delay_tolerance = "2h" + # Interval format for: https://docs.aws.amazon.com/scheduler/latest/UserGuide/schedule-types.html#rate-based + run_interval = "1 hour" + } } -``` +``` +# SNS topic ARNs map + +Valid targets for `sns_topic_arns_map`: + + - acm_certificate_expired + - alb_helthyhosts + - alb_httpcode_5xx + - backend_response_time + - cron_monitoring + - rds_cpu_untilizaton_too_high + - rds_db_event_subscription + - redis_cpu_engine_utilization + - redis_cpu_utilization + - redis_current_connections + - redis_database_memory_percentage + - redis_replication_lag + +If you want to publish to all, use `default_sns_topic_arns` instead and include your notification ARNs there. 
diff --git a/terraform/addons/monitoring/README.md b/terraform/addons/monitoring/README.md index fbf3365fdb..59c5d511be 100644 --- a/terraform/addons/monitoring/README.md +++ b/terraform/addons/monitoring/README.md @@ -8,6 +8,7 @@ This includes: - RDS Monitoring - Redis Monitoring - ACM Certificate Monitoring +- A custom Lambda to check the Fleet DB for Cron runs # Preparation @@ -15,7 +16,7 @@ Some of the for\_each and counts in this module cannot pre-determine the numbers You will need to `terraform apply -target module.main` prior applying monitoring assuming the use of a configuration matching the example at https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf. -# Example Configuration +# Example configuration This assumes your fleet module is `main` and is configured with it's default documentation. @@ -23,20 +24,58 @@ See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for det ``` module "monitoring" { - source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=main" + source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.1.0" + customer_prefix = local.customer fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count alb_name = module.main.byo-vpc.byo-db.alb.lb_dns_name alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0] alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0] alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix - default_sns_topic_arns = [var.sns_topic_arn] - mysql_cluster_members = module.main.byo-vpc.rds.cluster_members - redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0] - acm_certificate_arn = module.acm.acm_certificate_arn + # Only publish alerts for items in this map + sns_topic_arns_map = { + alb_httpcode_5xx = [var.sns_topic_arn] + cron_monitoring = [var.sns_topic_arn] 
+ } + mysql_cluster_members = module.main.byo-vpc.rds.cluster_members + # The cloudposse module seems to have a nested list here. + redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0] + acm_certificate_arn = module.acm.acm_certificate_arn + cron_monitoring = { + mysql_host = module.main.byo-vpc.rds.cluster_reader_endpoint + mysql_database = module.main.byo-vpc.rds.cluster_database_name + mysql_user = module.main.byo-vpc.rds.cluster_master_username + mysql_password_secret_name = module.main.byo-vpc.secrets.secret_ids["${local.customer}-database-password"] + rds_security_group_id = module.main.byo-vpc.rds.security_group_id + subnet_ids = module.main.vpc.private_subnets + vpc_id = module.main.vpc.vpc_id + # Format of https://pkg.go.dev/time#ParseDuration + delay_tolerance = "2h" + # Interval format for: https://docs.aws.amazon.com/scheduler/latest/UserGuide/schedule-types.html#rate-based + run_interval = "1 hour" + } } ``` +# SNS topic ARNs map + +Valid targets for `sns_topic_arns_map`: + + - acm\_certificate\_expired + - alb\_helthyhosts + - alb\_httpcode\_5xx + - backend\_response\_time + - cron\_monitoring + - rds\_cpu\_untilizaton\_too\_high + - rds\_db\_event\_subscription + - redis\_cpu\_engine\_utilization + - redis\_cpu\_utilization + - redis\_current\_connections + - redis\_database\_memory\_percentage + - redis\_replication\_lag + +If you want to publish to all, use `default_sns_topic_arns` instead and include your notification ARNs there. + ## Requirements No requirements. @@ -45,7 +84,9 @@ No requirements. | Name | Version | |------|---------| -| [aws](#provider\_aws) | n/a | +| [archive](#provider\_archive) | 2.4.0 | +| [aws](#provider\_aws) | 5.22.0 | +| [null](#provider\_null) | 3.2.1 | ## Modules @@ -55,6 +96,9 @@ No modules. 
| Name | Type | |------|------| +| [aws_cloudwatch_event_rule.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_target.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_log_group.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_cloudwatch_metric_alarm.acm_certificate_expired](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | | [aws_cloudwatch_metric_alarm.alb_healthyhosts](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | | [aws_cloudwatch_metric_alarm.cpu_utilization_too_high](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | @@ -66,6 +110,21 @@ No modules. 
| [aws_cloudwatch_metric_alarm.redis_cpu_engine_utilization](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | | [aws_cloudwatch_metric_alarm.target_response_time](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | | [aws_db_event_subscription.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/db_event_subscription) | resource | +| [aws_iam_policy.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_role.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role_policy_attachment.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_iam_role_policy_attachment.cron_monitoring_lambda_managed](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_lambda_function.cron_monitoring](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | +| [aws_lambda_permission.cron_monitoring_cloudwatch](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +| [aws_security_group.cron_monitoring](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | +| [aws_security_group_rule.cron_monitoring_to_rds](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource | +| [null_resource.cron_monitoring_build](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| 
[archive_file.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | +| [aws_iam_policy_document.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.cron_monitoring_lambda_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | +| [aws_secretsmanager_secret.mysql_database_password](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret) | data source | ## Inputs @@ -76,6 +135,7 @@ No modules. | [alb\_name](#input\_alb\_name) | n/a | `string` | `null` | no | | [alb\_target\_group\_arn\_suffix](#input\_alb\_target\_group\_arn\_suffix) | n/a | `string` | `null` | no | | [alb\_target\_group\_name](#input\_alb\_target\_group\_name) | n/a | `string` | `null` | no | +| [cron\_monitoring](#input\_cron\_monitoring) | n/a |
object({
mysql_host = string
mysql_database = string
mysql_user = string
mysql_password_secret_name = string
vpc_id = string
subnet_ids = list(string)
rds_security_group_id = string
delay_tolerance = string
run_interval = string
})
| `null` | no | | [customer\_prefix](#input\_customer\_prefix) | n/a | `string` | `"fleet"` | no | | [default\_sns\_topic\_arns](#input\_default\_sns\_topic\_arns) | n/a | `list(string)` | `[]` | no | | [fleet\_ecs\_service\_name](#input\_fleet\_ecs\_service\_name) | n/a | `string` | `null` | no | diff --git a/terraform/addons/monitoring/lambda/.gitignore b/terraform/addons/monitoring/lambda/.gitignore new file mode 100644 index 0000000000..8c7f193def --- /dev/null +++ b/terraform/addons/monitoring/lambda/.gitignore @@ -0,0 +1,3 @@ +exports.sh +.lambda.zip +bootstrap diff --git a/terraform/addons/monitoring/lambda/go.mod b/terraform/addons/monitoring/lambda/go.mod new file mode 100644 index 0000000000..f20b3d981e --- /dev/null +++ b/terraform/addons/monitoring/lambda/go.mod @@ -0,0 +1,16 @@ +module github.com/fleetdm/fleet/terraform/addons/monitoring/lambda + +go 1.21.1 + +require ( + github.com/aws/aws-lambda-go v1.41.0 + github.com/aws/aws-sdk-go v1.45.25 + github.com/aws/aws-secretsmanager-caching-go v1.1.2 + github.com/go-sql-driver/mysql v1.7.1 + github.com/jessevdk/go-flags v1.5.0 +) + +require ( + github.com/jmespath/go-jmespath v0.4.0 // indirect + golang.org/x/sys v0.9.0 // indirect +) diff --git a/terraform/addons/monitoring/lambda/go.sum b/terraform/addons/monitoring/lambda/go.sum new file mode 100644 index 0000000000..2bcdce4a97 --- /dev/null +++ b/terraform/addons/monitoring/lambda/go.sum @@ -0,0 +1,73 @@ +github.com/aws/aws-lambda-go v1.41.0 h1:l/5fyVb6Ud9uYd411xdHZzSf2n86TakxzpvIoz7l+3Y= +github.com/aws/aws-lambda-go v1.41.0/go.mod h1:jwFe2KmMsHmffA1X2R09hH6lFzJQxzI8qK17ewzbQMM= +github.com/aws/aws-sdk-go v1.44.287/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= +github.com/aws/aws-sdk-go v1.45.25 h1:c4fLlh5sLdK2DCRTY1z0hyuJZU4ygxX8m1FswL6/nF4= +github.com/aws/aws-sdk-go v1.45.25/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= +github.com/aws/aws-secretsmanager-caching-go v1.1.2 h1:tY3pRhAkaohm75KFpGHoqjWrnRpznqrc8iX/wTLVpH0= 
+github.com/aws/aws-secretsmanager-caching-go v1.1.2/go.mod h1:s3Or+O0O8obPyDJz6875Rg1WApAbQ64L0WTBwYNnKLo= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= +github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= +github.com/jessevdk/go-flags v1.5.0 h1:1jKYvbxEjfUl0fmqTCOfonvskHHXMjBySTLW4y9LFvc= +github.com/jessevdk/go-flags v1.5.0/go.mod h1:Fw0T6WPc1dYxT4mKEZRfG5kJhaTDP9pj1c2EWnYs/m4= +github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= +github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= +github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.2 h1:4jaiDzPyXQvSd7D0EjG45355tLlV3VOECpq10pLC+8s= +github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= +golang.org/x/mod 
v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s= +golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.9.0/go.mod h1:M6DEAAIenWoTxdKrOltXcmDY3rSplQUkrvaDU5FcQyo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 
+gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/terraform/addons/monitoring/lambda/main.go b/terraform/addons/monitoring/lambda/main.go new file mode 100644 index 0000000000..eb56e0123b --- /dev/null +++ b/terraform/addons/monitoring/lambda/main.go @@ -0,0 +1,230 @@
+/*
+This script is intended to be used with AWS Lambda to monitor the various
+crons that live inside of Fleet.
+
+We will check to see if there are recent updates from the crons in the
+following table:
+
+  - cron_stats
+
+If we have an old/incomplete run in cron_stats or if we are missing a
+cron entry entirely, throw an alert to an SNS topic.
+
+Currently tested crons:
+
+  - cleanups_then_aggregation
+  - vulnerabilities
+
+*/
+
+package main
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"log"
+	"strings"
+	"time"
+
+	"github.com/aws/aws-lambda-go/lambda"
+	"github.com/aws/aws-sdk-go/aws"
+	"github.com/aws/aws-sdk-go/aws/session"
+	"github.com/aws/aws-sdk-go/service/secretsmanager"
+	"github.com/aws/aws-sdk-go/service/sns"
+	"github.com/aws/aws-secretsmanager-caching-go/secretcache"
+	"github.com/go-sql-driver/mysql"
+	flags "github.com/jessevdk/go-flags"
+)
+
+// NullEvent is the empty payload type the Lambda handler accepts; the handler
+// does not read anything from the triggering event.
+type NullEvent struct{}
+
+// OptionsStruct is the runtime configuration. go-flags fills it from the
+// command-line flag or environment variable named in each field's tags.
+type OptionsStruct struct {
+	LambdaRuntimeAPI   string `long:"lambda-runtime-api" env:"AWS_LAMBDA_RUNTIME_API"`
+	SNSTopicArns       string `long:"sns-topic-arn" env:"SNS_TOPIC_ARNS" required:"true"`
+	MySQLHost          string `long:"mysql-host" env:"MYSQL_HOST" required:"true"`
+	MySQLUser          string `long:"mysql-user" env:"MYSQL_USER" required:"true"`
+	MySQLSMSecret      string `long:"mysql-secretsmanager-secret" env:"MYSQL_SECRETSMANAGER_SECRET" required:"true"`
+	MySQLDatabase      string `long:"mysql-database" env:"MYSQL_DATABASE" required:"true"`
+	FleetEnv           string `long:"fleet-environment" env:"FLEET_ENV" required:"true"`
+	AWSRegion          string `long:"aws-region" env:"AWS_REGION" required:"true"`
+	CronDelayTolerance string `long:"cron-delay-tolerance" env:"CRON_DELAY_TOLERANCE" default:"2h"`
+}
+
+// options is the process-wide configuration, populated by main.
+var options = OptionsStruct{}
+
+// sendSNSMessage publishes msg, prefixed with the Fleet environment name, to
+// every topic in the comma-separated options.SNSTopicArns list. Publishing is
+// best effort: a failure for one topic is logged and the remaining topics are
+// still attempted.
+func sendSNSMessage(msg string, sess *session.Session) {
+	log.Printf("Sending SNS Message")
+	fullMsg := fmt.Sprintf("Environment: %s\nMessage: %s", options.FleetEnv, msg)
+	svc := sns.New(sess)
+	for _, rawArn := range strings.Split(options.SNSTopicArns, ",") {
+		// Skip blank entries (empty list, stray or trailing commas).
+		topicArn := strings.TrimSpace(rawArn)
+		if topicArn == "" {
+			continue
+		}
+		log.Printf("Sending '%s' to '%s'", fullMsg, topicArn)
+		result, err := svc.Publish(&sns.PublishInput{
+			Message:  aws.String(fullMsg),
+			TopicArn: aws.String(topicArn),
+		})
+		if err != nil {
+			log.Print(err)
+			continue
+		}
+		// Print, not Printf: the text is data and must not be read as a format string.
+		log.Print(result.GoString())
+	}
+}
+
+// checkDB reads, for each monitored cron, its most recent 'completed' row in
+// cron_stats and publishes an SNS alert for the first cron whose updated_at is
+// older than options.CronDelayTolerance. A cron with no completed row is
+// reported by the query as status 'missing cron' with updated_at set to
+// FROM_UNIXTIME(0), which is older than any realistic tolerance. Every failure
+// on the way (secret lookup, database connection, query, duration parsing, row
+// scan or iteration) is logged, reported through SNS and returned.
+func checkDB(sess *session.Session) (err error) {
+	secretCache, err := secretcache.New()
+	if err != nil {
+		log.Print(err)
+		sendSNSMessage("Unable to initialise SecretsManager helper. Cron status is unknown.", sess)
+		return err
+	}
+
+	secretCache.Client = secretsmanager.New(sess)
+
+	MySQLPassword, err := secretCache.GetSecretString(options.MySQLSMSecret)
+	if err != nil {
+		log.Print(err)
+		sendSNSMessage("Unable to retrieve SecretsManager secret. Cron status is unknown.", sess)
+		return err
+	}
+
+	cfg := mysql.Config{
+		User:                 options.MySQLUser,
+		Passwd:               MySQLPassword,
+		Net:                  "tcp",
+		Addr:                 options.MySQLHost,
+		DBName:               options.MySQLDatabase,
+		AllowNativePasswords: true,
+		ParseTime:            true,
+	}
+
+	db, err := sql.Open("mysql", cfg.FormatDSN())
+	if err != nil {
+		log.Print(err)
+		sendSNSMessage("Unable to connect to database. Cron status unknown.", sess)
+		return err
+	}
+	// Deferred only after the error check: sql.Open returns a nil *sql.DB on failure.
+	defer db.Close()
+	if err = db.Ping(); err != nil {
+		log.Print(err)
+		sendSNSMessage("Unable to connect to database. Cron status unknown.", sess)
+		return err
+	}
+
+	log.Printf("Connected to database!")
+
+	type CronStatsRow struct {
+		name       string
+		status     string
+		updated_at time.Time
+	}
+
+	// One row per monitored cron name, LEFT JOINed to that cron's newest 'completed' cron_stats row.
+	rows, err := db.Query("SELECT b.name,IFNULL(status, 'missing cron'),IFNULL(updated_at, FROM_UNIXTIME(0)) AS updated_at FROM (SELECT 'vulnerabilities' AS name UNION ALL SELECT 'cleanups_then_aggregation') b LEFT JOIN (SELECT name, status, updated_at FROM cron_stats WHERE id IN (SELECT MAX(id) FROM cron_stats WHERE status = 'completed' GROUP BY name)) a ON a.name = b.name;")
+	if err != nil {
+		log.Print(err)
+		sendSNSMessage("Unable to SELECT cron_stats table. Unable to continue.", sess)
+		return err
+	}
+	// Deferred only after the error check: rows is nil when the query fails.
+	defer rows.Close()
+	cronDelayDuration, err := time.ParseDuration(options.CronDelayTolerance)
+	if err != nil {
+		log.Print(err)
+		sendSNSMessage("Unable to parse cron-delay-tolerance. Check lambda settings.", sess)
+		return err
+	}
+	cronAlertTimestamp := time.Now().Add(-1 * cronDelayDuration)
+	for rows.Next() {
+		var row CronStatsRow
+		if err := rows.Scan(&row.name, &row.status, &row.updated_at); err != nil {
+			log.Print(err)
+			sendSNSMessage("Error scanning row in cron_stats table. Unable to continue.", sess)
+			return err
+		}
+		log.Printf("Row %s last updated at %s", row.name, row.updated_at.String())
+		if row.updated_at.Before(cronAlertTimestamp) {
+			log.Printf("*** %s hasn't updated in more than %s, alerting! (status %s)", row.name, options.CronDelayTolerance, row.status)
+			// Fire on the first match and return. We only need to alert that the crons need to be looked at, not each cron.
+			sendSNSMessage(fmt.Sprintf("Fleet cron '%s' hasn't updated in more than %s. Last status was '%s' at %s.", row.name, options.CronDelayTolerance, row.status, row.updated_at.String()), sess)
+			return nil
+		}
+	}
+	// rows.Next also returns false on an iteration error; surface that instead of reporting healthy crons.
+	if err = rows.Err(); err != nil {
+		log.Print(err)
+		sendSNSMessage("Error reading rows from cron_stats table. Unable to continue.", sess)
+		return err
+	}
+
+	return nil
+}
+
+// handler is the Lambda entry point: it builds an AWS session for
+// options.AWSRegion and runs checkDB. The event payload is ignored.
+func handler(ctx context.Context, name NullEvent) error {
+	sess := session.Must(session.NewSessionWithOptions(
+		session.Options{
+			SharedConfigState: session.SharedConfigEnable,
+			Config: aws.Config{
+				Region: &options.AWSRegion,
+			},
+		},
+	))
+
+	// checkDB logs and sends an SNS message on every failure path, so its error is
+	// dropped and the invocation always reports success.
+	// NOTE(review): presumably this keeps Lambda retries from repeating the alert - confirm.
+	_ = checkDB(sess)
+	return nil
+}
+
+// main parses the configuration, then either starts the Lambda runtime loop
+// (when AWS_LAMBDA_RUNTIME_API is set) or runs the handler once locally.
+func main() {
+	var err error
+	log.SetFlags(log.LstdFlags | log.Lshortfile)
+	// Get config from environment
+	parser := flags.NewParser(&options, flags.Default)
+	if _, err = parser.Parse(); err != nil {
+		// A help request is not a failure; exit quietly.
+		if flagsErr, ok := err.(*flags.Error); ok && flagsErr.Type == flags.ErrHelp {
+			return
+		}
+		log.Fatal(err)
+	}
+
+	// When running from Lambda, this should be read from the environment.
+	if options.LambdaRuntimeAPI != "" {
+		log.Printf("Starting Lambda handler.")
+		lambda.Start(handler)
+	} else {
+		log.Printf("Lambda execution environment not found. Falling back to local execution.")
+		if err = handler(context.Background(), NullEvent{}); err != nil {
+			log.Fatal(err)
+		}
+	}
+}
diff --git a/terraform/addons/monitoring/main.tf b/terraform/addons/monitoring/main.tf index 2911a290eb..7c05f374fd 100644 --- a/terraform/addons/monitoring/main.tf +++ b/terraform/addons/monitoring/main.tf @@ -239,7 +239,7 @@ resource "aws_cloudwatch_metric_alarm" "redis-replication-lag" { namespace = "AWS/ElastiCache" period = "300" stat = "p90" - + dimensions = { CacheClusterId = each.key } @@ -267,3 +267,198 @@ resource "aws_cloudwatch_metric_alarm" "acm_certificate_expired" { CertificateArn = var.acm_certificate_arn } } + +// Cron Monitoring +resource "null_resource" "cron_monitoring_build" { + count = var.cron_monitoring == null ? 
0 : 1
+  triggers = {
+    main_go_changes = filesha256("${path.module}/lambda/main.go"),
+    go_mod_changes  = filesha256("${path.module}/lambda/go.mod")
+    go_sum_changes  = filesha256("${path.module}/lambda/go.sum")
+  }
+  provisioner "local-exec" {
+    working_dir = "${path.module}/lambda"
+    command     = <<-EOT
+      go get
+      GOOS=linux GOARCH=amd64 go build -tags lambda.norpc -o bootstrap main.go
+    EOT
+  }
+}
+
+data "archive_file" "cron_monitoring_lambda" {
+  count       = var.cron_monitoring == null ? 0 : 1
+  depends_on  = [null_resource.cron_monitoring_build[0]]
+  type        = "zip"
+  output_path = "${path.module}/lambda/.lambda.zip"
+  source_file = "${path.module}/lambda/bootstrap"
+}
+
+data "aws_secretsmanager_secret" "mysql_database_password" {
+  count = var.cron_monitoring == null ? 0 : 1
+  name  = var.cron_monitoring.mysql_password_secret_name
+}
+
+resource "aws_security_group" "cron_monitoring" {
+  count       = var.cron_monitoring == null ? 0 : 1
+  name        = "${var.customer_prefix}_cron_monitoring"
+  description = "Security group for cron monitoring lambda (used by RDS to allow access in)"
+  vpc_id      = var.cron_monitoring.vpc_id
+
+  egress {
+    from_port        = 0
+    to_port          = 0
+    protocol         = "-1"
+    cidr_blocks      = ["0.0.0.0/0"]
+    ipv6_cidr_blocks = ["::/0"]
+  }
+}
+
+resource "aws_security_group_rule" "cron_monitoring_to_rds" {
+  count                    = var.cron_monitoring == null ? 0 : 1
+  type                     = "ingress"
+  from_port                = 3306
+  to_port                  = 3306
+  protocol                 = "tcp"
+  source_security_group_id = aws_security_group.cron_monitoring[0].id
+  security_group_id        = var.cron_monitoring.rds_security_group_id
+}
+
+resource "aws_lambda_function" "cron_monitoring" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  depends_on = [
+    null_resource.cron_monitoring_build[0],
+    data.archive_file.cron_monitoring_lambda[0]
+  ]
+
+  function_name                  = "${var.customer_prefix}_cron_monitoring"
+  runtime                        = "provided.al2"
+  memory_size                    = 256
+  timeout                        = 300
+  package_type                   = "Zip"
+  filename                       = data.archive_file.cron_monitoring_lambda[0].output_path
+  source_code_hash               = data.archive_file.cron_monitoring_lambda[0].output_base64sha256
+  handler                        = "bootstrap"
+  reserved_concurrent_executions = 1
+  description                    = "This function has the ability to log into a production database and validate that the Fleet crons are running properly"
+  tracing_config {
+    mode = "Active"
+  }
+
+  vpc_config {
+    subnet_ids         = var.cron_monitoring.subnet_ids
+    security_group_ids = [aws_security_group.cron_monitoring[0].id]
+  }
+
+  role = aws_iam_role.cron_monitoring_lambda[0].arn
+
+  environment {
+    variables = {
+      MYSQL_HOST                  = var.cron_monitoring.mysql_host
+      MYSQL_DATABASE              = var.cron_monitoring.mysql_database
+      MYSQL_USER                  = var.cron_monitoring.mysql_user
+      MYSQL_SECRETSMANAGER_SECRET = data.aws_secretsmanager_secret.mysql_database_password[0].name
+      SNS_TOPIC_ARNS              = join(",", lookup(var.sns_topic_arns_map, "cron_monitoring", var.default_sns_topic_arns))
+      FLEET_ENV                   = var.customer_prefix
+      CRON_DELAY_TOLERANCE        = var.cron_monitoring.delay_tolerance
+    }
+  }
+
+}
+
+// Lambda IAM
+data "aws_iam_policy_document" "cron_monitoring_lambda_assume_role" {
+  statement {
+    actions = ["sts:AssumeRole"]
+    principals {
+      type        = "Service"
+      identifiers = ["lambda.amazonaws.com"]
+    }
+  }
+}
+
+resource "aws_iam_role_policy_attachment" "cron_monitoring_lambda" {
+  count      = var.cron_monitoring == null ? 0 : 1
+  role       = aws_iam_role.cron_monitoring_lambda[0].id
+  policy_arn = aws_iam_policy.cron_monitoring_lambda[0].arn
+}
+
+resource "aws_iam_role_policy_attachment" "cron_monitoring_lambda_managed" {
+  count      = var.cron_monitoring == null ? 0 : 1
+  role       = aws_iam_role.cron_monitoring_lambda[0].id
+  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
+}
+
+resource "aws_iam_policy" "cron_monitoring_lambda" {
+  count  = var.cron_monitoring == null ? 0 : 1
+  name   = "${var.customer_prefix}-cron-monitoring"
+  policy = data.aws_iam_policy_document.cron_monitoring_lambda.json
+}
+
+resource "aws_iam_role" "cron_monitoring_lambda" {
+  count              = var.cron_monitoring == null ? 0 : 1
+  name               = "${var.customer_prefix}-cron-monitoring-lambda" // prefixed like the policy so deployments sharing an account do not collide
+  assume_role_policy = data.aws_iam_policy_document.cron_monitoring_lambda_assume_role.json
+}
+
+data "aws_region" "current" {}
+data "aws_caller_identity" "current" {}
+
+data "aws_iam_policy_document" "cron_monitoring_lambda" {
+  statement {
+
+    sid = "SSMGetParameterPolicy"
+
+    actions = [
+      "secretsmanager:GetResourcePolicy",
+      "secretsmanager:DescribeSecret",
+      "secretsmanager:GetSecretValue"
+    ]
+
+    resources = data.aws_secretsmanager_secret.mysql_database_password[*].arn // splat, not [0]: this document has no count and is evaluated even when cron_monitoring is null
+
+    effect = "Allow"
+
+  }
+
+  statement {
+    sid = "SNSPublish"
+
+    actions = [
+      "sns:Publish"
+    ]
+
+    resources = lookup(var.sns_topic_arns_map, "cron_monitoring", var.default_sns_topic_arns)
+
+    effect = "Allow"
+  }
+
+}
+
+resource "aws_cloudwatch_log_group" "cron_monitoring_lambda" {
+  count             = var.cron_monitoring == null ? 0 : 1
+  name              = "/aws/lambda/${var.customer_prefix}_cron_monitoring" // must equal /aws/lambda/<function_name> for the retention to apply to the Lambda's logs
+  retention_in_days = 7
+
+}
+
+resource "aws_cloudwatch_event_rule" "cron_monitoring_lambda" {
+  count               = var.cron_monitoring == null ? 0 : 1
+  name                = "${var.customer_prefix}-cron-monitoring"
+  schedule_expression = "rate(${var.cron_monitoring.run_interval})"
+  is_enabled          = true
+}
+
+resource "aws_cloudwatch_event_target" "cron_monitoring_lambda" {
+  count = var.cron_monitoring == null ? 
0 : 1 + rule = aws_cloudwatch_event_rule.cron_monitoring_lambda[0].name + arn = aws_lambda_function.cron_monitoring[0].arn +} + +resource "aws_lambda_permission" "cron_monitoring_cloudwatch" { + count = var.cron_monitoring == null ? 0 : 1 + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.cron_monitoring[0].id + principal = "events.amazonaws.com" + source_arn = aws_cloudwatch_event_rule.cron_monitoring_lambda[0].arn +} diff --git a/terraform/addons/monitoring/variables.tf b/terraform/addons/monitoring/variables.tf index 5b2efdcbc4..b433e7343c 100644 --- a/terraform/addons/monitoring/variables.tf +++ b/terraform/addons/monitoring/variables.tf @@ -19,12 +19,12 @@ variable "alb_name" { } variable "alb_target_group_name" { - type = string + type = string default = null } variable "alb_target_group_arn_suffix" { - type = string + type = string default = null } @@ -58,4 +58,17 @@ variable "acm_certificate_arn" { default = null } - +variable "cron_monitoring" { + type = object({ + mysql_host = string + mysql_database = string + mysql_user = string + mysql_password_secret_name = string + vpc_id = string + subnet_ids = list(string) + rds_security_group_id = string + delay_tolerance = string + run_interval = string + }) + default = null +}