diff --git a/infrastructure/dogfood/terraform/aws-tf-module/.terraform.lock.hcl b/infrastructure/dogfood/terraform/aws-tf-module/.terraform.lock.hcl index f63e59a9d1..3f16e4c1b7 100644 --- a/infrastructure/dogfood/terraform/aws-tf-module/.terraform.lock.hcl +++ b/infrastructure/dogfood/terraform/aws-tf-module/.terraform.lock.hcl @@ -1,6 +1,25 @@ # This file is maintained automatically by "terraform init". # Manual edits may be lost in future updates. +provider "registry.terraform.io/hashicorp/archive" { + version = "2.4.0" + hashes = [ + "h1:cJokkjeH1jfpG4QEHdRx0t2j8rr52H33A7C/oX73Ok4=", + "zh:18e408596dd53048f7fc8229098d0e3ad940b92036a24287eff63e2caec72594", + "zh:392d4216ecd1a1fd933d23f4486b642a8480f934c13e2cae3c13b6b6a7e34a7b", + "zh:655dd1fa5ca753a4ace21d0de3792d96fff429445717f2ce31c125d19c38f3ff", + "zh:70dae36c176aa2b258331ad366a471176417a94dd3b4985a911b8be9ff842b00", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:7d8c8e3925f1e21daf73f85983894fbe8868e326910e6df3720265bc657b9c9c", + "zh:a032ec0f0aee27a789726e348e8ad20778c3a1c9190ef25e7cff602c8d175f44", + "zh:b8e50de62ba185745b0fe9713755079ad0e9f7ac8638d204de6762cc36870410", + "zh:c8ad0c7697a3d444df21ff97f3473a8604c8639be64afe3f31b8ec7ad7571e18", + "zh:df736c5a2a7c3a82c5493665f659437a22f0baf8c2d157e45f4dd7ca40e739fc", + "zh:e8ffbf578a0977074f6d08aa8734e36c726e53dc79894cfc4f25fadc4f45f1df", + "zh:efea57ff23b141551f92b2699024d356c7ffd1a4ad62931da7ed7a386aef7f1f", + ] +} + provider "registry.terraform.io/hashicorp/aws" { version = "4.57.0" constraints = ">= 2.67.0, >= 3.0.0, >= 3.73.0, >= 4.6.0, >= 4.8.0, >= 4.9.0, >= 4.18.0, >= 4.27.0, >= 4.30.0, >= 4.40.0, 4.57.0" diff --git a/infrastructure/dogfood/terraform/aws-tf-module/main.tf b/infrastructure/dogfood/terraform/aws-tf-module/main.tf index 7b5dbac087..5286f577f6 100644 --- a/infrastructure/dogfood/terraform/aws-tf-module/main.tf +++ b/infrastructure/dogfood/terraform/aws-tf-module/main.tf @@ -264,7 +264,7 @@ module 
"osquery-carve" { } module "monitoring" { - source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.0.0" + source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.1.0" customer_prefix = local.customer fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count @@ -274,11 +274,25 @@ module "monitoring" { alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix sns_topic_arns_map = { alb_httpcode_5xx = [module.notify_slack.slack_topic_arn] + cron_monitoring = [module.notify_slack.slack_topic_arn] } mysql_cluster_members = module.main.byo-vpc.rds.cluster_members # The cloudposse module seems to have a nested list here. redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0] acm_certificate_arn = module.acm.acm_certificate_arn + cron_monitoring = { + mysql_host = module.main.byo-vpc.rds.cluster_reader_endpoint + mysql_database = module.main.byo-vpc.rds.cluster_database_name + mysql_user = module.main.byo-vpc.rds.cluster_master_username + mysql_password_secret_name = module.main.byo-vpc.secrets.secret_ids["${local.customer}-database-password"] + rds_security_group_id = module.main.byo-vpc.rds.security_group_id + subnet_ids = module.main.vpc.private_subnets + vpc_id = module.main.vpc.vpc_id + # Format of https://pkg.go.dev/time#ParseDuration + delay_tolerance = "2h" + # Interval format for: https://docs.aws.amazon.com/scheduler/latest/UserGuide/schedule-types.html#rate-based + run_interval = "1 hour" + } } module "logging_alb" { diff --git a/terraform/addons/monitoring/.header.md b/terraform/addons/monitoring/.header.md index 5de045a3a5..8c25ad69a7 100644 --- a/terraform/addons/monitoring/.header.md +++ b/terraform/addons/monitoring/.header.md @@ -8,6 +8,7 @@ This includes: - RDS Monitoring - Redis Monitoring - ACM Certificate Monitoring +- A custom Lambda to check the Fleet DB for 
Cron runs # Preparation @@ -15,25 +16,63 @@ Some of the for_each and counts in this module cannot pre-determine the numbers You will need to `terraform apply -target module.main` prior applying monitoring assuming the use of a configuration matching the example at https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf. -# Example Configuration +# Example configuration This assumes your fleet module is `main` and is configured with it's default documentation. -See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for details. +See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for details. + ``` module "monitoring" { - source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=main" + source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.1.0" + customer_prefix = local.customer fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count alb_name = module.main.byo-vpc.byo-db.alb.lb_dns_name alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0] alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0] alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix - default_sns_topic_arns = [var.sns_topic_arn] - mysql_cluster_members = module.main.byo-vpc.rds.cluster_members - redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0] - acm_certificate_arn = module.acm.acm_certificate_arn + # Only publish alerts for items in this map + sns_topic_arns_map = { + alb_httpcode_5xx = [var.sns_topic_arn] + cron_monitoring = [var.sns_topic_arn] + } + mysql_cluster_members = module.main.byo-vpc.rds.cluster_members + # The cloudposse module seems to have a nested list here. 
+ redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0] + acm_certificate_arn = module.acm.acm_certificate_arn + cron_monitoring = { + mysql_host = module.main.byo-vpc.rds.cluster_reader_endpoint + mysql_database = module.main.byo-vpc.rds.cluster_database_name + mysql_user = module.main.byo-vpc.rds.cluster_master_username + mysql_password_secret_name = module.main.byo-vpc.secrets.secret_ids["${local.customer}-database-password"] + rds_security_group_id = module.main.byo-vpc.rds.security_group_id + subnet_ids = module.main.vpc.private_subnets + vpc_id = module.main.vpc.vpc_id + # Format of https://pkg.go.dev/time#ParseDuration + delay_tolerance = "2h" + # Interval format for: https://docs.aws.amazon.com/scheduler/latest/UserGuide/schedule-types.html#rate-based + run_interval = "1 hour" + } } -``` +``` +# SNS topic ARNs map + +Valid targets for `sns_topic_arns_map`: + + - acm_certificate_expired + - alb_helthyhosts + - alb_httpcode_5xx + - backend_response_time + - cron_monitoring + - rds_cpu_untilizaton_too_high + - rds_db_event_subscription + - redis_cpu_engine_utilization + - redis_cpu_utilization + - redis_current_connections + - redis_database_memory_percentage + - redis_replication_lag + +If you want to publish to all, use `default_sns_topic_arns` instead and include your notification ARNs there. 
diff --git a/terraform/addons/monitoring/README.md b/terraform/addons/monitoring/README.md index fbf3365fdb..59c5d511be 100644 --- a/terraform/addons/monitoring/README.md +++ b/terraform/addons/monitoring/README.md @@ -8,6 +8,7 @@ This includes: - RDS Monitoring - Redis Monitoring - ACM Certificate Monitoring +- A custom Lambda to check the Fleet DB for Cron runs # Preparation @@ -15,7 +16,7 @@ Some of the for\_each and counts in this module cannot pre-determine the numbers You will need to `terraform apply -target module.main` prior applying monitoring assuming the use of a configuration matching the example at https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf. -# Example Configuration +# Example configuration This assumes your fleet module is `main` and is configured with it's default documentation. @@ -23,20 +24,58 @@ See https://github.com/fleetdm/fleet/blob/main/terraform/example/main.tf for det ``` module "monitoring" { - source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=main" + source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.1.0" + customer_prefix = local.customer fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count alb_name = module.main.byo-vpc.byo-db.alb.lb_dns_name alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0] alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0] alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix - default_sns_topic_arns = [var.sns_topic_arn] - mysql_cluster_members = module.main.byo-vpc.rds.cluster_members - redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0] - acm_certificate_arn = module.acm.acm_certificate_arn + # Only publish alerts for items in this map + sns_topic_arns_map = { + alb_httpcode_5xx = [var.sns_topic_arn] + cron_monitoring = [var.sns_topic_arn] 
+ } + mysql_cluster_members = module.main.byo-vpc.rds.cluster_members + # The cloudposse module seems to have a nested list here. + redis_cluster_members = module.main.byo-vpc.redis.member_clusters[0] + acm_certificate_arn = module.acm.acm_certificate_arn + cron_monitoring = { + mysql_host = module.main.byo-vpc.rds.cluster_reader_endpoint + mysql_database = module.main.byo-vpc.rds.cluster_database_name + mysql_user = module.main.byo-vpc.rds.cluster_master_username + mysql_password_secret_name = module.main.byo-vpc.secrets.secret_ids["${local.customer}-database-password"] + rds_security_group_id = module.main.byo-vpc.rds.security_group_id + subnet_ids = module.main.vpc.private_subnets + vpc_id = module.main.vpc.vpc_id + # Format of https://pkg.go.dev/time#ParseDuration + delay_tolerance = "2h" + # Interval format for: https://docs.aws.amazon.com/scheduler/latest/UserGuide/schedule-types.html#rate-based + run_interval = "1 hour" + } } ``` +# SNS topic ARNs map + +Valid targets for `sns_topic_arns_map`: + + - acm\_certificate\_expired + - alb\_helthyhosts + - alb\_httpcode\_5xx + - backend\_response\_time + - cron\_monitoring + - rds\_cpu\_untilizaton\_too\_high + - rds\_db\_event\_subscription + - redis\_cpu\_engine\_utilization + - redis\_cpu\_utilization + - redis\_current\_connections + - redis\_database\_memory\_percentage + - redis\_replication\_lag + +If you want to publish to all, use `default_sns_topic_arns` instead and include your notification ARNs there. + ## Requirements No requirements. @@ -45,7 +84,9 @@ No requirements. | Name | Version | |------|---------| -| [aws](#provider\_aws) | n/a | +| [archive](#provider\_archive) | 2.4.0 | +| [aws](#provider\_aws) | 5.22.0 | +| [null](#provider\_null) | 3.2.1 | ## Modules @@ -55,6 +96,9 @@ No modules. 
| Name | Type | |------|------| +| [aws_cloudwatch_event_rule.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_target.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_log_group.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_cloudwatch_metric_alarm.acm_certificate_expired](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | | [aws_cloudwatch_metric_alarm.alb_healthyhosts](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | | [aws_cloudwatch_metric_alarm.cpu_utilization_too_high](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | @@ -66,6 +110,21 @@ No modules. 
| [aws_cloudwatch_metric_alarm.redis_cpu_engine_utilization](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | | [aws_cloudwatch_metric_alarm.target_response_time](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_metric_alarm) | resource | | [aws_db_event_subscription.default](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/db_event_subscription) | resource | +| [aws_iam_policy.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_role.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role_policy_attachment.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_iam_role_policy_attachment.cron_monitoring_lambda_managed](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_lambda_function.cron_monitoring](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | +| [aws_lambda_permission.cron_monitoring_cloudwatch](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +| [aws_security_group.cron_monitoring](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group) | resource | +| [aws_security_group_rule.cron_monitoring_to_rds](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource | +| [null_resource.cron_monitoring_build](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| 
[archive_file.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | +| [aws_iam_policy_document.cron_monitoring_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.cron_monitoring_lambda_assume_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | +| [aws_secretsmanager_secret.mysql_database_password](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret) | data source | ## Inputs @@ -76,6 +135,7 @@ No modules. | [alb\_name](#input\_alb\_name) | n/a | `string` | `null` | no | | [alb\_target\_group\_arn\_suffix](#input\_alb\_target\_group\_arn\_suffix) | n/a | `string` | `null` | no | | [alb\_target\_group\_name](#input\_alb\_target\_group\_name) | n/a | `string` | `null` | no | +| [cron\_monitoring](#input\_cron\_monitoring) | n/a |
object({
mysql_host = string
mysql_database = string
mysql_user = string
mysql_password_secret_name = string
vpc_id = string
subnet_ids = list(string)
rds_security_group_id = string
delay_tolerance = string
run_interval = string
}) | `null` | no |
| [customer\_prefix](#input\_customer\_prefix) | n/a | `string` | `"fleet"` | no |
| [default\_sns\_topic\_arns](#input\_default\_sns\_topic\_arns) | n/a | `list(string)` | `[]` | no |
| [fleet\_ecs\_service\_name](#input\_fleet\_ecs\_service\_name) | n/a | `string` | `null` | no |
diff --git a/terraform/addons/monitoring/lambda/.gitignore b/terraform/addons/monitoring/lambda/.gitignore
new file mode 100644
index 0000000000..8c7f193def
--- /dev/null
+++ b/terraform/addons/monitoring/lambda/.gitignore
@@ -0,0 +1,3 @@
+exports.sh
+.lambda.zip
+bootstrap
diff --git a/terraform/addons/monitoring/lambda/go.mod b/terraform/addons/monitoring/lambda/go.mod
new file mode 100644
index 0000000000..f20b3d981e
--- /dev/null
+++ b/terraform/addons/monitoring/lambda/go.mod
@@ -0,0 +1,16 @@
+module github.com/fleetdm/fleet/terraform/addons/monitoring/lambda
+
+go 1.21.1
+
+require (
+ github.com/aws/aws-lambda-go v1.41.0
+ github.com/aws/aws-sdk-go v1.45.25
+ github.com/aws/aws-secretsmanager-caching-go v1.1.2
+ github.com/go-sql-driver/mysql v1.7.1
+ github.com/jessevdk/go-flags v1.5.0
+)
+
+require (
+ github.com/jmespath/go-jmespath v0.4.0 // indirect
+ golang.org/x/sys v0.9.0 // indirect
+)
diff --git a/terraform/addons/monitoring/lambda/go.sum b/terraform/addons/monitoring/lambda/go.sum
new file mode 100644
index 0000000000..2bcdce4a97
--- /dev/null
+++ b/terraform/addons/monitoring/lambda/go.sum
@@ -0,0 +1,73 @@
+github.com/aws/aws-lambda-go v1.41.0 h1:l/5fyVb6Ud9uYd411xdHZzSf2n86TakxzpvIoz7l+3Y=
+github.com/aws/aws-lambda-go v1.41.0/go.mod h1:jwFe2KmMsHmffA1X2R09hH6lFzJQxzI8qK17ewzbQMM=
+github.com/aws/aws-sdk-go v1.44.287/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
+github.com/aws/aws-sdk-go v1.45.25 h1:c4fLlh5sLdK2DCRTY1z0hyuJZU4ygxX8m1FswL6/nF4=
+github.com/aws/aws-sdk-go v1.45.25/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
+github.com/aws/aws-secretsmanager-caching-go v1.1.2 h1:tY3pRhAkaohm75KFpGHoqjWrnRpznqrc8iX/wTLVpH0=
+github.com/aws/aws-secretsmanager-caching-go v1.1.2/go.mod h1:s3Or+O0O8obPyDJz6875Rg1WApAbQ64L0WTBwYNnKLo=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI=
+github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI=
+github.com/jessevdk/go-flags v1.5.0 h1:1jKYvbxEjfUl0fmqTCOfonvskHHXMjBySTLW4y9LFvc=
+github.com/jessevdk/go-flags v1.5.0/go.mod h1:Fw0T6WPc1dYxT4mKEZRfG5kJhaTDP9pj1c2EWnYs/m4=
+github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
+github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
+github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.7.2 h1:4jaiDzPyXQvSd7D0EjG45355tLlV3VOECpq10pLC+8s=
+github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco=
+golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U=
+golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s=
+golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
+golang.org/x/term v0.9.0/go.mod h1:M6DEAAIenWoTxdKrOltXcmDY3rSplQUkrvaDU5FcQyo=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/terraform/addons/monitoring/lambda/main.go b/terraform/addons/monitoring/lambda/main.go
new file mode 100644
index 0000000000..eb56e0123b
--- /dev/null
+++ b/terraform/addons/monitoring/lambda/main.go
@@ -0,0 +1,191 @@
+/*
+This script is intended to be used with AWS Lambda to monitor the various
+crons that live inside of Fleet.
+
+We will check to see if there are recent updates from the crons in the
+following table:
+
+ - cron_stats
+
+If we have an old/incomplete run in cron_stats or if we are missing a
+cron entry entirely, throw an alert to an SNS topic.
+
+Currently tested crons:
+
+ - cleanups_then_aggregation
+ - vulnerabilities
+
+*/
+
+package main
+
+import (
+ "context"
+ "database/sql"
+ "fmt"
+ "log"
+ "strings"
+ "time"
+
+ "github.com/aws/aws-lambda-go/lambda"
+ "github.com/aws/aws-sdk-go/aws"
+ "github.com/aws/aws-sdk-go/aws/session"
+ "github.com/aws/aws-sdk-go/service/secretsmanager"
+ "github.com/aws/aws-sdk-go/service/sns"
+ "github.com/aws/aws-secretsmanager-caching-go/secretcache"
+ "github.com/go-sql-driver/mysql"
+ flags "github.com/jessevdk/go-flags"
+)
+
+// NullEvent is the (empty) event payload type accepted by handler; handler
+// does not read any field from it.
+type NullEvent struct{}
+
+// OptionsStruct holds the runtime configuration. main populates it via
+// go-flags, so every field can be supplied either as the command-line flag
+// named in its `long` tag or through the environment variable named in its
+// `env` tag.
+type OptionsStruct struct {
+ // Non-empty when running inside Lambda; main uses it to choose between
+ // lambda.Start and a one-shot local run.
+ LambdaRuntimeAPI string `long:"lambda-runtime-api" env:"AWS_LAMBDA_RUNTIME_API"`
+ // Comma-separated list of SNS topic ARNs (split on "," in sendSNSMessage).
+ SNSTopicArns string `long:"sns-topic-arn" env:"SNS_TOPIC_ARNS" required:"true"`
+ // Used as the Addr of the MySQL connection in checkDB.
+ MySQLHost string `long:"mysql-host" env:"MYSQL_HOST" required:"true"`
+ MySQLUser string `long:"mysql-user" env:"MYSQL_USER" required:"true"`
+ // Secrets Manager secret ID whose secret string is used as the MySQL password.
+ MySQLSMSecret string `long:"mysql-secretsmanager-secret" env:"MYSQL_SECRETSMANAGER_SECRET" required:"true"`
+ MySQLDatabase string `long:"mysql-database" env:"MYSQL_DATABASE" required:"true"`
+ // Included as the "Environment:" line of every SNS message.
+ FleetEnv string `long:"fleet-environment" env:"FLEET_ENV" required:"true"`
+ // Region used for the AWS session created in handler.
+ AWSRegion string `long:"aws-region" env:"AWS_REGION" required:"true"`
+ // Maximum age of a cron's last completed run before alerting; parsed with
+ // time.ParseDuration in checkDB.
+ CronDelayTolerance string `long:"cron-delay-tolerance" env:"CRON_DELAY_TOLERANCE" default:"2h"`
+}
+
+// options is the process-wide configuration, filled in by main before any
+// other function reads it.
+var options = OptionsStruct{}
+
+// sendSNSMessage publishes msg, prefixed with the Fleet environment name, to
+// every SNS topic listed in options.SNSTopicArns (comma-separated).
+//
+// Publishing is best-effort: a failure for one topic is logged and the
+// remaining topics are still attempted. Nothing is returned to the caller.
+func sendSNSMessage(msg string, sess *session.Session) {
+	log.Printf("Sending SNS Message")
+	fullMsg := fmt.Sprintf("Environment: %s\nMessage: %s", options.FleetEnv, msg)
+	svc := sns.New(sess)
+	for _, rawArn := range strings.Split(options.SNSTopicArns, ",") {
+		// Tolerate "arn1, arn2" and trailing commas instead of publishing to
+		// a blank or space-padded ARN.
+		topicArn := strings.TrimSpace(rawArn)
+		if topicArn == "" {
+			continue
+		}
+		log.Printf("Sending '%s' to '%s'", fullMsg, topicArn)
+		// aws.String copies the value, so no pointer to the loop variable is
+		// handed to the SDK.
+		result, err := svc.Publish(&sns.PublishInput{
+			Message:  aws.String(fullMsg),
+			TopicArn: aws.String(topicArn),
+		})
+		if err != nil {
+			// Never use the error text as the format string: it may contain '%'.
+			log.Printf("Failed to publish to '%s': %v", topicArn, err)
+			continue
+		}
+		log.Printf("%s", result.GoString())
+	}
+}
+
+// reportCheckFailure logs err, publishes alert to the configured SNS topics
+// and returns err unchanged, so callers can `return reportCheckFailure(...)`.
+func reportCheckFailure(sess *session.Session, err error, alert string) error {
+	// Call depth 2 makes Lshortfile report the failing call site in checkDB
+	// rather than this helper.
+	_ = log.Output(2, err.Error())
+	sendSNSMessage(alert, sess)
+	return err
+}
+
+// checkDB connects to the Fleet MySQL database and verifies that each
+// monitored cron has a recent completed run in the cron_stats table.
+//
+// The password is read from the Secrets Manager secret named by
+// options.MySQLSMSecret. For each of the hard-coded crons ("vulnerabilities"
+// and "cleanups_then_aggregation") the query returns the most recent row with
+// status 'completed', or status 'missing cron' with a zero Unix timestamp
+// when no such row exists. If a cron's updated_at is older than
+// options.CronDelayTolerance, one SNS alert is sent and the function returns
+// nil without checking the remaining crons.
+//
+// Any infrastructure failure (secret lookup, connection, query, scan, bad
+// tolerance value) is logged, reported through SNS and returned as the error.
+func checkDB(sess *session.Session) error {
+	secretCache, err := secretcache.New()
+	if err != nil {
+		return reportCheckFailure(sess, err, "Unable to initialise SecretsManager helper. Cron status is unknown.")
+	}
+
+	secretCache.Client = secretsmanager.New(sess)
+
+	mysqlPassword, err := secretCache.GetSecretString(options.MySQLSMSecret)
+	if err != nil {
+		return reportCheckFailure(sess, err, "Unable to retrieve SecretsManager secret. Cron status is unknown.")
+	}
+
+	cfg := mysql.Config{
+		User:                 options.MySQLUser,
+		Passwd:               mysqlPassword,
+		Net:                  "tcp",
+		Addr:                 options.MySQLHost,
+		DBName:               options.MySQLDatabase,
+		AllowNativePasswords: true,
+		// Required so updated_at can be scanned straight into a time.Time.
+		ParseTime: true,
+	}
+
+	db, err := sql.Open("mysql", cfg.FormatDSN())
+	if err != nil {
+		return reportCheckFailure(sess, err, "Unable to connect to database. Cron status unknown.")
+	}
+	// Register Close only once db is known to be non-nil; deferring it before
+	// the error check would panic on a failed Open.
+	defer db.Close()
+
+	if err = db.Ping(); err != nil {
+		return reportCheckFailure(sess, err, "Unable to connect to database. Cron status unknown.")
+	}
+
+	log.Printf("Connected to database!")
+
+	rows, err := db.Query("SELECT b.name,IFNULL(status, 'missing cron'),IFNULL(updated_at, FROM_UNIXTIME(0)) AS updated_at FROM (SELECT 'vulnerabilities' AS name UNION ALL SELECT 'cleanups_then_aggregation') b LEFT JOIN (SELECT name, status, updated_at FROM cron_stats WHERE id IN (SELECT MAX(id) FROM cron_stats WHERE status = 'completed' GROUP BY name)) a ON a.name = b.name;")
+	if err != nil {
+		return reportCheckFailure(sess, err, "Unable to SELECT cron_stats table. Unable to continue.")
+	}
+	// Same reasoning as db.Close above: rows is nil when Query fails.
+	defer rows.Close()
+
+	cronDelayDuration, err := time.ParseDuration(options.CronDelayTolerance)
+	if err != nil {
+		return reportCheckFailure(sess, err, "Unable to parse cron-delay-tolerance. Check lambda settings.")
+	}
+	// Any run last updated before this instant is considered overdue.
+	cronAlertTimestamp := time.Now().Add(-cronDelayDuration)
+
+	for rows.Next() {
+		var (
+			name      string
+			status    string
+			updatedAt time.Time
+		)
+		if err := rows.Scan(&name, &status, &updatedAt); err != nil {
+			return reportCheckFailure(sess, err, "Error scanning row in cron_stats table. Unable to continue.")
+		}
+		log.Printf("Row %s last updated at %s", name, updatedAt.String())
+		if updatedAt.Before(cronAlertTimestamp) {
+			log.Printf("*** %s hasn't updated in more than %s, alerting! (status %s)", name, options.CronDelayTolerance, status)
+			// Fire on the first match and return. We only need to alert that
+			// the crons need to be looked at, not once per cron.
+			sendSNSMessage(fmt.Sprintf("Fleet cron '%s' hasn't updated in more than %s. Last status was '%s' at %s.", name, options.CronDelayTolerance, status, updatedAt.String()), sess)
+			return nil
+		}
+	}
+	// rows.Next returns false both at end-of-data and on error; without this
+	// check a broken connection mid-iteration would look like "all crons OK".
+	if err := rows.Err(); err != nil {
+		return reportCheckFailure(sess, err, "Error reading rows from cron_stats table. Unable to continue.")
+	}
+
+	return nil
+}
+
+// handler is the Lambda entry point (also invoked directly by main for local
+// runs). It builds an AWS session for options.AWSRegion and runs checkDB once.
+// Neither ctx nor the event payload is used.
+//
+// NOTE(review): checkDB's error is discarded and nil is always returned, so
+// the invocation is reported as successful even when the check itself failed
+// (checkDB has already logged and sent an SNS alert by then). Confirm this is
+// intended, e.g. to avoid Lambda retries producing duplicate alerts.
+func handler(ctx context.Context, name NullEvent) error {
+	sessionOpts := session.Options{
+		SharedConfigState: session.SharedConfigEnable,
+		Config: aws.Config{
+			Region: &options.AWSRegion,
+		},
+	}
+	awsSession := session.Must(session.NewSessionWithOptions(sessionOpts))
+
+	_ = checkDB(awsSession)
+	return nil
+}
+
+func main() {
+ var err error
+ log.SetFlags(log.LstdFlags | log.Lshortfile)
+ // Get config from environment
+ parser := flags.NewParser(&options, flags.Default)
+ if _, err = parser.Parse(); err != nil {
+ if flagsErr, ok := err.(*flags.Error); ok && flagsErr.Type == flags.ErrHelp {
+ return
+ } else {
+ log.Fatal(err)
+ }
+ }
+
+ // When running from Lambda, this should be read from the environment.
+ if options.LambdaRuntimeAPI != "" {
+ log.Printf("Starting Lambda handler.")
+ lambda.Start(handler)
+ } else {
+ log.Printf("Lambda execution environment not found. Falling back to local execution.")
+ if err = handler(context.Background(), NullEvent{}); err != nil {
+ log.Fatal(err)
+ }
+ }
+}
diff --git a/terraform/addons/monitoring/main.tf b/terraform/addons/monitoring/main.tf
index 2911a290eb..7c05f374fd 100644
--- a/terraform/addons/monitoring/main.tf
+++ b/terraform/addons/monitoring/main.tf
@@ -239,7 +239,7 @@ resource "aws_cloudwatch_metric_alarm" "redis-replication-lag" {
namespace = "AWS/ElastiCache"
period = "300"
stat = "p90"
-
+
dimensions = {
CacheClusterId = each.key
}
@@ -267,3 +267,198 @@ resource "aws_cloudwatch_metric_alarm" "acm_certificate_expired" {
CertificateArn = var.acm_certificate_arn
}
}
+
+// Cron Monitoring
+// Compiles the cron-monitoring Lambda on the machine running Terraform.
+// Only created when var.cron_monitoring is set. The build re-runs whenever
+// main.go, go.mod or go.sum change (tracked by their SHA-256 hashes).
+resource "null_resource" "cron_monitoring_build" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  triggers = {
+    go_mod_changes  = filesha256("${path.module}/lambda/go.mod")
+    go_sum_changes  = filesha256("${path.module}/lambda/go.sum")
+    main_go_changes = filesha256("${path.module}/lambda/main.go")
+  }
+
+  provisioner "local-exec" {
+    // Produces a Linux/amd64 binary named "bootstrap" inside the lambda directory.
+    command     = <<-EOT
+      go get
+      GOOS=linux GOARCH=amd64 go build -tags lambda.norpc -o bootstrap main.go
+    EOT
+    working_dir = "${path.module}/lambda"
+  }
+}
+
+// Zips the compiled bootstrap binary into the Lambda deployment package.
+data "archive_file" "cron_monitoring_lambda" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  type        = "zip"
+  source_file = "${path.module}/lambda/bootstrap"
+  output_path = "${path.module}/lambda/.lambda.zip"
+
+  // The binary only exists after the local build has run.
+  depends_on = [null_resource.cron_monitoring_build[0]]
+}
+
+// Looks up the Secrets Manager secret (by name) that holds the MySQL password.
+data "aws_secretsmanager_secret" "mysql_database_password" {
+  name  = var.cron_monitoring.mysql_password_secret_name
+  count = var.cron_monitoring == null ? 0 : 1
+}
+
+// Security group attached to the cron-monitoring Lambda. It defines no
+// ingress rules of its own; it is used as the source group in the RDS
+// ingress rule and allows all outbound traffic.
+resource "aws_security_group" "cron_monitoring" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  vpc_id      = var.cron_monitoring.vpc_id
+  name        = "${var.customer_prefix}_cron_monitoring"
+  description = "Security group for cron monitoring lambda (used by RDS to allow access in)"
+
+  // Unrestricted egress: every protocol and port, IPv4 and IPv6.
+  egress {
+    protocol         = "-1"
+    from_port        = 0
+    to_port          = 0
+    cidr_blocks      = ["0.0.0.0/0"]
+    ipv6_cidr_blocks = ["::/0"]
+  }
+}
+
+// Opens TCP 3306 on the RDS security group to the Lambda's security group.
+resource "aws_security_group_rule" "cron_monitoring_to_rds" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  // Rule is added to the RDS group; traffic originates from the Lambda group.
+  security_group_id        = var.cron_monitoring.rds_security_group_id
+  source_security_group_id = aws_security_group.cron_monitoring[0].id
+
+  type      = "ingress"
+  protocol  = "tcp"
+  from_port = 3306
+  to_port   = 3306
+}
+
+// The cron-monitoring Lambda: runs inside the VPC, receives the database
+// connection details and SNS topics through environment variables.
+resource "aws_lambda_function" "cron_monitoring" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  function_name = "${var.customer_prefix}_cron_monitoring"
+  description   = "This function has the ability to log into a production database and validate that the Fleet crons are running properly"
+  role          = aws_iam_role.cron_monitoring_lambda[0].arn
+
+  // Deployment package: zip of the locally built "bootstrap" binary.
+  package_type     = "Zip"
+  runtime          = "provided.al2"
+  handler          = "bootstrap"
+  filename         = data.archive_file.cron_monitoring_lambda[0].output_path
+  source_code_hash = data.archive_file.cron_monitoring_lambda[0].output_base64sha256
+
+  // Sizing; at most one concurrent execution.
+  memory_size                    = 256
+  timeout                        = 300
+  reserved_concurrent_executions = 1
+
+  environment {
+    variables = {
+      FLEET_ENV                   = var.customer_prefix
+      CRON_DELAY_TOLERANCE        = var.cron_monitoring.delay_tolerance
+      MYSQL_HOST                  = var.cron_monitoring.mysql_host
+      MYSQL_DATABASE              = var.cron_monitoring.mysql_database
+      MYSQL_USER                  = var.cron_monitoring.mysql_user
+      MYSQL_SECRETSMANAGER_SECRET = data.aws_secretsmanager_secret.mysql_database_password[0].name
+      // Falls back to the module-wide default topics when no "cron_monitoring" entry exists.
+      SNS_TOPIC_ARNS = join(",", lookup(var.sns_topic_arns_map, "cron_monitoring", var.default_sns_topic_arns))
+    }
+  }
+
+  vpc_config {
+    security_group_ids = [aws_security_group.cron_monitoring[0].id]
+    subnet_ids         = var.cron_monitoring.subnet_ids
+  }
+
+  tracing_config {
+    mode = "Active"
+  }
+
+  depends_on = [
+    null_resource.cron_monitoring_build[0],
+    data.archive_file.cron_monitoring_lambda[0]
+  ]
+}
+
+// Lambda IAM
+// Trust policy allowing the Lambda service to assume the execution role.
+data "aws_iam_policy_document" "cron_monitoring_lambda_assume_role" {
+  statement {
+    principals {
+      identifiers = ["lambda.amazonaws.com"]
+      type        = "Service"
+    }
+
+    actions = ["sts:AssumeRole"]
+  }
+}
+
+// Attaches the module-defined cron-monitoring policy to the Lambda role.
+resource "aws_iam_role_policy_attachment" "cron_monitoring_lambda" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  policy_arn = aws_iam_policy.cron_monitoring_lambda[0].arn
+  role       = aws_iam_role.cron_monitoring_lambda[0].id
+}
+
+// Attaches the AWS-managed AWSLambdaVPCAccessExecutionRole policy to the Lambda role.
+resource "aws_iam_role_policy_attachment" "cron_monitoring_lambda_managed" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
+  role       = aws_iam_role.cron_monitoring_lambda[0].id
+}
+
+// Customer-managed IAM policy rendered from the cron_monitoring_lambda policy document.
+resource "aws_iam_policy" "cron_monitoring_lambda" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  name   = "${var.customer_prefix}-cron-monitoring"
+  policy = data.aws_iam_policy_document.cron_monitoring_lambda.json
+}
+
+// Execution role assumed by the cron-monitoring Lambda.
+resource "aws_iam_role" "cron_monitoring_lambda" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  // IAM role names are unique per AWS account. Prefix with customer_prefix,
+  // like the other cron-monitoring resources, so that several instances of
+  // this module can coexist in one account.
+  // NOTE(review): renaming replaces the role on deployments that already
+  // created "cron-monitoring-lambda".
+  name               = "${var.customer_prefix}-cron-monitoring-lambda"
+  assume_role_policy = data.aws_iam_policy_document.cron_monitoring_lambda_assume_role.json
+}
+
+// Region and caller identity of the configured AWS provider.
+// NOTE(review): neither data source is referenced in the visible part of this
+// file; confirm they are used elsewhere or remove them.
+data "aws_region" "current" {}
+data "aws_caller_identity" "current" {}
+
+// Permissions for the cron-monitoring Lambda: read the database password
+// secret and publish alerts to the configured SNS topics.
+data "aws_iam_policy_document" "cron_monitoring_lambda" {
+  statement {
+    // The sid mentions SSM although the actions are Secrets Manager reads;
+    // it is kept unchanged so the rendered policy stays the same.
+    sid    = "SSMGetParameterPolicy"
+    effect = "Allow"
+
+    actions = [
+      "secretsmanager:GetResourcePolicy",
+      "secretsmanager:DescribeSecret",
+      "secretsmanager:GetSecretValue"
+    ]
+
+    // This data source has no count, so it is evaluated even when
+    // var.cron_monitoring is null and the secret lookup has zero instances.
+    // The splat yields an empty list in that case, where indexing with [0]
+    // would fail with an invalid-index error.
+    resources = data.aws_secretsmanager_secret.mysql_database_password[*].arn
+  }
+
+  statement {
+    sid    = "SNSPublish"
+    effect = "Allow"
+
+    actions = [
+      "sns:Publish"
+    ]
+
+    // Same topic selection as the Lambda's SNS_TOPIC_ARNS environment variable.
+    resources = lookup(var.sns_topic_arns_map, "cron_monitoring", var.default_sns_topic_arns)
+  }
+}
+
+// Log group for the cron-monitoring Lambda, with 7-day retention.
+resource "aws_cloudwatch_log_group" "cron_monitoring_lambda" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  // Lambda writes to /aws/lambda/<function_name>. The function is named
+  // "<customer_prefix>_cron_monitoring" (underscores), so the group name must
+  // match exactly; otherwise the retention set here applies to a group the
+  // function never writes to.
+  // NOTE(review): if Lambda has already auto-created this group in an
+  // existing deployment, it must be imported before apply.
+  name              = "/aws/lambda/${var.customer_prefix}_cron_monitoring"
+  retention_in_days = 7
+}
+
+// Scheduled rule that fires at the configured rate, e.g. "rate(1 hour)".
+resource "aws_cloudwatch_event_rule" "cron_monitoring_lambda" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  name                = "${var.customer_prefix}-cron-monitoring"
+  is_enabled          = true
+  schedule_expression = "rate(${var.cron_monitoring.run_interval})"
+}
+
+// Points the scheduled rule at the cron-monitoring Lambda.
+resource "aws_cloudwatch_event_target" "cron_monitoring_lambda" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  arn  = aws_lambda_function.cron_monitoring[0].arn
+  rule = aws_cloudwatch_event_rule.cron_monitoring_lambda[0].name
+}
+
+// Allows the scheduled event rule (and only that rule) to invoke the Lambda.
+resource "aws_lambda_permission" "cron_monitoring_cloudwatch" {
+  count = var.cron_monitoring == null ? 0 : 1
+
+  function_name = aws_lambda_function.cron_monitoring[0].id
+  action        = "lambda:InvokeFunction"
+  principal     = "events.amazonaws.com"
+  source_arn    = aws_cloudwatch_event_rule.cron_monitoring_lambda[0].arn
+}
diff --git a/terraform/addons/monitoring/variables.tf b/terraform/addons/monitoring/variables.tf
index 5b2efdcbc4..b433e7343c 100644
--- a/terraform/addons/monitoring/variables.tf
+++ b/terraform/addons/monitoring/variables.tf
@@ -19,12 +19,12 @@ variable "alb_name" {
}
variable "alb_target_group_name" {
- type = string
+ type = string
default = null
}
variable "alb_target_group_arn_suffix" {
- type = string
+ type = string
default = null
}
@@ -58,4 +58,17 @@ variable "acm_certificate_arn" {
default = null
}
-
+// Settings for the optional cron-monitoring Lambda. Leaving this null (the
+// default) sets count to 0 on every cron-monitoring resource.
+variable "cron_monitoring" {
+  default = null
+  type = object({
+    // Database connection; the password is read from the named secret.
+    mysql_host                 = string
+    mysql_database             = string
+    mysql_user                 = string
+    mysql_password_secret_name = string
+
+    // Networking for the Lambda and the RDS group it is granted access to.
+    vpc_id                = string
+    subnet_ids            = list(string)
+    rds_security_group_id = string
+
+    // Passed to the Lambda as CRON_DELAY_TOLERANCE.
+    delay_tolerance = string
+    // Interpolated into the schedule as rate(<run_interval>).
+    run_interval = string
+  })
+}