terraform {
  required_version = ">= 1.5"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 5.68.0"
    }
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.11"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.23"
    }
  }

  backend "s3" {
    bucket               = "fleet-terraform-state20220408141538466600000002"
    key                  = "loadtesting/loadtesting/signoz/terraform.tfstate"
    workspace_key_prefix = "loadtesting"
    region               = "us-east-2"
    encrypt              = true
    kms_key_id           = "9f98a443-ffd7-4dbe-a9c3-37df89b2e42a"
    dynamodb_table       = "tf-remote-state-lock"
    assume_role = {
      role_arn = "arn:aws:iam::353365949058:role/terraform-loadtesting"
    }
  }
}
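
# Typical workflow against this backend (assumes credentials that can assume
# the terraform-loadtesting role): run `terraform init`, then pick a workspace
# with `terraform workspace select <name>` or `terraform workspace new <name>`;
# the S3 backend namespaces non-default workspace state under the
# workspace_key_prefix above.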

provider "aws" {
  region = var.aws_region

  default_tags {
    tags = {
      environment = "loadtest-signoz"
      terraform   = "https://github.com/fleetdm/fleet/tree/main/infrastructure/loadtesting/terraform/signoz"
      workspace   = terraform.workspace
    }
  }
}

# Read shared VPC from remote state
data "terraform_remote_state" "shared_vpc" {
  backend = "s3"
  config = {
    bucket               = "fleet-terraform-state20220408141538466600000002"
    key                  = "loadtesting/loadtesting/shared/terraform.tfstate"
    workspace_key_prefix = "loadtesting"
    region               = "us-east-2"
    encrypt              = true
    kms_key_id           = "9f98a443-ffd7-4dbe-a9c3-37df89b2e42a"
    dynamodb_table       = "tf-remote-state-lock"
    assume_role = {
      role_arn = "arn:aws:iam::353365949058:role/terraform-loadtesting"
    }
  }
}
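
# The shared state is assumed to expose a `vpc` output object (in the shape
# produced by the terraform-aws-modules/vpc module) with at least `vpc_id` and
# `private_subnets`; both are consumed by the EKS module below.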

# Providers for Helm/Kubernetes, configured after the cluster is created
provider "helm" {
  kubernetes {
    host                   = module.eks.cluster_endpoint
    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
    exec {
      api_version = "client.authentication.k8s.io/v1beta1"
      command     = "aws"
      args = [
        "eks",
        "get-token",
        "--cluster-name",
        module.eks.cluster_name,
        "--region",
        var.aws_region
      ]
    }
  }
}
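
# This provider and the kubernetes provider below authenticate with a
# short-lived token fetched at plan/apply time via `aws eks get-token`, so no
# static kubeconfig or credentials end up in Terraform state.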

provider "kubernetes" {
  host                   = module.eks.cluster_endpoint
  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
  exec {
    api_version = "client.authentication.k8s.io/v1beta1"
    command     = "aws"
    args = [
      "eks",
      "get-token",
      "--cluster-name",
      module.eks.cluster_name,
      "--region",
      var.aws_region
    ]
  }
}

locals {
  cluster_name = "signoz-${terraform.workspace}"
}
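
# e.g. in workspace "default" the cluster is named "signoz-default".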

# Use shared fleet VPC
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 21.0"

  name               = local.cluster_name
  kubernetes_version = "1.31"

  endpoint_public_access = true

  vpc_id     = data.terraform_remote_state.shared_vpc.outputs.vpc.vpc_id
  subnet_ids = data.terraform_remote_state.shared_vpc.outputs.vpc.private_subnets

  # IMPORTANT: Install critical addons BEFORE the node group to avoid a
  # circular dependency. Nodes need the VPC CNI to become Ready, but Terraform
  # waits for nodes to be Ready before creating addons, deadlocking with nodes
  # stuck NotReady. Solution: install these addons with before_compute = true
  # so the VPC CNI lands before the node group finishes creating.
  addons = {
    vpc-cni = {
      most_recent    = true
      before_compute = true
    }
    kube-proxy = {
      most_recent    = true
      before_compute = true
    }
    coredns = {
      most_recent    = true
      before_compute = true
    }
  }

  # Managed node group
  eks_managed_node_groups = {
    default = {
      min_size     = 1
      max_size     = 1
      desired_size = 1

      instance_types = ["t3.2xlarge"]
    }
  }

  # Enable cluster creator admin access
  enable_cluster_creator_admin_permissions = true

  # Enable OIDC provider for IRSA (IAM Roles for Service Accounts)
  enable_irsa = true
}
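
# A quick post-create sanity check (workspace name is a placeholder):
#   aws eks update-kubeconfig --region us-east-2 --name signoz-<workspace>
#   kubectl get nodes   # nodes should report Ready once the VPC CNI is up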

# IAM Role for EBS CSI Driver Service Account (IRSA)
module "ebs_csi_irsa_role" {
  source  = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
  version = "~> 5.0"

  role_name             = "${local.cluster_name}-ebs-csi-driver"
  attach_ebs_csi_policy = true

  oidc_providers = {
    main = {
      provider_arn               = module.eks.oidc_provider_arn
      namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
    }
  }
}
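
# The resulting role's trust policy only allows sts:AssumeRoleWithWebIdentity
# through the cluster's OIDC provider, scoped to the
# kube-system/ebs-csi-controller-sa service account, so only that pod identity
# can exercise the attached EBS CSI permissions.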

# EBS CSI Driver addon with IRSA support
# This must be created separately to avoid a circular dependency with the OIDC provider
resource "aws_eks_addon" "ebs_csi" {
  cluster_name             = module.eks.cluster_name
  addon_name               = "aws-ebs-csi-driver"
  addon_version            = data.aws_eks_addon_version.ebs_csi.version
  service_account_role_arn = module.ebs_csi_irsa_role.iam_role_arn

  depends_on = [
    module.ebs_csi_irsa_role,
    module.eks
  ]
}
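
# To check the rollout by hand (cluster name is per-workspace, see locals):
#   aws eks describe-addon --cluster-name signoz-<workspace> \
#     --addon-name aws-ebs-csi-driver --query 'addon.status'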

data "aws_eks_addon_version" "ebs_csi" {
  addon_name         = "aws-ebs-csi-driver"
  kubernetes_version = module.eks.cluster_version
  most_recent        = true
}
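
# Resolves the newest aws-ebs-csi-driver release published for the cluster's
# Kubernetes version; candidates can be listed manually with
# `aws eks describe-addon-versions --addon-name aws-ebs-csi-driver`.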

# Wait for the EBS CSI driver to be active
resource "time_sleep" "wait_for_ebs_csi" {
  depends_on = [aws_eks_addon.ebs_csi]

  create_duration = "60s"
}
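
# The fixed 60s buffer is a pragmatic guard: the addon can report ACTIVE
# slightly before its controller pods are serving, and the sleep keeps the
# storage class (and any PVCs the chart creates) from racing the driver.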

# Create gp3 storage class using the EBS CSI driver
resource "kubernetes_storage_class_v1" "gp3" {
  metadata {
    name = "gp3"
    annotations = {
      "storageclass.kubernetes.io/is-default-class" = "true"
    }
  }

  storage_provisioner    = "ebs.csi.aws.com"
  reclaim_policy         = "Delete"
  allow_volume_expansion = true
  volume_binding_mode    = "WaitForFirstConsumer"

  parameters = {
    type      = "gp3"
    encrypted = "true"
    fsType    = "ext4"
  }

  depends_on = [time_sleep.wait_for_ebs_csi]
}
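
# Marking gp3 as the default class means PVCs without an explicit
# storageClassName (e.g. from the SigNoz chart) bind to it, and
# WaitForFirstConsumer defers volume creation until a pod is scheduled, so each
# EBS volume is provisioned in that pod's availability zone.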

# SigNoz via Helm
resource "helm_release" "signoz" {
  name       = "signoz"
  repository = "https://charts.signoz.io"
  chart      = "signoz"
  namespace  = "signoz"
  timeout    = 3600 # 60 minutes - allows for worst-case AWS delays during both apply and destroy

  create_namespace = true
  wait             = true
  wait_for_jobs    = false

  # OTEL Collector configuration overrides for production stability
  values = [
    file("${path.module}/otel-collector-values.yaml")
  ]

  set {
    name  = "cloud"
    value = "false"
  }

  set {
    name  = "signoz.service.type"
    value = "LoadBalancer"
  }

  # The OTLP collector is still a LoadBalancer service, but the annotation
  # below makes it an internal load balancer: reachable inside the VPC, not
  # from the public internet.
  set {
    name  = "otelCollector.service.type"
    value = "LoadBalancer"
  }

  set {
    name  = "otelCollector.service.annotations.service\\.beta\\.kubernetes\\.io/aws-load-balancer-scheme"
    value = "internal"
  }
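
  # Note: the dots inside the annotation key are escaped (`\\.`) so Helm treats
  # the key as one literal segment instead of splitting it into nested values.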

  # Clickhouse storage configuration
  set {
    name  = "clickhouse.persistence.size"
    value = "200Gi"
  }

  set {
    name  = "clickhouse.persistence.storageClass"
    value = "gp3"
  }

  # Zookeeper storage configuration
  set {
    name  = "zookeeper.persistence.storageClass"
    value = "gp3"
  }

  # SigNoz (alertmanager) storage configuration
  set {
    name  = "alertmanager.persistence.storageClass"
    value = "gp3"
  }

  # ClickHouse resource configuration for loadtest
  # The default 100m CPU and 200Mi memory are far too low for high-volume
  # telemetry.
  set {
    name  = "clickhouse.resources.requests.cpu"
    value = "2000m"
  }

  set {
    name  = "clickhouse.resources.requests.memory"
    value = "4Gi"
  }

  set {
    name  = "clickhouse.resources.limits.cpu"
    value = "4000m"
  }

  set {
    name  = "clickhouse.resources.limits.memory"
    value = "8Gi"
  }

  # OTEL Collector resource configuration for loadtest
  set {
    name  = "otelCollector.resources.requests.memory"
    value = "8Gi"
  }

  set {
    name  = "otelCollector.resources.limits.memory"
    value = "12Gi"
  }

  set {
    name  = "otelCollector.resources.requests.cpu"
    value = "1000m"
  }

  set {
    name  = "otelCollector.resources.limits.cpu"
    value = "4000m"
  }

  # One replica is enough since there is a single LoadBalancer endpoint
  set {
    name  = "otelCollector.replicaCount"
    value = "1"
  }

  depends_on = [
    module.eks,
    kubernetes_storage_class_v1.gp3
  ]
}
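
# After apply, the exposed endpoints can be read off the LoadBalancer services
# (exact service names depend on the chart release):
#   kubectl get svc -n signoz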