fix(deployment): use hostname for topologyKey instead of zone, update resources settings and staging configs (#7005)

2026-04-21 14:37:17 +00:00 · 2025-09-16 14:06:42 +03:00 · 2025-09-16 14:06:42 +03:00 · 7c063ca1b0
commit 7c063ca1b0
parent e3ee46649c
17 changed files with 71 additions and 44 deletions
--- a/deployment/services/app.ts
+++ b/deployment/services/app.ts
@ -46,7 +46,7 @@ export function deployApp({
    'app',
    {
      image,
-      replicas: environment.isProduction ? 3 : 1,
+      replicas: environment.podsConfig.general.replicas,
      imagePullSecret: docker.secret,
      readinessProbe: '/api/health',
      livenessProbe: '/api/health',
--- a/deployment/services/commerce.ts
+++ b/deployment/services/commerce.ts
@ -49,7 +49,7 @@ export function deployCommerce({
    {
      image,
      imagePullSecret: docker.secret,
-      replicas: environment.isProduction ? 3 : 1,
+      replicas: environment.podsConfig.general.replicas,
      readinessProbe: '/_readiness',
      livenessProbe: '/_health',
      startupProbe: '/_health',
--- a/deployment/services/emails.ts
+++ b/deployment/services/emails.ts
@ -59,7 +59,7 @@ export function deployEmails({
      startupProbe: '/_health',
      exposesMetrics: true,
      image,
-      replicas: environment.isProduction ? 3 : 1,
+      replicas: environment.podsConfig.general.replicas,
    },
    [redis.deployment, redis.service],
  )
--- a/deployment/services/environment.ts
+++ b/deployment/services/environment.ts
@ -23,6 +23,9 @@ export function prepareEnvironment(input: {

  const appDns = `app.${input.rootDns}`;
  const apiDns = `api.${input.rootDns}`;
+  const isProduction = env === 'production';
+  const isStaging = env === 'staging';
+  const isDev = env === 'dev';

  return {
    envVars: {
@ -33,14 +36,50 @@ export function prepareEnvironment(input: {
      RELEASE: input.release,
    },
    envName: env,
-    isProduction: env === 'production',
-    isStaging: env === 'staging',
-    isDev: env === 'dev',
+    isProduction,
+    isStaging,
+    isDev,
    encryptionSecret,
    release: input.release,
    appDns,
    apiDns,
    rootDns: input.rootDns,
+    podsConfig: {
+      general: {
+        replicas: isProduction ? 3 : isStaging ? 2 : 1,
+      },
+      supertokens: {
+        replicas: isProduction ? 3 : 1,
+      },
+      envoy: {
+        replicas: isProduction ? 3 : 1,
+        cpuLimit: isProduction ? '800m' : '150m',
+        memoryLimit: isProduction ? '1Gi' : '200Mi',
+      },
+      schemaService: {
+        memoryLimit: isProduction ? '2Gi' : '1Gi',
+      },
+      usageService: {
+        replicas: isProduction ? 3 : isStaging ? 2 : 1,
+        cpuLimit: isProduction ? '900m' : '300m',
+        maxReplicas: isProduction ? 6 : isStaging ? 3 : 1,
+        cpuAverageToScale: 60,
+      },
+      usageIngestorService: {
+        replicas: isProduction ? 6 : isStaging ? 2 : 1,
+        cpuLimit: isProduction ? '900m' : '300m',
+        maxReplicas: isProduction ? /* numberOfPartitions */ 16 : 2,
+        cpuAverageToScale: 60,
+      },
+      redis: {
+        memoryLimit: isProduction ? '4Gi' : '100Mi',
+        cpuLimit: isProduction ? '1000m' : '50m',
+      },
+      internalObservability: {
+        cpuLimit: isProduction ? '512m' : '150m',
+        memoryLimit: isProduction ? '1000Mi' : '300Mi',
+      },
+    },
  };
 }

--- a/deployment/services/graphql.ts
+++ b/deployment/services/graphql.ts
@ -107,7 +107,7 @@ export function deployGraphQL({
      {
        imagePullSecret: docker.secret,
        image,
-        replicas: environment.isProduction ? 3 : 1,
+        replicas: environment.podsConfig.general.replicas,
        pdb: true,
        readinessProbe: '/_readiness',
        livenessProbe: '/_health',
--- a/deployment/services/policy.ts
+++ b/deployment/services/policy.ts
@ -32,7 +32,7 @@ export function deploySchemaPolicy({
    livenessProbe: '/_health',
    startupProbe: '/_health',
    exposesMetrics: true,
-    replicas: environment.isProduction ? 3 : 1,
+    replicas: environment.podsConfig.general.replicas,
    pdb: true,
  })
    .withConditionalSecret(sentry.enabled, 'SENTRY_DSN', sentry.secret, 'dsn')
--- a/deployment/services/proxy.ts
+++ b/deployment/services/proxy.ts
@ -32,9 +32,9 @@ export function deployProxy({
  })
    .deployProxy({
      envoy: {
-        replicas: environment.isProduction ? 3 : 1,
-        cpu: environment.isProduction ? '800m' : '150m',
-        memory: environment.isProduction ? '800Mi' : '192Mi',
+        replicas: environment.podsConfig.envoy.replicas,
+        cpu: environment.podsConfig.envoy.cpuLimit,
+        memory: environment.podsConfig.envoy.memoryLimit,
      },
      tracing: observability.enabled
        ? { collectorService: observability.observability!.otlpCollectorService }
--- a/deployment/services/public-graphql-api-gateway.ts
+++ b/deployment/services/public-graphql-api-gateway.ts
@ -60,7 +60,7 @@ export function deployPublicGraphQLAPIGateway(args: {
    {
      imagePullSecret: args.docker.secret,
      image: dockerImage,
-      replicas: args.environment.isProduction ? 3 : 1,
+      replicas: args.environment.podsConfig.general.replicas,
      availabilityOnEveryNode: true,
      env: {
        GRAPHQL_SERVICE_ENDPOINT: serviceLocalEndpoint(args.graphql.service).apply(
--- a/deployment/services/redis.ts
+++ b/deployment/services/redis.ts
@ -19,15 +19,10 @@ export function deployRedis(input: { environment: Environment }) {
  const redisApi = new RedisStore({
    password: redisPassword,
  }).deploy({
-    limits: input.environment.isProduction
-      ? {
-          memory: '6Gi',
-          cpu: '1000m',
-        }
-      : {
-          memory: '100Mi',
-          cpu: '50m',
-        },
+    limits: {
+      memory: input.environment.podsConfig.redis.memoryLimit,
+      cpu: input.environment.podsConfig.redis.cpuLimit,
+    },
  });

  const host = serviceLocalHost(redisApi.service);
--- a/deployment/services/schema.ts
+++ b/deployment/services/schema.ts
@ -51,8 +51,8 @@ export function deploySchema({
      livenessProbe: '/_health',
      startupProbe: '/_health',
      exposesMetrics: true,
-      replicas: environment.isProduction ? 3 : 1,
-      memoryLimit: '2Gi',
+      replicas: environment.podsConfig.general.replicas,
+      memoryLimit: environment.podsConfig.schemaService.memoryLimit,
      pdb: true,
    },
    [redis.deployment, redis.service],
--- a/deployment/services/supertokens.ts
+++ b/deployment/services/supertokens.ts
@ -88,7 +88,7 @@ export function deploySuperTokens(
  const deployment = new kx.Deployment(
    'supertokens',
    {
-      spec: pb.asDeploymentSpec({ replicas: environment.isProduction ? 3 : 1 }),
+      spec: pb.asDeploymentSpec({ replicas: environment.podsConfig.supertokens.replicas }),
    },
    {
      dependsOn: resourceOptions.dependencies,
--- a/deployment/services/tokens.ts
+++ b/deployment/services/tokens.ts
@ -39,7 +39,7 @@ export function deployTokens({
      startupProbe: '/_health',
      exposesMetrics: true,
      availabilityOnEveryNode: true,
-      replicas: environment.isProduction ? 3 : 1,
+      replicas: environment.podsConfig.general.replicas,
      image,
      env: {
        ...environment.envVars,
--- a/deployment/services/usage-ingestor.ts
+++ b/deployment/services/usage-ingestor.ts
@ -29,10 +29,6 @@ export function deployUsageIngestor({
  sentry: Sentry;
 }) {
  const clickHouseConfig = new pulumi.Config('clickhouse');
-  const numberOfPartitions = 16;
-  const replicas = environment.isProduction ? 6 : 1;
-  const cpuLimit = environment.isProduction ? '600m' : '300m';
-  const maxReplicas = environment.isProduction ? numberOfPartitions : 2;

  // Require migrationV2DataIngestionStartDate only in production and staging
  // Remove it once we are done with migration.
@ -46,7 +42,7 @@ export function deployUsageIngestor({
    {
      image,
      imagePullSecret: docker.secret,
-      replicas,
+      replicas: environment.podsConfig.usageIngestorService.replicas,
      readinessProbe: '/_readiness',
      livenessProbe: '/_health',
      availabilityOnEveryNode: true,
@ -67,10 +63,10 @@ export function deployUsageIngestor({
      pdb: true,
      autoScaling: {
        cpu: {
-          cpuAverageToScale: 60,
-          limit: cpuLimit,
+          cpuAverageToScale: environment.podsConfig.usageIngestorService.cpuAverageToScale,
+          limit: environment.podsConfig.usageIngestorService.cpuLimit,
        },
-        maxReplicas,
+        maxReplicas: environment.podsConfig.usageIngestorService.maxReplicas,
      },
    },
    [clickhouse.deployment, clickhouse.service, dbMigrations].filter(Boolean),
--- a/deployment/services/usage.ts
+++ b/deployment/services/usage.ts
@ -38,9 +38,6 @@ export function deployUsage({
  docker: Docker;
  sentry: Sentry;
 }) {
-  const replicas = environment.isProduction ? 3 : 1;
-  const cpuLimit = environment.isProduction ? '600m' : '300m';
-  const maxReplicas = environment.isProduction ? 6 : 2;
  const kafkaBufferDynamic =
    kafka.config.bufferDynamic === 'true' || kafka.config.bufferDynamic === '1' ? '1' : '0';

@ -50,7 +47,7 @@ export function deployUsage({
      {
        image,
        imagePullSecret: docker.secret,
-        replicas,
+        replicas: environment.podsConfig.usageService.replicas,
        readinessProbe: {
          initialDelaySeconds: 10,
          periodSeconds: 5,
@ -85,10 +82,10 @@ export function deployUsage({
        pdb: true,
        autoScaling: {
          cpu: {
-            cpuAverageToScale: 60,
-            limit: cpuLimit,
+            cpuAverageToScale: environment.podsConfig.usageService.cpuAverageToScale,
+            limit: environment.podsConfig.usageService.cpuLimit,
          },
-          maxReplicas,
+          maxReplicas: environment.podsConfig.usageService.maxReplicas,
        },
      },
      [
--- a/deployment/services/webhooks.ts
+++ b/deployment/services/webhooks.ts
@ -45,7 +45,7 @@ export function deployWebhooks({
      livenessProbe: '/_health',
      startupProbe: '/_health',
      exposesMetrics: true,
-      replicas: environment.isProduction ? 3 : 1,
+      replicas: environment.podsConfig.general.replicas,
      image,
    },
    [redis.deployment, redis.service],
--- a/deployment/utils/observability.ts
+++ b/deployment/utils/observability.ts
@ -112,8 +112,8 @@ export class Observability {
      replicaCount: 1,
      resources: {
        limits: {
-          cpu: this.environment.isProduction ? '512m' : '150m',
-          memory: this.environment.isProduction ? '1000Mi' : '300Mi',
+          cpu: this.environment.podsConfig.internalObservability.cpuLimit,
+          memory: this.environment.podsConfig.internalObservability.memoryLimit,
        },
      },
      podAnnotations: {
--- a/deployment/utils/service-deployment.ts
+++ b/deployment/utils/service-deployment.ts
@ -192,7 +192,7 @@ export class ServiceDeployment {
      // and ensure that we are not exposed to downtime issues caused by node failures/restarts:
      topologySpreadConstraints.push({
        maxSkew: 1,
-        topologyKey: 'topology.kubernetes.io/zone',
+        topologyKey: 'kubernetes.io/hostname',
        whenUnsatisfiable: 'DoNotSchedule',
        labelSelector: {
          matchLabels: {