feat(appset): add pprof endpoints (#25044)

Signed-off-by: rumstead <37445536+rumstead@users.noreply.github.com>
This commit is contained in:
rumstead 2025-10-23 10:42:01 -04:00 committed by GitHub
parent 9c4579b229
commit 97d50a14a6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 345 additions and 113 deletions

View file

@ -14,6 +14,7 @@ import (
"github.com/argoproj/argo-cd/v3/reposerver/apiclient"
logutils "github.com/argoproj/argo-cd/v3/util/log"
"github.com/argoproj/argo-cd/v3/util/profile"
"github.com/argoproj/argo-cd/v3/util/tls"
"github.com/argoproj/argo-cd/v3/applicationset/controllers"
@ -170,6 +171,15 @@ func NewCommand() *cobra.Command {
log.Error(err, "unable to start manager")
os.Exit(1)
}
pprofMux := http.NewServeMux()
profile.RegisterProfiler(pprofMux)
// This looks a little strange. Eg, not using ctrl.Options PprofBindAddress and then adding the pprof mux
// to the metrics server. However, it allows for the controller to dynamically expose the pprof endpoints
// and use the existing metrics server, the same pattern that the application controller and api-server follow.
if err = mgr.AddMetricsServerExtraHandler("/debug/pprof/", pprofMux); err != nil {
log.Error(err, "failed to register pprof handlers")
}
dynamicClient, err := dynamic.NewForConfig(mgr.GetConfig())
errors.CheckError(err)
k8sClient, err := kubernetes.NewForConfig(mgr.GetConfig())

View file

@ -294,6 +294,8 @@ data:
applicationsetcontroller.enable.github.api.metrics: "false"
# The maximum number of resources stored in the status of an ApplicationSet. This is a safeguard to prevent the status from growing too large.
applicationsetcontroller.status.max.resources.count: "5000"
# Enables profile endpoint on the internal metrics port
applicationsetcontroller.profile.enabled: "false"
## Argo CD Notifications Controller Properties
# Set the logging level. One of: debug|info|warn|error (default "info")

View file

@ -1,8 +1,10 @@
# High Availability
Argo CD is largely stateless. All data is persisted as Kubernetes objects, which in turn is stored in Kubernetes' etcd. Redis is only used as a throw-away cache and can be lost. When lost, it will be rebuilt without loss of service.
Argo CD is largely stateless. All data is persisted as Kubernetes objects, which in turn is stored in Kubernetes' etcd.
Redis is only used as a throw-away cache and can be lost. When lost, it will be rebuilt without loss of service.
A set of [HA manifests](https://github.com/argoproj/argo-cd/tree/stable/manifests/ha) are provided for users who wish to run Argo CD in a highly available manner. This runs more containers, and runs Redis in HA mode.
A set of [HA manifests](https://github.com/argoproj/argo-cd/tree/stable/manifests/ha) are provided for users who wish to
run Argo CD in a highly available manner. This runs more containers, and runs Redis in HA mode.
> [!NOTE]
> The HA installation will require at least three different nodes due to pod anti-affinity rules in the
@ -14,29 +16,52 @@ A set of [HA manifests](https://github.com/argoproj/argo-cd/tree/stable/manifest
**settings:**
The `argocd-repo-server` is responsible for cloning the Git repository, keeping it up to date and generating manifests using the appropriate tool.
The `argocd-repo-server` is responsible for cloning the Git repository, keeping it up to date and generating manifests using
the appropriate tool.
* `argocd-repo-server` fork/exec config management tool to generate manifests. The fork can fail due to lack of memory or limit on the number of OS threads.
The `--parallelismlimit` flag controls how many manifest generations are running concurrently and helps avoid OOM kills.
* `argocd-repo-server` fork/exec config management tool to generate manifests. The fork can fail due to lack of memory
or limit on the number of OS threads.
The `--parallelismlimit` flag controls how many manifest generations are running concurrently and helps avoid OOM
kills.
* the `argocd-repo-server` ensures that repository is in the clean state during the manifest generation using config management tools such as Kustomize, Helm
or custom plugin. As a result Git repositories with multiple applications might affect repository server performance.
Read [Monorepo Scaling Considerations](#monorepo-scaling-considerations) for more information.
* the `argocd-repo-server` ensures that repository is in the clean state during the manifest generation using config
management tools such as Kustomize, Helm
or custom plugin. As a result Git repositories with multiple applications might affect repository server performance.
Read [Monorepo Scaling Considerations](#monorepo-scaling-considerations) for more information.
* `argocd-repo-server` clones the repository into `/tmp` (or the path specified in the `TMPDIR` env variable). The Pod might run out of disk space if it has too many repositories
or if the repositories have a lot of files. To avoid this problem mount a persistent volume.
* `argocd-repo-server` clones the repository into `/tmp` (or the path specified in the `TMPDIR` env variable). The Pod
might run out of disk space if it has too many repositories
or if the repositories have a lot of files. To avoid this problem mount a persistent volume.
* `argocd-repo-server` uses `git ls-remote` to resolve ambiguous revisions such as `HEAD`, a branch or a tag name. This operation happens frequently
and might fail. To avoid failed syncs use the `ARGOCD_GIT_ATTEMPTS_COUNT` environment variable to retry failed requests.
* `argocd-repo-server` uses `git ls-remote` to resolve ambiguous revisions such as `HEAD`, a branch or a tag name. This
operation happens frequently
and might fail. To avoid failed syncs use the `ARGOCD_GIT_ATTEMPTS_COUNT` environment variable to retry failed
requests.
* `argocd-repo-server` Every 3m (by default) Argo CD checks for changes to the app manifests. Argo CD assumes by default that manifests only change when the repo changes, so it caches the generated manifests (for 24h by default). With Kustomize remote bases, or in case a Helm chart gets changed without bumping its version number, the expected manifests can change even though the repo has not changed. By reducing the cache time, you can get the changes without waiting for 24h. Use `--repo-cache-expiration duration`, and we'd suggest in low volume environments you try `1h`. Bear in mind that this will negate the benefits of caching if set too low.
* `argocd-repo-server` Every 3m (by default) Argo CD checks for changes to the app manifests. Argo CD assumes by default
that manifests only change when the repo changes, so it caches the generated manifests (for 24h by default). With
Kustomize remote bases, or in case a Helm chart gets changed without bumping its version number, the expected
manifests can change even though the repo has not changed. By reducing the cache time, you can get the changes without
waiting for 24h. Use `--repo-cache-expiration duration`, and we'd suggest in low volume environments you try `1h`.
Bear in mind that this will negate the benefits of caching if set too low.
* `argocd-repo-server` executes config management tools such as `helm` or `kustomize` and enforces a 90 second timeout. This timeout can be changed by using the `ARGOCD_EXEC_TIMEOUT` env variable. The value should be in the Go time duration string format, for example, `2m30s`.
* `argocd-repo-server` executes config management tools such as `helm` or `kustomize` and enforces a 90 second timeout.
This timeout can be changed by using the `ARGOCD_EXEC_TIMEOUT` env variable. The value should be in the Go time
duration string format, for example, `2m30s`.
* `argocd-repo-server` will issue a `SIGTERM` signal to a command that has exceeded the `ARGOCD_EXEC_TIMEOUT`. In most cases, well-behaved commands will exit immediately when receiving the signal. However, if this does not happen, `argocd-repo-server` will wait an additional timeout of `ARGOCD_EXEC_FATAL_TIMEOUT` and then forcefully exit the command with a `SIGKILL` to prevent stalling. Note that a failure to exit with `SIGTERM` is usually a bug in either the offending command or in the way `argocd-repo-server` calls it and should be reported to the issue tracker for further investigation.
* `argocd-repo-server` will issue a `SIGTERM` signal to a command that has exceeded the `ARGOCD_EXEC_TIMEOUT`. In most
cases, well-behaved commands will exit immediately when receiving the signal. However, if this does not happen,
`argocd-repo-server` will wait an additional timeout of `ARGOCD_EXEC_FATAL_TIMEOUT` and then forcefully exit the
command with a `SIGKILL` to prevent stalling. Note that a failure to exit with `SIGTERM` is usually a bug in either
the offending command or in the way `argocd-repo-server` calls it and should be reported to the issue tracker for
further investigation.
* When using the `discovery` option in Config Management Plugins (CMP), `argocd-repo-server` copies the repository (or only the files specified via the `argocd.argoproj.io/manifest-generate-paths` annotation) into a separate directory for each plugin.
This can place a heavy load on disk resources for the **argocd-repo-server**, especially if the repository contains large files. To mitigate this, consider disabling `discovery` or using [Plugin tar stream exclusions](./config-management-plugins.md#plugin-tar-stream-exclusions).
* When using the `discovery` option in Config Management Plugins (CMP), `argocd-repo-server` copies the repository (or
only the files specified via the `argocd.argoproj.io/manifest-generate-paths` annotation) into a separate directory
for each plugin.
This can place a heavy load on disk resources for the **argocd-repo-server**, especially if the repository contains
large files. To mitigate this, consider disabling `discovery` or
using [Plugin tar stream exclusions](./config-management-plugins.md#plugin-tar-stream-exclusions).
**metrics:**
@ -44,34 +69,49 @@ This can place a heavy load on disk resources for a **argocd-repo-server**, espe
- `repo` - Git repo URL
- `request_type` - `ls-remote` or `fetch`.
* `ARGOCD_ENABLE_GRPC_TIME_HISTOGRAM` - Is an environment variable that enables collecting RPC performance metrics. Enable it if you need to troubleshoot performance issues. Note: This metric is expensive to both query and store!
* `ARGOCD_ENABLE_GRPC_TIME_HISTOGRAM` - Is an environment variable that enables collecting RPC performance metrics.
Enable it if you need to troubleshoot performance issues. Note: This metric is expensive to both query and store!
### argocd-application-controller
**settings:**
The `argocd-application-controller` uses `argocd-repo-server` to get generated manifests and Kubernetes API server to get the actual cluster state.
The `argocd-application-controller` uses `argocd-repo-server` to get generated manifests and Kubernetes API server to
get the actual cluster state.
* each controller replica uses two separate queues to process application reconciliation (milliseconds) and app syncing (seconds). The number of queue processors for each queue is controlled by
`--status-processors` (20 by default) and `--operation-processors` (10 by default) flags. Increase the number of processors if your Argo CD instance manages too many applications.
For 1000 applications we use 50 for `--status-processors` and 25 for `--operation-processors`
* each controller replica uses two separate queues to process application reconciliation (milliseconds) and app
syncing (seconds). The number of queue processors for each queue is controlled by
`--status-processors` (20 by default) and `--operation-processors` (10 by default) flags. Increase the number of
processors if your Argo CD instance manages too many applications.
For 1000 applications we use 50 for `--status-processors` and 25 for `--operation-processors`
* The manifest generation typically takes the most time during reconciliation. The duration of manifest generation is limited to make sure the controller refresh queue does not overflow.
The app reconciliation fails with `Context deadline exceeded` error if the manifest generation is taking too much time. As a workaround increase the value of `--repo-server-timeout-seconds` and
consider scaling up the `argocd-repo-server` deployment.
* The manifest generation typically takes the most time during reconciliation. The duration of manifest generation is
limited to make sure the controller refresh queue does not overflow.
The app reconciliation fails with `Context deadline exceeded` error if the manifest generation is taking too much
time. As a workaround increase the value of `--repo-server-timeout-seconds` and
consider scaling up the `argocd-repo-server` deployment.
* The controller uses Kubernetes watch APIs to maintain a lightweight Kubernetes cluster cache. This allows avoiding querying Kubernetes during app reconciliation and significantly improves
performance. For performance reasons the controller monitors and caches only the preferred versions of a resource. During reconciliation, the controller might have to convert cached resources from the
preferred version into a version of the resource stored in Git. If `kubectl convert` fails because the conversion is not supported then the controller falls back to Kubernetes API query which slows down
reconciliation. In this case, we advise to use the preferred resource version in Git.
* The controller uses Kubernetes watch APIs to maintain a lightweight Kubernetes cluster cache. This allows avoiding
querying Kubernetes during app reconciliation and significantly improves
performance. For performance reasons the controller monitors and caches only the preferred versions of a resource.
During reconciliation, the controller might have to convert cached resources from the
preferred version into a version of the resource stored in Git. If `kubectl convert` fails because the conversion is
not supported then the controller falls back to Kubernetes API query which slows down
reconciliation. In this case, we advise to use the preferred resource version in Git.
* The controller polls Git every 3m by default. You can change this duration using the `timeout.reconciliation` and `timeout.reconciliation.jitter` setting in the `argocd-cm` ConfigMap. The value of the fields is a [duration string](https://pkg.go.dev/time#ParseDuration) e.g. `60s`, `1m` or `1h`.
* The controller polls Git every 3m by default. You can change this duration using the `timeout.reconciliation` and
`timeout.reconciliation.jitter` setting in the `argocd-cm` ConfigMap. The value of the fields is
a [duration string](https://pkg.go.dev/time#ParseDuration) e.g. `60s`, `1m` or `1h`.
* If the controller is managing too many clusters and uses too much memory then you can shard clusters across multiple
controller replicas. To enable sharding, increase the number of replicas in `argocd-application-controller` `StatefulSet`
and repeat the number of replicas in the `ARGOCD_CONTROLLER_REPLICAS` environment variable. The strategic merge patch below demonstrates changes required to configure two controller replicas.
controller replicas. To enable sharding, increase the number of replicas in `argocd-application-controller`
`StatefulSet`
and repeat the number of replicas in the `ARGOCD_CONTROLLER_REPLICAS` environment variable. The strategic merge patch
below demonstrates changes required to configure two controller replicas.
* By default, the controller will update the cluster information every 10 seconds. If there is a problem with your cluster network environment that is causing the update time to take a long time, you can try modifying the environment variable `ARGO_CD_UPDATE_CLUSTER_INFO_TIMEOUT` to increase the timeout (the unit is seconds).
* By default, the controller will update the cluster information every 10 seconds. If there is a problem with your
cluster network environment that is causing the update time to take a long time, you can try modifying the environment
variable `ARGO_CD_UPDATE_CLUSTER_INFO_TIMEOUT` to increase the timeout (the unit is seconds).
```yaml
apiVersion: apps/v1
@ -83,27 +123,39 @@ spec:
template:
spec:
containers:
- name: argocd-application-controller
env:
- name: ARGOCD_CONTROLLER_REPLICAS
value: "2"
- name: argocd-application-controller
env:
- name: ARGOCD_CONTROLLER_REPLICAS
value: "2"
```
* In order to manually set the cluster's shard number, specify the optional `shard` property when creating a cluster. If not specified, it will be calculated on the fly by the application controller.
* The shard distribution algorithm of the `argocd-application-controller` can be set by using the `--sharding-method` parameter. Supported sharding methods are:
* In order to manually set the cluster's shard number, specify the optional `shard` property when creating a cluster. If
not specified, it will be calculated on the fly by the application controller.
* The shard distribution algorithm of the `argocd-application-controller` can be set by using the `--sharding-method`
parameter. Supported sharding methods are:
- `legacy` mode uses an `uid` based distribution (non-uniform).
- `round-robin` uses an equal distribution across all shards.
- `consistent-hashing` uses the consistent hashing with bounded loads algorithm which tends to equal distribution and also reduces cluster or application reshuffling in case of additions or removals of shards or clusters.
- `consistent-hashing` uses the consistent hashing with bounded loads algorithm which tends to equal distribution
and also reduces cluster or application reshuffling in case of additions or removals of shards or clusters.
The `--sharding-method` parameter can also be overridden by setting the key `controller.sharding.algorithm` in the `argocd-cmd-params-cm` `configMap` (preferably) or by setting the `ARGOCD_CONTROLLER_SHARDING_ALGORITHM` environment variable and by specifying the same possible values.
The `--sharding-method` parameter can also be overridden by setting the key `controller.sharding.algorithm` in the
`argocd-cmd-params-cm` `configMap` (preferably) or by setting the `ARGOCD_CONTROLLER_SHARDING_ALGORITHM` environment
variable and by specifying the same possible values.
> [!WARNING]
> **Alpha Features**
>
> The `round-robin` shard distribution algorithm is an experimental feature. Reshuffling is known to occur in certain scenarios with cluster removal. If the cluster at rank-0 is removed, reshuffling all clusters across shards will occur and may temporarily have negative performance impacts.
> The `consistent-hashing` shard distribution algorithm is an experimental feature. Extensive benchmarks have been documented on the [CNOE blog](https://cnoe.io/blog/argo-cd-application-scalability) with encouraging results. Community feedback is highly appreciated before moving this feature to a production ready state.
> The `round-robin` shard distribution algorithm is an experimental feature. Reshuffling is known to occur in certain
> scenarios with cluster removal. If the cluster at rank-0 is removed, reshuffling all clusters across shards will occur
> and may temporarily have negative performance impacts.
> The `consistent-hashing` shard distribution algorithm is an experimental feature. Extensive benchmarks have been
> documented on the [CNOE blog](https://cnoe.io/blog/argo-cd-application-scalability) with encouraging results.
> Community
> feedback is highly appreciated before moving this feature to a production ready state.
* A cluster can be manually assigned and forced to a `shard` by patching the `shard` field in the cluster secret to
contain the shard number, e.g.
* A cluster can be manually assigned and forced to a `shard` by patching the `shard` field in the cluster secret to contain the shard number, e.g.
```yaml
apiVersion: v1
kind: Secret
@ -126,7 +178,8 @@ stringData:
}
```
* `ARGOCD_ENABLE_GRPC_TIME_HISTOGRAM` - environment variable that enables collecting RPC performance metrics. Enable it if you need to troubleshoot performance issues. Note: This metric is expensive to both query and store!
* `ARGOCD_ENABLE_GRPC_TIME_HISTOGRAM` - environment variable that enables collecting RPC performance metrics. Enable it
if you need to troubleshoot performance issues. Note: This metric is expensive to both query and store!
* `ARGOCD_CLUSTER_CACHE_LIST_PAGE_BUFFER_SIZE` - environment variable controlling the number of pages the controller
buffers in memory when performing a list operation against the K8s api server while syncing the cluster cache. This
@ -141,27 +194,38 @@ stringData:
be buffered in memory -- no api server request will be blocked by processing.
* `ARGOCD_CLUSTER_CACHE_BATCH_EVENTS_PROCESSING` - environment variable that enables the controller to collect events
for Kubernetes resources and process them in a batch. This is useful when the cluster contains a large number of resources,
and the controller is overwhelmed by the number of events. The default value is `true`. `false` would mean that the controller
for Kubernetes resources and process them in a batch. This is useful when the cluster contains a large number of
resources,
and the controller is overwhelmed by the number of events. The default value is `true`. `false` would mean that the
controller
would process events one by one.
* `ARGOCD_CLUSTER_CACHE_EVENTS_PROCESSING_INTERVAL` - environment variable controlling the interval for processing events in a batch.
The valid value is in the format of Go time duration string, e.g. `1ms`, `1s`, `1m`, `1h`. The default value is `100ms`.
* `ARGOCD_CLUSTER_CACHE_EVENTS_PROCESSING_INTERVAL` - environment variable controlling the interval for processing
events in a batch.
The valid value is in the format of Go time duration string, e.g. `1ms`, `1s`, `1m`, `1h`. The default value is
`100ms`.
The variable is used only when `ARGOCD_CLUSTER_CACHE_BATCH_EVENTS_PROCESSING` is set to `true`.
* `ARGOCD_APPLICATION_TREE_SHARD_SIZE` - environment variable controlling the max number of resources stored in one Redis
key. Splitting application tree into multiple keys helps to reduce the amount of traffic between the controller and Redis.
The default value is 0, which means that the application tree is stored in a single Redis key. The reasonable value is 100.
* `ARGOCD_APPLICATION_TREE_SHARD_SIZE` - environment variable controlling the max number of resources stored in one
Redis
key. Splitting application tree into multiple keys helps to reduce the amount of traffic between the controller and
Redis.
The default value is 0, which means that the application tree is stored in a single Redis key. The reasonable value is
100.
**metrics**
* `argocd_app_reconcile` - reports application reconciliation duration in seconds. Can be used to build reconciliation duration heat map to get a high-level reconciliation performance picture.
* `argocd_app_k8s_request_total` - number of k8s requests per application. The number of fallback Kubernetes API queries - useful to identify which application has a resource with
non-preferred version and causes performance issues.
* `argocd_app_reconcile` - reports application reconciliation duration in seconds. Can be used to build reconciliation
duration heat map to get a high-level reconciliation performance picture.
* `argocd_app_k8s_request_total` - number of k8s requests per application. The number of fallback Kubernetes API
queries - useful to identify which application has a resource with
non-preferred version and causes performance issues.
### argocd-server
The `argocd-server` is stateless and probably the least likely to cause issues. To ensure there is no downtime during upgrades, consider increasing the number of replicas to `3` or more and repeat the number in the `ARGOCD_API_SERVER_REPLICAS` environment variable. The strategic merge patch below
The `argocd-server` is stateless and probably the least likely to cause issues. To ensure there is no downtime during
upgrades, consider increasing the number of replicas to `3` or more and repeat the number in the
`ARGOCD_API_SERVER_REPLICAS` environment variable. The strategic merge patch below
demonstrates this.
```yaml
@ -174,57 +238,82 @@ spec:
template:
spec:
containers:
- name: argocd-server
env:
- name: ARGOCD_API_SERVER_REPLICAS
value: "3"
- name: argocd-server
env:
- name: ARGOCD_API_SERVER_REPLICAS
value: "3"
```
**settings:**
* The `ARGOCD_API_SERVER_REPLICAS` environment variable is used to divide [the limit of concurrent login requests (`ARGOCD_MAX_CONCURRENT_LOGIN_REQUESTS_COUNT`)](./user-management/index.md#failed-logins-rate-limiting) between each replica.
* The `ARGOCD_GRPC_MAX_SIZE_MB` environment variable allows specifying the max size of the server response message in megabytes.
The default value is 200. You might need to increase this for an Argo CD instance that manages 3000+ applications.
* The `ARGOCD_API_SERVER_REPLICAS` environment variable is used to divide [the limit of concurrent login requests (
`ARGOCD_MAX_CONCURRENT_LOGIN_REQUESTS_COUNT`)](./user-management/index.md#failed-logins-rate-limiting) between each
replica.
* The `ARGOCD_GRPC_MAX_SIZE_MB` environment variable allows specifying the max size of the server response message in
megabytes.
The default value is 200. You might need to increase this for an Argo CD instance that manages 3000+ applications.
### argocd-dex-server, argocd-redis
The `argocd-dex-server` uses an in-memory database, and two or more instances would have inconsistent data. `argocd-redis` is pre-configured with the understanding of only three total redis servers/sentinels.
The `argocd-dex-server` uses an in-memory database, and two or more instances would have inconsistent data.
`argocd-redis` is pre-configured with the understanding of only three total redis servers/sentinels.
## Monorepo Scaling Considerations
Argo CD repo server maintains one repository clone locally and uses it for application manifest generation. If the manifest generation requires to change a file in the local repository clone then only one concurrent manifest generation per server instance is allowed. This limitation might significantly slow down Argo CD if you have a mono repository with multiple applications (50+).
Argo CD repo server maintains one repository clone locally and uses it for application manifest generation. If the
manifest generation requires to change a file in the local repository clone then only one concurrent manifest generation
per server instance is allowed. This limitation might significantly slow down Argo CD if you have a mono repository with
multiple applications (50+).
### Enable Concurrent Processing
Argo CD determines if manifest generation might change local files in the local repository clone based on the config management tool and application settings.
If the manifest generation has no side effects then requests are processed in parallel without a performance penalty. The following are known cases that might cause slowness and their workarounds:
Argo CD determines if manifest generation might change local files in the local repository clone based on the config
management tool and application settings.
If the manifest generation has no side effects then requests are processed in parallel without a performance penalty.
The following are known cases that might cause slowness and their workarounds:
* **Multiple Helm based applications pointing to the same directory in one Git repository:** for historical reasons Argo CD used to generate Helm manifests sequentially. Starting v3.0, Argo CD performs a parallel generation of Helm manifests by default.
* **Multiple Helm based applications pointing to the same directory in one Git repository:** for historical reasons Argo
CD used to generate Helm manifests sequentially. Starting v3.0, Argo CD performs a parallel generation of Helm
manifests by default.
* **Multiple Custom plugin based applications:** avoid creating temporary files during manifest generation and create `.argocd-allow-concurrency` file in the app directory, or use the sidecar plugin option, which processes each application using a temporary copy of the repository.
* **Multiple Kustomize applications in same repository with [parameter overrides](../user-guide/parameters.md):** sorry, no workaround for now.
* **Multiple Custom plugin based applications:** avoid creating temporary files during manifest generation and create
`.argocd-allow-concurrency` file in the app directory, or use the sidecar plugin option, which processes each
application using a temporary copy of the repository.
* **Multiple Kustomize applications in same repository with [parameter overrides](../user-guide/parameters.md):** sorry,
no workaround for now.
### Manifest Paths Annotation
Argo CD aggressively caches generated manifests and uses the repository commit SHA as a cache key. A new commit to the Git repository invalidates the cache for all applications configured in the repository.
This can negatively affect repositories with multiple applications. You can use the `argocd.argoproj.io/manifest-generate-paths` Application CRD annotation to solve this problem and improve performance.
Argo CD aggressively caches generated manifests and uses the repository commit SHA as a cache key. A new commit to the
Git repository invalidates the cache for all applications configured in the repository.
This can negatively affect repositories with multiple applications. You can use the
`argocd.argoproj.io/manifest-generate-paths` Application CRD annotation to solve this problem and improve performance.
Note: The `argocd.argoproj.io/manifest-generate-paths` annotation is available for use with webhooks. Since Argo CD v2.11, this annotation can also be used **without configuring any webhooks**. Webhooks are not a pre-condition for this feature. You can rely on the annotation alone to optimize manifest generation for all applications.
Note: The `argocd.argoproj.io/manifest-generate-paths` annotation is available for use with webhooks. Since Argo CD
v2.11, this annotation can also be used **without configuring any webhooks**. Webhooks are not a pre-condition for this
feature. You can rely on the annotation alone to optimize manifest generation for all applications.
The `argocd.argoproj.io/manifest-generate-paths` annotation contains a semicolon-separated list of paths within the Git repository that are used during manifest generation. It will use the paths specified in the annotation to compare the last cached revision to the latest commit. If no modified files match the paths specified in `argocd.argoproj.io/manifest-generate-paths`, then it will not trigger application reconciliation and the existing cache will be considered valid for the new commit.
The `argocd.argoproj.io/manifest-generate-paths` annotation contains a semicolon-separated list of paths within the Git
repository that are used during manifest generation. It will use the paths specified in the annotation to compare the
last cached revision to the latest commit. If no modified files match the paths specified in
`argocd.argoproj.io/manifest-generate-paths`, then it will not trigger application reconciliation and the existing cache
will be considered valid for the new commit.
Installations that use a different repository for each application are **not** subject to this behavior and will likely get no benefit from using these annotations.
Installations that use a different repository for each application are **not** subject to this behavior and will likely
get no benefit from using these annotations.
Similarly, applications referencing an external Helm values file will not get the benefits of this feature when an unrelated change happens in the external source.
Similarly, applications referencing an external Helm values file will not get the benefits of this feature when an
unrelated change happens in the external source.
For webhooks, the comparison is done using the files specified in the webhook event payload instead.
> [!NOTE]
> Application manifest paths annotation support for webhooks depends on the git provider used for the Application. It is currently only supported for GitHub, GitLab, and Gogs based repos.
> Application manifest paths annotation support for webhooks depends on the git provider used for the Application. It is
> currently only supported for GitHub, GitLab, and Gogs based repos.
* **Relative path** The annotation might contain a relative path. In this case the path is considered relative to the path specified in the application source:
* **Relative path** The annotation might contain a relative path. In this case the path is considered relative to the
path specified in the application source:
```yaml
apiVersion: argoproj.io/v1alpha1
@ -243,7 +332,8 @@ spec:
# ...
```
* **Absolute path** The annotation value might be an absolute path starting with '/'. In this case path is considered as an absolute path within the Git repository:
* **Absolute path** The annotation value might be an absolute path starting with '/'. In this case the path is considered
an absolute path within the Git repository:
```yaml
apiVersion: argoproj.io/v1alpha1
@ -260,7 +350,8 @@ spec:
# ...
```
* **Multiple paths** It is possible to put multiple paths into the annotation. Paths must be separated with a semicolon (`;`):
* **Multiple paths** It is possible to put multiple paths into the annotation. Paths must be separated with a
semicolon (`;`):
```yaml
apiVersion: argoproj.io/v1alpha1
@ -278,7 +369,8 @@ spec:
# ...
```
* **Glob paths** The annotation might contain a glob pattern path, which can be any pattern supported by the [Go filepath Match function](https://pkg.go.dev/path/filepath#Match):
* **Glob paths** The annotation might contain a glob pattern path, which can be any pattern supported by
the [Go filepath Match function](https://pkg.go.dev/path/filepath#Match):
```yaml
apiVersion: argoproj.io/v1alpha1
@ -298,14 +390,23 @@ spec:
```
> [!NOTE]
> If application manifest generation using the `argocd.argoproj.io/manifest-generate-paths` annotation feature is enabled, only the resources specified by this annotation will be sent to the CMP server for manifest generation, rather than the entire repository. To determine the appropriate resources, a common root path is calculated based on the paths provided in the annotation. The application path serves as the deepest path that can be selected as the root.
> If application manifest generation using the `argocd.argoproj.io/manifest-generate-paths` annotation feature is
> enabled, only the resources specified by this annotation will be sent to the CMP server for manifest generation,
> rather than the entire repository. To determine the appropriate resources, a common root path is calculated based on
> the paths provided in the annotation. The application path serves as the deepest path that can be selected as the
> root.
### Application Sync Timeout & Jitter
Argo CD has a timeout for application syncs. It will trigger a refresh for each application periodically when the timeout expires.
With a large number of applications, this will cause a spike in the refresh queue and can cause a spike to the repo-server component. To avoid this, you can set a jitter to the sync timeout which will spread out the refreshes and give time to the repo-server to catch up.
Argo CD has a timeout for application syncs. It will trigger a refresh for each application periodically when the
timeout expires.
With a large number of applications, this will cause a spike in the refresh queue and can cause a spike to the
repo-server component. To avoid this, you can set a jitter to the sync timeout which will spread out the refreshes and
give time to the repo-server to catch up.
The jitter is the maximum duration that can be added to the sync timeout, so if the sync timeout is 5 minutes and the jitter is 1 minute, then the actual timeout will be between 5 and 6 minutes.
The jitter is the maximum duration that can be added to the sync timeout, so if the sync timeout is 5 minutes and the
jitter is 1 minute, then the actual timeout will be between 5 and 6 minutes.
To configure the jitter you can set the following environment variables:
@ -313,38 +414,49 @@ To configure the jitter you can set the following environment variables:
## Rate Limiting Application Reconciliations
To prevent high controller resource usage or sync loops caused either due to misbehaving apps or other environment specific factors,
we can configure rate limits on the workqueues used by the application controller. There are two types of rate limits that can be configured:
To prevent high controller resource usage or sync loops caused either by misbehaving apps or by other
environment-specific factors, we can configure rate limits on the workqueues used by the application controller.
There are two types of rate limits that can be configured:
* Global rate limits
* Per item rate limits
* Global rate limits
* Per item rate limits
The final rate limiter uses a combination of both and calculates the final backoff as `max(globalBackoff, perItemBackoff)`.
The final rate limiter uses a combination of both and calculates the final backoff as
`max(globalBackoff, perItemBackoff)`.
### Global rate limits
This is disabled by default, it is a simple bucket based rate limiter that limits the number of items that can be queued per second.
This is disabled by default. It is a simple bucket-based rate limiter that limits the number of items that can be
queued per second.
This is useful to prevent a large number of apps from being queued at the same time.
To configure the bucket limiter you can set the following environment variables:
* `WORKQUEUE_BUCKET_SIZE` - The number of items that can be queued in a single burst. Defaults to 500.
* `WORKQUEUE_BUCKET_QPS` - The number of items that can be queued per second. Defaults to MaxFloat64, which disables the limiter.
* `WORKQUEUE_BUCKET_SIZE` - The number of items that can be queued in a single burst. Defaults to 500.
* `WORKQUEUE_BUCKET_QPS` - The number of items that can be queued per second. Defaults to MaxFloat64, which disables the
limiter.
### Per item rate limits
This by default returns a fixed base delay/backoff value but can be configured to return exponential values.
Per item rate limiter limits the number of times a particular item can be queued. This is based on exponential backoff where the backoff time for an item keeps increasing exponentially
if it is queued multiple times in a short period, but the backoff is reset automatically if a configured `cool down` period has elapsed since the last time the item was queued.
By default, this returns a fixed base delay/backoff value, but it can be configured to return exponential values.
The per-item rate limiter limits the number of times a particular item can be queued. This is based on exponential
backoff, where the backoff time for an item keeps increasing exponentially
if it is queued multiple times in a short period, but the backoff is reset automatically if a configured `cool down`
period has elapsed since the last time the item was queued.
To configure the per item limiter you can set the following environment variables:
* `WORKQUEUE_FAILURE_COOLDOWN_NS` : The cool down period in nanoseconds, once period has elapsed for an item the backoff is reset. Exponential backoff is disabled if set to 0(default), eg. values : 10 * 10^9 (=10s)
* `WORKQUEUE_BASE_DELAY_NS` : The base delay in nanoseconds, this is the initial backoff used in the exponential backoff formula. Defaults to 1000 (=1μs)
* `WORKQUEUE_MAX_DELAY_NS` : The max delay in nanoseconds, this is the max backoff limit. Defaults to 3 * 10^9 (=3s)
* `WORKQUEUE_BACKOFF_FACTOR` : The backoff factor, this is the factor by which the backoff is increased for each retry. Defaults to 1.5
* `WORKQUEUE_FAILURE_COOLDOWN_NS` : The cool down period in nanoseconds. Once this period has elapsed for an item, the
backoff is reset. Exponential backoff is disabled if set to 0 (default). Example value: 10 * 10^9 (=10s)
* `WORKQUEUE_BASE_DELAY_NS` : The base delay in nanoseconds, this is the initial backoff used in the exponential backoff
formula. Defaults to 1000 (=1μs)
* `WORKQUEUE_MAX_DELAY_NS` : The max delay in nanoseconds, this is the max backoff limit. Defaults to 3 * 10^9 (=3s)
* `WORKQUEUE_BACKOFF_FACTOR` : The backoff factor, this is the factor by which the backoff is increased for each retry.
Defaults to 1.5
The formula used to calculate the backoff time for an item, where `numRequeue` is the number of times the item has been queued
The formula used to calculate the backoff time for an item, where `numRequeue` is the number of times the item has been
queued
and `lastRequeueTime` is the time at which the item was last queued:
- When `WORKQUEUE_FAILURE_COOLDOWN_NS` != 0 :
@ -366,31 +478,38 @@ backoff = WORKQUEUE_BASE_DELAY_NS
## HTTP Request Retry Strategy
In scenarios where network instability or transient server errors occur, the retry strategy ensures the robustness of HTTP communication by automatically resending failed requests. It uses a combination of maximum retries and backoff intervals to prevent overwhelming the server or thrashing the network.
In scenarios where network instability or transient server errors occur, the retry strategy ensures the robustness of
HTTP communication by automatically resending failed requests. It uses a combination of maximum retries and backoff
intervals to prevent overwhelming the server or thrashing the network.
### Configuring Retries
The retry logic can be fine-tuned with the following environment variables:
* `ARGOCD_K8SCLIENT_RETRY_MAX` - The maximum number of retries for each request. The request will be dropped after this count is reached. Defaults to 0 (no retries).
* `ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF` - The initial backoff delay on the first retry attempt in ms. Subsequent retries will double this backoff time up to a maximum threshold. Defaults to 100ms.
* `ARGOCD_K8SCLIENT_RETRY_MAX` - The maximum number of retries for each request. The request will be dropped after this
count is reached. Defaults to 0 (no retries).
* `ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF` - The initial backoff delay on the first retry attempt in ms. Subsequent retries
will double this backoff time up to a maximum threshold. Defaults to 100ms.
### Backoff Strategy
The backoff strategy employed is a simple exponential backoff without jitter. The backoff time increases exponentially with each retry attempt until a maximum backoff duration is reached.
The backoff strategy employed is a simple exponential backoff without jitter. The backoff time increases exponentially
with each retry attempt until a maximum backoff duration is reached.
The formula for calculating the backoff time is:
```
backoff = min(retryWaitMax, baseRetryBackoff * (2 ^ retryAttempt))
```
Where `retryAttempt` starts at 0 and increments by 1 for each subsequent retry.
### Maximum Wait Time
There is a cap on the backoff time to prevent excessive wait times between retries. This cap is defined by:
* `retryWaitMax` - The maximum duration to wait before retrying. This ensures that retries happen within a reasonable timeframe. Defaults to 10 seconds.
* `retryWaitMax` - The maximum duration to wait before retrying. This ensures that retries happen within a reasonable
timeframe. Defaults to 10 seconds.
### Non-Retriable Conditions
@ -399,14 +518,16 @@ Not all HTTP responses are eligible for retries. The following conditions will n
* Responses with a status code indicating client errors (4xx) except for 429 Too Many Requests.
* Responses with the status code 501 Not Implemented.
## CPU/Memory Profiling
Argo CD optionally exposes a profiling endpoint that can be used to profile the CPU and memory usage of the Argo CD component.
The profiling endpoint is available on metrics port of each component. See [metrics](./metrics.md) for more information about the port.
For security reasons the profiling endpoint is disabled by default. The endpoint can be enabled by setting the `server.profile.enabled`
or `controller.profile.enabled` key of [argocd-cmd-params-cm](argocd-cmd-params-cm.yaml) ConfigMap to `true`.
Once the endpoint is enabled you can use go profile tool to collect the CPU and memory profiles. Example:
Argo CD optionally exposes a profiling endpoint that can be used to profile the CPU and memory usage of the Argo CD
component.
The profiling endpoint is available on the metrics port of each component. See [metrics](./metrics.md) for more
information about the port.
For security reasons, the profiling endpoint is disabled by default. The endpoint can be enabled by setting the
`server.profile.enabled`, `applicationsetcontroller.profile.enabled`, or `controller.profile.enabled` key
of the [argocd-cmd-params-cm](argocd-cmd-params-cm.yaml) ConfigMap to `true`.
Once the endpoint is enabled, you can use the Go pprof tool to collect the CPU and memory profiles. Example:
```bash
$ kubectl port-forward svc/argocd-metrics 8082:8082

View file

@ -206,6 +206,8 @@ spec:
name: tmp
- name: argocd-repo-server-tls
mountPath: /app/config/reposerver/tls
- name: argocd-cmd-params-cm
mountPath: /home/argocd/params
securityContext:
capabilities:
drop:
@ -241,5 +243,12 @@ spec:
path: tls.key
- key: ca.crt
path: ca.crt
- name: argocd-cmd-params-cm
configMap:
optional: true
name: argocd-cmd-params-cm
items:
- key: applicationsetcontroller.profile.enabled
path: profiler.enabled
nodeSelector:
kubernetes.io/os: linux

View file

@ -24880,6 +24880,8 @@ spec:
name: tmp
- mountPath: /app/config/reposerver/tls
name: argocd-repo-server-tls
- mountPath: /home/argocd/params
name: argocd-cmd-params-cm
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: argocd-applicationset-controller
@ -24908,6 +24910,13 @@ spec:
path: ca.crt
optional: true
secretName: argocd-repo-server-tls
- configMap:
items:
- key: applicationsetcontroller.profile.enabled
path: profiler.enabled
name: argocd-cmd-params-cm
optional: true
name: argocd-cmd-params-cm
---
apiVersion: apps/v1
kind: Deployment

View file

@ -24848,6 +24848,8 @@ spec:
name: tmp
- mountPath: /app/config/reposerver/tls
name: argocd-repo-server-tls
- mountPath: /home/argocd/params
name: argocd-cmd-params-cm
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: argocd-applicationset-controller
@ -24876,6 +24878,13 @@ spec:
path: ca.crt
optional: true
secretName: argocd-repo-server-tls
- configMap:
items:
- key: applicationsetcontroller.profile.enabled
path: profiler.enabled
name: argocd-cmd-params-cm
optional: true
name: argocd-cmd-params-cm
---
apiVersion: apps/v1
kind: Deployment

View file

@ -26246,6 +26246,8 @@ spec:
name: tmp
- mountPath: /app/config/reposerver/tls
name: argocd-repo-server-tls
- mountPath: /home/argocd/params
name: argocd-cmd-params-cm
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: argocd-applicationset-controller
@ -26274,6 +26276,13 @@ spec:
path: ca.crt
optional: true
secretName: argocd-repo-server-tls
- configMap:
items:
- key: applicationsetcontroller.profile.enabled
path: profiler.enabled
name: argocd-cmd-params-cm
optional: true
name: argocd-cmd-params-cm
---
apiVersion: apps/v1
kind: Deployment

View file

@ -26216,6 +26216,8 @@ spec:
name: tmp
- mountPath: /app/config/reposerver/tls
name: argocd-repo-server-tls
- mountPath: /home/argocd/params
name: argocd-cmd-params-cm
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: argocd-applicationset-controller
@ -26244,6 +26246,13 @@ spec:
path: ca.crt
optional: true
secretName: argocd-repo-server-tls
- configMap:
items:
- key: applicationsetcontroller.profile.enabled
path: profiler.enabled
name: argocd-cmd-params-cm
optional: true
name: argocd-cmd-params-cm
---
apiVersion: apps/v1
kind: Deployment

View file

@ -1927,6 +1927,8 @@ spec:
name: tmp
- mountPath: /app/config/reposerver/tls
name: argocd-repo-server-tls
- mountPath: /home/argocd/params
name: argocd-cmd-params-cm
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: argocd-applicationset-controller
@ -1955,6 +1957,13 @@ spec:
path: ca.crt
optional: true
secretName: argocd-repo-server-tls
- configMap:
items:
- key: applicationsetcontroller.profile.enabled
path: profiler.enabled
name: argocd-cmd-params-cm
optional: true
name: argocd-cmd-params-cm
---
apiVersion: apps/v1
kind: Deployment

View file

@ -1897,6 +1897,8 @@ spec:
name: tmp
- mountPath: /app/config/reposerver/tls
name: argocd-repo-server-tls
- mountPath: /home/argocd/params
name: argocd-cmd-params-cm
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: argocd-applicationset-controller
@ -1925,6 +1927,13 @@ spec:
path: ca.crt
optional: true
secretName: argocd-repo-server-tls
- configMap:
items:
- key: applicationsetcontroller.profile.enabled
path: profiler.enabled
name: argocd-cmd-params-cm
optional: true
name: argocd-cmd-params-cm
---
apiVersion: apps/v1
kind: Deployment

View file

@ -25324,6 +25324,8 @@ spec:
name: tmp
- mountPath: /app/config/reposerver/tls
name: argocd-repo-server-tls
- mountPath: /home/argocd/params
name: argocd-cmd-params-cm
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: argocd-applicationset-controller
@ -25352,6 +25354,13 @@ spec:
path: ca.crt
optional: true
secretName: argocd-repo-server-tls
- configMap:
items:
- key: applicationsetcontroller.profile.enabled
path: profiler.enabled
name: argocd-cmd-params-cm
optional: true
name: argocd-cmd-params-cm
---
apiVersion: apps/v1
kind: Deployment

View file

@ -25292,6 +25292,8 @@ spec:
name: tmp
- mountPath: /app/config/reposerver/tls
name: argocd-repo-server-tls
- mountPath: /home/argocd/params
name: argocd-cmd-params-cm
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: argocd-applicationset-controller
@ -25320,6 +25322,13 @@ spec:
path: ca.crt
optional: true
secretName: argocd-repo-server-tls
- configMap:
items:
- key: applicationsetcontroller.profile.enabled
path: profiler.enabled
name: argocd-cmd-params-cm
optional: true
name: argocd-cmd-params-cm
---
apiVersion: apps/v1
kind: Deployment

View file

@ -1005,6 +1005,8 @@ spec:
name: tmp
- mountPath: /app/config/reposerver/tls
name: argocd-repo-server-tls
- mountPath: /home/argocd/params
name: argocd-cmd-params-cm
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: argocd-applicationset-controller
@ -1033,6 +1035,13 @@ spec:
path: ca.crt
optional: true
secretName: argocd-repo-server-tls
- configMap:
items:
- key: applicationsetcontroller.profile.enabled
path: profiler.enabled
name: argocd-cmd-params-cm
optional: true
name: argocd-cmd-params-cm
---
apiVersion: apps/v1
kind: Deployment

View file

@ -973,6 +973,8 @@ spec:
name: tmp
- mountPath: /app/config/reposerver/tls
name: argocd-repo-server-tls
- mountPath: /home/argocd/params
name: argocd-cmd-params-cm
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: argocd-applicationset-controller
@ -1001,6 +1003,13 @@ spec:
path: ca.crt
optional: true
secretName: argocd-repo-server-tls
- configMap:
items:
- key: applicationsetcontroller.profile.enabled
path: profiler.enabled
name: argocd-cmd-params-cm
optional: true
name: argocd-cmd-params-cm
---
apiVersion: apps/v1
kind: Deployment