mirror of
https://github.com/fleetdm/fleet
synced 2026-05-24 09:28:54 +00:00
<!-- Add the related story/sub-task/bug number, like Resolves #123, or remove if NA -->
**Related issue:** Resolves #43928

This PR adds a Redis-backed cache in front of the two host-by-key lookups on the agent auth paths.

Docs: https://github.com/fleetdm/fleet/pull/44504

## What changes

**Read path (osquery/orbit auth):**
- `LoadHostByNodeKey` and `LoadHostByOrbitNodeKey` now check Redis before falling through to MySQL.
- Successful lookups are cached for 60s ± 10% jitter (configurable via `FLEET_REDIS_HOST_CACHE_TTL`).
- `NotFound` results are cached for 5s as a negative entry, dampening repeated probes for keys that do not exist (deleted hosts whose agents are still polling, attacker scans, retry storms).
- Concurrent lookups for the same key collapse into one DB query via `singleflight`. The shared query runs under a context detached from any one caller's deadline so the leader giving up does not abort the work for joiners. The shared query is itself bounded by a 30s timeout so a wedged DB call cannot pin the singleflight slot indefinitely.

**Write path (invalidations):**
- These methods now invalidate the cache after a successful inner call: `UpdateHost`, `SerialUpdateHost`, `UpdateHostOsqueryIntervals`, `UpdateHostRefetchRequested`, `UpdateHostRefetchCriticalQueriesUntil`, `UpdateHostIdentityCertHostIDBySerial`, `EnrollOsquery`, `EnrollOrbit`, `NewHost`, `DeleteHost`, `DeleteHosts`, `CleanupExpiredHosts`, `CleanupIncomingHosts`, `AddHostsToTeam`.
- `AddHostsToTeam`, `DeleteHosts`, `CleanupExpiredHosts`, and `CleanupIncomingHosts` use a pipelined batch invalidator so 10k-host operations stay in the millisecond range instead of taking minutes of sequential round-trips.
- Inner-call errors are not invalidations: a failing write leaves cached state intact.

**Configuration:**
- New flags `FLEET_REDIS_HOST_CACHE_ENABLED` (default `true`) and `FLEET_REDIS_HOST_CACHE_TTL` (default `60s`).
- Server refuses to start if the cache is enabled with `TTL <= 0`.
**Observability:**
- Three new OTEL counters under the `fleet` meter:
  - `fleet.host_cache.lookups{result=hit|negative_hit|miss}`
  - `fleet.host_cache.errors{op=get|set|del}`
  - `fleet.host_cache.invalidations{reason=update|enroll|team|delete|cert}`
- A pre-built SigNoz dashboard ships in `tools/signoz/host_cache_dashboard.json`.

# Checklist for submitter

If some of the following don't apply, delete the relevant line.

- [x] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. See [Changes files](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/guides/committing-changes.md#changes-files) for more information.
- [x] Timeouts are implemented and retries are limited to avoid infinite loops

## Testing

- [x] Added/updated automated tests
- [x] Where appropriate, [automated tests simulate multiple hosts and test for host isolation](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/reference/patterns-backend.md#unit-testing) (updates to one host's records do not affect another)
- [x] QA'd all new/changed functionality manually

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

* **New Features**
  * Optional Redis-backed host lookup cache for osquery and orbit auth, with automatic invalidation and metrics/monitoring dashboard.
* **Bug Fixes**
  * Fixed host-removal batching so cache-related removals use correct chunks.
* **Tests**
  * Added comprehensive host-cache unit tests covering hits, negative cache, invalidation, concurrency, and JSON round-trips.
* **Chores**
  * New config flags to enable the cache and set TTL (default 60s ±10% jitter).

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
230 lines
6.7 KiB
Go
230 lines
6.7 KiB
Go
package mysqlredis
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
|
|
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
|
|
"github.com/fleetdm/fleet/v4/server/contexts/logging"
|
|
"github.com/fleetdm/fleet/v4/server/datastore/redis"
|
|
"github.com/fleetdm/fleet/v4/server/fleet"
|
|
redigo "github.com/gomodule/redigo/redis"
|
|
)
|
|
|
|
// enrolledHostsSetKey is the Redis key of the set that holds enrolled host
// IDs. The code in this file counts it with SCARD, updates it with SADD/SREM,
// and drops it with DEL.
const enrolledHostsSetKey = "enrolled_hosts:host_ids"
|
|
|
|
// redisSetMembersBatchSize is the maximum number of host IDs that addHosts and
// removeHosts pass to a single SADD/SREM command.
var redisSetMembersBatchSize = 10000 // var so it can be changed in tests
|
|
|
|
// SyncEnrolledHostIDs forces synchronization of host IDs between the DB and
|
|
// the Redis set. To optimize for the common case, it first checks if the
|
|
// counts are the same in the database and the redis set, and if so it does
|
|
// nothing else. Otherwise, it loads the current list of IDs from the database,
|
|
// clears the Redis set, and stores the IDs in the Redis set. This is called
|
|
// regularly (via a cron job) so that if the Redis set gets out of sync, it
|
|
// eventually fixes itself automatically.
|
|
func (d *Datastore) SyncEnrolledHostIDs(ctx context.Context) error {
|
|
if d.enforceHostLimit <= 0 {
|
|
// remove the enrolled hosts key, e.g. if the limit was enforced at some
|
|
// point and then disabled, so we reclaim the redis memory space.
|
|
conn := redis.ConfigureDoer(d.pool, d.pool.Get())
|
|
defer conn.Close()
|
|
if _, err := conn.Do("DEL", enrolledHostsSetKey); err != nil {
|
|
return ctxerr.Wrap(ctx, err, "delete enrolled hosts key")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
dbCount, err := d.CountEnrolledHosts(ctx)
|
|
if err != nil {
|
|
return ctxerr.Wrap(ctx, err, "count enrolled hosts from the database")
|
|
}
|
|
|
|
conn := redis.ConfigureDoer(d.pool, d.pool.Get())
|
|
defer conn.Close()
|
|
|
|
redisCount, err := redigo.Int(conn.Do("SCARD", enrolledHostsSetKey))
|
|
if err != nil {
|
|
return ctxerr.Wrap(ctx, err, "count enrolled hosts from redis")
|
|
}
|
|
|
|
if redisCount == dbCount {
|
|
return nil
|
|
}
|
|
|
|
// counts differ, replace the redis set with ids from the database
|
|
ids, err := d.EnrolledHostIDs(ctx)
|
|
if err != nil {
|
|
return ctxerr.Wrap(ctx, err, "get enrolled host IDs from the database")
|
|
}
|
|
|
|
if _, err := conn.Do("DEL", enrolledHostsSetKey); err != nil {
|
|
return ctxerr.Wrap(ctx, err, "clear redis enrolled hosts set")
|
|
}
|
|
|
|
// return the connection to the pool so it can be reused in addHosts
|
|
conn.Close()
|
|
|
|
if err := addHosts(ctx, d.pool, ids...); err != nil {
|
|
return ctxerr.Wrap(ctx, err, "add database host IDs to the redis set")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func addHosts(ctx context.Context, pool fleet.RedisPool, hostIDs ...uint) error {
|
|
conn := redis.ConfigureDoer(pool, pool.Get())
|
|
defer conn.Close()
|
|
|
|
for len(hostIDs) > 0 {
|
|
maxSize := len(hostIDs)
|
|
if maxSize > redisSetMembersBatchSize {
|
|
maxSize = redisSetMembersBatchSize
|
|
}
|
|
|
|
args := redigo.Args{enrolledHostsSetKey}
|
|
args = args.AddFlat(hostIDs[:maxSize])
|
|
if _, err := conn.Do("SADD", args...); err != nil {
|
|
return ctxerr.Wrap(ctx, err, "enrolled limits: add hosts")
|
|
}
|
|
hostIDs = hostIDs[maxSize:]
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func removeHosts(ctx context.Context, pool fleet.RedisPool, hostIDs ...uint) error {
|
|
conn := redis.ConfigureDoer(pool, pool.Get())
|
|
defer conn.Close()
|
|
|
|
for len(hostIDs) > 0 {
|
|
maxSize := len(hostIDs)
|
|
if maxSize > redisSetMembersBatchSize {
|
|
maxSize = redisSetMembersBatchSize
|
|
}
|
|
|
|
args := redigo.Args{enrolledHostsSetKey}
|
|
args = args.AddFlat(hostIDs[:maxSize])
|
|
if _, err := conn.Do("SREM", args...); err != nil {
|
|
return ctxerr.Wrap(ctx, err, "enrolled limits: remove hosts")
|
|
}
|
|
hostIDs = hostIDs[maxSize:]
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (d *Datastore) checkCanAddHost(ctx context.Context) (bool, error) {
|
|
conn := redis.ConfigureDoer(d.pool, d.pool.Get())
|
|
defer conn.Close()
|
|
|
|
n, err := redigo.Int(conn.Do("SCARD", enrolledHostsSetKey))
|
|
if err != nil {
|
|
return false, ctxerr.Wrap(ctx, err, "enrolled limits: check can add host")
|
|
}
|
|
if n >= d.enforceHostLimit {
|
|
return false, nil
|
|
}
|
|
return true, nil
|
|
}
|
|
|
|
func (d *Datastore) NewHost(ctx context.Context, host *fleet.Host) (*fleet.Host, error) {
|
|
h, err := d.Datastore.NewHost(ctx, host)
|
|
if err != nil {
|
|
return h, err
|
|
}
|
|
if d.enforceHostLimit > 0 {
|
|
if err := addHosts(ctx, d.pool, h.ID); err != nil {
|
|
logging.WithErr(ctx, err)
|
|
}
|
|
}
|
|
// A newly inserted host has no positive cache entry, but a stale negative
|
|
// cache entry for the new node_key could linger (up to hostCacheNegativeTTL)
|
|
// if the node_key had been probed moments before enrollment. Clearing it
|
|
// here ensures the next LoadHostByNodeKey populates the positive cache
|
|
// instead of returning a false NotFound.
|
|
d.invalidateAfterHostEnroll(ctx, h, "enroll")
|
|
return h, nil
|
|
}
|
|
|
|
func (d *Datastore) EnrollOsquery(ctx context.Context, opts ...fleet.DatastoreEnrollOsqueryOption) (*fleet.Host, error) {
|
|
h, err := d.Datastore.EnrollOsquery(ctx, opts...)
|
|
if err != nil {
|
|
return h, err
|
|
}
|
|
if d.enforceHostLimit > 0 {
|
|
if err := addHosts(ctx, d.pool, h.ID); err != nil {
|
|
logging.WithErr(ctx, err)
|
|
}
|
|
}
|
|
// EnrollOsquery can update an existing row's node_key + team_id on
|
|
// re-enrollment, so the cached snapshot is stale after the call.
|
|
d.invalidateAfterHostEnroll(ctx, h, "enroll")
|
|
return h, nil
|
|
}
|
|
|
|
func (d *Datastore) DeleteHost(ctx context.Context, hid uint) error {
|
|
err := d.Datastore.DeleteHost(ctx, hid)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if d.enforceHostLimit > 0 {
|
|
if err := removeHosts(ctx, d.pool, hid); err != nil {
|
|
logging.WithErr(ctx, err)
|
|
}
|
|
}
|
|
// Deleted row must not serve from cache: a stale hit would let the host
|
|
// authenticate after its deletion.
|
|
d.hostCacheDeleteByID(ctx, hid, "delete")
|
|
return nil
|
|
}
|
|
|
|
func (d *Datastore) DeleteHosts(ctx context.Context, ids []uint) error {
|
|
err := d.Datastore.DeleteHosts(ctx, ids)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if d.enforceHostLimit > 0 {
|
|
if err := removeHosts(ctx, d.pool, ids...); err != nil {
|
|
logging.WithErr(ctx, err)
|
|
}
|
|
}
|
|
// Batched pipelined invalidation — see invalidateHostIDs for why.
|
|
d.invalidateHostIDs(ctx, ids, "delete")
|
|
return nil
|
|
}
|
|
|
|
func (d *Datastore) CleanupExpiredHosts(ctx context.Context) ([]fleet.DeletedHostDetails, error) {
|
|
details, err := d.Datastore.CleanupExpiredHosts(ctx)
|
|
if err != nil {
|
|
return details, err
|
|
}
|
|
ids := make([]uint, len(details))
|
|
for i, detail := range details {
|
|
ids[i] = detail.ID
|
|
}
|
|
if d.enforceHostLimit > 0 {
|
|
if err := removeHosts(ctx, d.pool, ids...); err != nil {
|
|
logging.WithErr(ctx, err)
|
|
}
|
|
}
|
|
d.invalidateHostIDs(ctx, ids, "delete")
|
|
return details, nil
|
|
}
|
|
|
|
func (d *Datastore) CleanupIncomingHosts(ctx context.Context, now time.Time) ([]uint, error) {
|
|
ids, err := d.Datastore.CleanupIncomingHosts(ctx, now)
|
|
if err != nil {
|
|
return ids, err
|
|
}
|
|
if d.enforceHostLimit > 0 {
|
|
if err := removeHosts(ctx, d.pool, ids...); err != nil {
|
|
logging.WithErr(ctx, err)
|
|
}
|
|
}
|
|
d.invalidateHostIDs(ctx, ids, "delete")
|
|
return ids, nil
|
|
}
|
|
|
|
func (d *Datastore) CanEnrollNewHost(ctx context.Context) (bool, error) {
|
|
if d.enforceHostLimit > 0 {
|
|
return d.checkCanAddHost(ctx)
|
|
}
|
|
return true, nil
|
|
}
|