fleet/server/service/middleware/ratelimit/ratelimit.go
Lucas Manuel Rodriguez d67fd73611
New rate limit algorithm for Fleet Desktop endpoints (#33344)
Resolves #31890

This new approach allows up to 1000 consecutive failing requests per
minute.
If the threshold of 1000 consecutive failures is reached for an IP, then
we ban requests (return 429) from that IP for a duration of 1 minute.
(Any successful request for an IP clears the count.)

This supports the scenario where all hosts are behind a NAT (same IP)
AND still provides protection against brute force attacks (attackers can
only probe 1k requests per minute).

This approach was discussed in Slack with @rfairburn:
https://fleetdm.slack.com/archives/C051QJU3D0V/p1755625131298319?thread_ts=1755101701.844249&cid=C051QJU3D0V.

- [X] Changes file added for user-visible changes in `changes/`,
`orbit/changes/` or `ee/fleetd-chrome/changes`.
See [Changes
files](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/guides/committing-changes.md#changes-files)
for more information.

## Testing

- [X] Added/updated automated tests
- [X] Where appropriate, [automated tests simulate multiple hosts and
test for host
isolation](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/reference/patterns-backend.md#unit-testing)
(updates to one host's records do not affect another)

- [X] QA'd all new/changed functionality manually

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- New Features
- Introduced IP-based rate limiting for Fleet Desktop endpoints to
better support many hosts behind a single public IP (NAT). Requests from
abusive IPs may be temporarily blocked, returning 429 Too Many Requests
with a retry-after hint.
- Documentation
- Added README for a new desktop rate-limit tester, describing usage and
expected behavior.
- Tests
- Added integration tests covering desktop endpoint rate limiting and
Redis-backed banning logic.
- Chores
- Added a command-line tool to stress-test desktop endpoints and verify
rate limiting behavior.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-09-26 15:03:50 -03:00

165 lines
4.5 KiB
Go

package ratelimit
import (
"context"
"fmt"
"net/http"
authz_ctx "github.com/fleetdm/fleet/v4/server/contexts/authz"
"github.com/fleetdm/fleet/v4/server/contexts/ctxerr"
"github.com/fleetdm/fleet/v4/server/contexts/publicip"
"github.com/go-kit/kit/endpoint"
kitlog "github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/throttled/throttled/v2"
)
// Middleware is a rate limiting middleware using the provided store.
//
// Every endpoint wrapped through Limit gets its own limiter, built from the
// quota passed to Limit on top of this shared store. Requests are counted in
// the store under the key name given to Limit.
type Middleware struct {
// store is the backing store handed to every limiter created by Limit.
store throttled.GCRAStore
}
// NewMiddleware builds a Middleware on top of the given store.
//
// Passing a nil store is treated as a programming error: the function panics
// rather than returning an unusable Middleware.
func NewMiddleware(store throttled.GCRAStore) *Middleware {
	if store == nil {
		panic("nil store")
	}
	mw := new(Middleware)
	mw.store = store
	return mw
}
// Limit returns an endpoint middleware that enforces quota for requests
// counted under keyName.
//
// A limiter is created each time the returned middleware wraps an endpoint;
// failing to create it panics. Every request handled by the wrapped endpoint
// consumes one unit under keyName. When the limiter reports an error, or the
// request is over the limit, the next endpoint is not called and an error is
// returned instead (a rate limit error carrying the limiter result in the
// latter case).
func (m *Middleware) Limit(keyName string, quota throttled.RateQuota) endpoint.Middleware {
	// markAuthzChecked flags the authorization context (when present) as
	// checked. Without it, rejecting the request here ends up as an HTTP 500.
	markAuthzChecked := func(ctx context.Context) {
		if az, ok := authz_ctx.FromContext(ctx); ok {
			az.SetChecked()
		}
	}

	return func(next endpoint.Endpoint) endpoint.Endpoint {
		gcra, initErr := throttled.NewGCRARateLimiter(m.store, quota)
		if initErr != nil {
			panic(initErr)
		}

		return func(ctx context.Context, req interface{}) (response interface{}, err error) {
			limited, result, limitErr := gcra.RateLimit(keyName, 1)
			switch {
			case limitErr != nil:
				// This can happen if the limit store (e.g. Redis) is unavailable.
				markAuthzChecked(ctx)
				return nil, ctxerr.Wrap(ctx, limitErr, "rate limit Middleware: failed to increase rate limit")
			case limited:
				markAuthzChecked(ctx)
				return nil, ctxerr.Wrap(ctx, &rateLimitError{result: result})
			}
			return next(ctx, req)
		}
	}
}
// ErrorMiddleware is a rate limiter that performs limits only when there is an error in the request.
// It relies on an IPBanner to decide whether a request's IP is banned and to
// record the outcome of every request it lets through.
type ErrorMiddleware struct {
// ipBanner is consulted before each request and updated after it.
ipBanner IPBanner
}
// IPBanner is an interface to perform rate limiting based on the request's IP.
// ErrorMiddleware calls CheckBanned before running a request and RunRequest
// once the request has finished.
type IPBanner interface {
// CheckBanned returns true if the IP is currently banned.
// ErrorMiddleware rejects the request when this returns a non-nil error.
CheckBanned(ip string) (bool, error)
// RunRequest will update the status of the given IP with the result of a request.
// ErrorMiddleware passes success=true when the wrapped endpoint returned a nil error.
RunRequest(ip string, success bool) error
}
// NewErrorMiddleware builds an ErrorMiddleware that uses the given IPBanner.
//
// Passing a nil IPBanner is treated as a programming error and panics.
func NewErrorMiddleware(ipBanner IPBanner) *ErrorMiddleware {
	if ipBanner == nil {
		panic("internal error: nil IP banner")
	}
	em := new(ErrorMiddleware)
	em.ipBanner = ipBanner
	return em
}
// Limit returns an endpoint middleware that rejects requests coming from
// banned IPs and reports the outcome of every other request to the IPBanner.
//
// The IP is read from the request context. If the ban check fails, or the IP
// is banned, the next endpoint is not called and an error is returned. If the
// request runs, its result is passed to the IPBanner (success means the
// endpoint returned a nil error). A failure to record that result is only
// logged, and the endpoint's own response and error are returned unchanged.
func (m *ErrorMiddleware) Limit(logger kitlog.Logger) endpoint.Middleware {
	// markAuthzChecked flags the authorization context (when present) as
	// checked. Without it, rejecting the request here ends up as an HTTP 500.
	markAuthzChecked := func(ctx context.Context) {
		if az, ok := authz_ctx.FromContext(ctx); ok {
			az.SetChecked()
		}
	}

	return func(next endpoint.Endpoint) endpoint.Endpoint {
		return func(ctx context.Context, req interface{}) (response interface{}, err error) {
			// Requests with empty public IP will fall under the same bucket.
			ip := publicip.FromContext(ctx)

			isBanned, checkErr := m.ipBanner.CheckBanned(ip)
			if checkErr != nil {
				// This can happen if the limit store (e.g. Redis) is unavailable.
				markAuthzChecked(ctx)
				return nil, ctxerr.Wrap(ctx, checkErr, "rate limit ErrorMiddleware: failed to check rate limit")
			}

			if isBanned {
				markAuthzChecked(ctx)
				level.Warn(logger).Log("ip", ip, "msg", "limit exceeded")
				return nil, ctxerr.Wrap(ctx, &rateLimitError{})
			}

			response, err = next(ctx, req)

			succeeded := err == nil
			if recordErr := m.ipBanner.RunRequest(ip, succeeded); recordErr != nil {
				level.Warn(logger).Log("ip", ip, "msg", "fail to run request on IP banner", "err", recordErr)
			}
			return response, err
		}
	}
}
// Error is the interface for rate limiting errors.
type Error interface {
error
// Result returns the limiter result attached to the error. It can be the
// zero value: ErrorMiddleware creates its errors without a result.
Result() throttled.RateLimitResult
}
// rateLimitError is the error returned when a request is rejected by one of
// the rate limiting middlewares in this package.
type rateLimitError struct {
	// result is the limiter outcome for the rejected request. It is the zero
	// value when the error is created without one.
	result throttled.RateLimitResult
}

// Error implements the error interface. The message includes the retry delay
// in whole seconds when that delay is positive.
func (r rateLimitError) Error() string {
	// Reuse RetryAfter so the message and the value exposed to callers agree.
	if ra := r.RetryAfter(); ra > 0 {
		return fmt.Sprintf("limit exceeded, retry after: %ds", ra)
	}
	return "limit exceeded"
}

// StatusCode returns the HTTP status code for this error, 429 Too Many Requests.
func (r rateLimitError) StatusCode() int {
	return http.StatusTooManyRequests
}

// RetryAfter returns the number of whole seconds to wait before retrying,
// rounded up. Rounding up prevents a sub-second delay from being reported as
// 0 (which would drop the hint entirely) and never advertises a delay shorter
// than the real one. Zero or negative durations yield a non-positive value.
func (r rateLimitError) RetryAfter() int {
	secs := r.result.RetryAfter.Seconds()
	whole := int(secs) // truncates toward zero
	if secs > float64(whole) {
		// A positive fractional part was dropped by the truncation: round up.
		whole++
	}
	return whole
}

// Result returns the limiter result attached to the error.
func (r rateLimitError) Result() throttled.RateLimitResult {
	return r.result
}