mirror of
https://github.com/fleetdm/fleet
synced 2026-05-23 08:58:41 +00:00
<!-- Add the related story/sub-task/bug number, like Resolves #123, or remove if NA --> **Related issue:** Resolves #35603 # Details This PR aims to optimize the system for recording scheduled query results in the database. Previously, each time a result set was received from a host, the Fleet server would count all of the current result rows in the db for that query before deciding whether to save more. This count becomes more expensive as the DB size grows, until it becomes the "long" pole in the recording process. With this PR, the system changes in the following ways: * When result rows are received from the host, no count is immediately taken. Instead, a Redis key is checked which holds a current approximate count of rows in the table. If the count is over the configured row limit, no rows are saved. Otherwise, rows are saved and the count is adjusted accordingly (it can go down, e.g. if a host previously returned 5 rows for a query and now returns 3). Keep in mind that we only store one set of results per host for a scheduled query; when a host reports results for a query, we delete that host's previous results and write the new ones if there's room. * As an additional failsafe against runaway queries, if a result set contains more than 1000 rows, it is rejected. * Once a minute, a cron job runs which deletes all rows over the limit for each query and resets the counter for all queries to the actual # of rows in the table. The end result is: * No more expensive counts on every distributed write request for scheduled queries * Results for a single query can burst to over the limit for a short time, but will get cleaned up after a minute * Because of concurrency and race issues where multiple hosts might get the same count from Redis before inserting rows, the actual # of results in the db can burst higher than the limit. 
In testing w/ osquery-perf with 1000 hosts started simultaneously, sending 500 rows at a time, a 50,000 row limit and a query running every 10 seconds, I saw the table get up to 60,000 rows at times before being cleaned up. This is a very bad case; in the real world we'd have a lot more jitter in the reporting, and queries would not typically return this many rows. # Checklist for submitter If some of the following don't apply, delete the relevant line. - [X] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. See [Changes files](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/guides/committing-changes.md#changes-files) for more information. - [X] Input data is properly validated, `SELECT *` is avoided, SQL injection is prevented (using placeholders for values in statements) ## Testing - [X] Added/updated automated tests Added a new test to verify that results are still discarded if table size is > limit, updated existing tests. - [X] Where appropriate, [automated tests simulate multiple hosts and test for host isolation](https://github.com/fleetdm/fleet/blob/main/docs/Contributing/reference/patterns-backend.md#unit-testing) (updates to one host's records do not affect another) - [X] QA'd all new/changed functionality manually Ran osquery-perf with 1000 hosts and a 50,000 row limit per query, using queries that returned 1, 500 and 1000 rows at a time. Verified that the limits were respected (subject to the amount of flex discussed above). I'm doing some A/B tests now using local MySQL metrics and will report back. 
<!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Automated periodic cleanup of excess query results to retain recent data and free storage * Redis-backed query result counting to track per-query result volumes * **Performance Improvements** * Optimized recording of scheduled query results for reduced overhead * Cleanup runs in configurable batches to lower database contention and balance storage use <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> <!-- end of auto-generated comment: release notes by coderabbit.ai -->
339 lines
12 KiB
Go
339 lines
12 KiB
Go
package service
|
|
|
|
import (
|
|
"context"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/fleetdm/fleet/v4/server/contexts/viewer"
|
|
"github.com/fleetdm/fleet/v4/server/fleet"
|
|
"github.com/fleetdm/fleet/v4/server/mock"
|
|
"github.com/fleetdm/fleet/v4/server/ptr"
|
|
"github.com/fleetdm/fleet/v4/server/pubsub"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// nopLiveQuery is a no-op live query store for the tests in this file. Every
// method ignores its arguments and succeeds without recording anything.
type nopLiveQuery struct{}

// RunQuery ignores the query and its target hosts and reports success.
func (nopLiveQuery) RunQuery(name, sql string, hostIDs []uint) error {
	return nil
}

// StopQuery ignores the query name and reports success.
func (nopLiveQuery) StopQuery(name string) error {
	return nil
}

// QueriesForHost always returns an empty, non-nil map and no error.
func (nopLiveQuery) QueriesForHost(hostID uint) (map[string]string, error) {
	return map[string]string{}, nil
}

// QueryCompletedByHost ignores its arguments and reports success.
func (nopLiveQuery) QueryCompletedByHost(name string, hostID uint) error {
	return nil
}

// CleanupInactiveQueries ignores the context and campaign IDs and reports
// success.
func (nopLiveQuery) CleanupInactiveQueries(ctx context.Context, inactiveCampaignIDs []uint) error {
	return nil
}

// LoadActiveQueryNames always returns a nil slice and no error.
func (nopLiveQuery) LoadActiveQueryNames() ([]string, error) {
	return nil, nil
}

// GetQueryResultsCounts ignores the requested IDs and returns an empty,
// non-nil map and no error.
func (nopLiveQuery) GetQueryResultsCounts([]uint) (map[uint]int, error) {
	return make(map[uint]int), nil
}

// IncrQueryResultsCounts ignores the supplied increments and reports success.
func (nopLiveQuery) IncrQueryResultsCounts(map[uint]int) error {
	return nil
}

// SetQueryResultsCount ignores its arguments and reports success.
func (nopLiveQuery) SetQueryResultsCount(uint, int) error {
	return nil
}

// DeleteQueryResultsCount ignores its argument and reports success.
func (nopLiveQuery) DeleteQueryResultsCount(uint) error {
	return nil
}
|
|
|
|
func (q nopLiveQuery) LiveQueryStore() fleet.LiveQueryStore {
|
|
return q
|
|
}
|
|
|
|
func TestLiveQueryAuth(t *testing.T) {
|
|
ds := new(mock.Store)
|
|
qr := pubsub.NewInmemQueryResults()
|
|
svc, ctx := newTestService(t, ds, qr, nopLiveQuery{})
|
|
|
|
teamMaintainer := &fleet.User{ID: 42, Teams: []fleet.UserTeam{{Team: fleet.Team{ID: 1}, Role: fleet.RoleMaintainer}}}
|
|
query1ObsCanRun := &fleet.Query{
|
|
ID: 1,
|
|
AuthorID: ptr.Uint(teamMaintainer.ID),
|
|
Name: "q1",
|
|
Query: "SELECT 1",
|
|
ObserverCanRun: true,
|
|
}
|
|
query2ObsCannotRun := &fleet.Query{
|
|
ID: 2,
|
|
AuthorID: ptr.Uint(teamMaintainer.ID),
|
|
Name: "q2",
|
|
Query: "SELECT 2",
|
|
ObserverCanRun: false,
|
|
}
|
|
|
|
var lastCreatedQuery *fleet.Query
|
|
ds.NewQueryFunc = func(ctx context.Context, query *fleet.Query, opts ...fleet.OptionalArg) (*fleet.Query, error) {
|
|
q := *query
|
|
vw, ok := viewer.FromContext(ctx)
|
|
q.ID = 123
|
|
if ok {
|
|
q.AuthorID = ptr.Uint(vw.User.ID)
|
|
}
|
|
lastCreatedQuery = &q
|
|
return &q, nil
|
|
}
|
|
ds.AppConfigFunc = func(ctx context.Context) (*fleet.AppConfig, error) {
|
|
return &fleet.AppConfig{ServerSettings: fleet.ServerSettings{LiveQueryDisabled: false}}, nil
|
|
}
|
|
ds.NewDistributedQueryCampaignFunc = func(ctx context.Context, camp *fleet.DistributedQueryCampaign) (*fleet.DistributedQueryCampaign, error) {
|
|
return camp, nil
|
|
}
|
|
ds.NewDistributedQueryCampaignTargetFunc = func(ctx context.Context, target *fleet.DistributedQueryCampaignTarget) (*fleet.DistributedQueryCampaignTarget, error) {
|
|
return target, nil
|
|
}
|
|
ds.HostIDsInTargetsFunc = func(ctx context.Context, filters fleet.TeamFilter, targets fleet.HostTargets) ([]uint, error) {
|
|
return []uint{1}, nil
|
|
}
|
|
ds.HostIDsByIdentifierFunc = func(ctx context.Context, filter fleet.TeamFilter, identifiers []string) ([]uint, error) {
|
|
return nil, nil
|
|
}
|
|
ds.LabelIDsByNameFunc = func(ctx context.Context, names []string, filter fleet.TeamFilter) (map[string]uint, error) {
|
|
return nil, nil
|
|
}
|
|
ds.CountHostsInTargetsFunc = func(ctx context.Context, filters fleet.TeamFilter, targets fleet.HostTargets, now time.Time) (fleet.TargetMetrics, error) {
|
|
return fleet.TargetMetrics{}, nil
|
|
}
|
|
ds.QueryFunc = func(ctx context.Context, id uint) (*fleet.Query, error) {
|
|
if id == 1 {
|
|
return query1ObsCanRun, nil
|
|
}
|
|
if id == 2 {
|
|
return query2ObsCannotRun, nil
|
|
}
|
|
if lastCreatedQuery != nil {
|
|
q := lastCreatedQuery
|
|
lastCreatedQuery = nil
|
|
return q, nil
|
|
}
|
|
return &fleet.Query{ID: 8888, AuthorID: ptr.Uint(6666)}, nil
|
|
}
|
|
|
|
testCases := []struct {
|
|
name string
|
|
user *fleet.User
|
|
teamID *uint // to use as host target
|
|
shouldFailRunNew bool
|
|
shouldFailRunObsCan bool
|
|
shouldFailRunObsCannot bool
|
|
}{
|
|
{
|
|
name: "global admin",
|
|
user: &fleet.User{GlobalRole: ptr.String(fleet.RoleAdmin)},
|
|
teamID: nil,
|
|
shouldFailRunNew: false,
|
|
shouldFailRunObsCan: false,
|
|
shouldFailRunObsCannot: false,
|
|
},
|
|
{
|
|
name: "global maintainer",
|
|
user: &fleet.User{GlobalRole: ptr.String(fleet.RoleMaintainer)},
|
|
teamID: nil,
|
|
shouldFailRunNew: false,
|
|
shouldFailRunObsCan: false,
|
|
shouldFailRunObsCannot: false,
|
|
},
|
|
{
|
|
name: "global observer",
|
|
user: &fleet.User{GlobalRole: ptr.String(fleet.RoleObserver)},
|
|
teamID: nil,
|
|
shouldFailRunNew: true,
|
|
shouldFailRunObsCan: false,
|
|
shouldFailRunObsCannot: true,
|
|
},
|
|
{
|
|
name: "team maintainer",
|
|
user: teamMaintainer,
|
|
teamID: nil,
|
|
shouldFailRunNew: false,
|
|
shouldFailRunObsCan: false,
|
|
shouldFailRunObsCannot: false,
|
|
},
|
|
{
|
|
name: "team admin, no team target",
|
|
user: &fleet.User{Teams: []fleet.UserTeam{{Team: fleet.Team{ID: 1}, Role: fleet.RoleAdmin}}},
|
|
teamID: nil,
|
|
shouldFailRunNew: false,
|
|
shouldFailRunObsCan: false,
|
|
shouldFailRunObsCannot: false,
|
|
},
|
|
{
|
|
name: "team admin, target not set to own team",
|
|
user: &fleet.User{Teams: []fleet.UserTeam{{Team: fleet.Team{ID: 1}, Role: fleet.RoleAdmin}}},
|
|
teamID: ptr.Uint(2),
|
|
shouldFailRunNew: false,
|
|
shouldFailRunObsCan: true, // fails observer can run, as they are not part of that team, even as observer
|
|
shouldFailRunObsCannot: true,
|
|
},
|
|
{
|
|
name: "team admin, target set to own team",
|
|
user: &fleet.User{Teams: []fleet.UserTeam{{Team: fleet.Team{ID: 1}, Role: fleet.RoleAdmin}}},
|
|
teamID: ptr.Uint(1),
|
|
shouldFailRunNew: false,
|
|
shouldFailRunObsCan: false,
|
|
shouldFailRunObsCannot: false,
|
|
},
|
|
{
|
|
name: "team observer, no team target",
|
|
user: &fleet.User{ID: 48, Teams: []fleet.UserTeam{{Team: fleet.Team{ID: 1}, Role: fleet.RoleObserver}}},
|
|
teamID: nil,
|
|
shouldFailRunNew: true,
|
|
shouldFailRunObsCan: false,
|
|
shouldFailRunObsCannot: true,
|
|
},
|
|
{
|
|
name: "team observer, target not set to own team",
|
|
user: &fleet.User{ID: 48, Teams: []fleet.UserTeam{{Team: fleet.Team{ID: 1}, Role: fleet.RoleObserver}}},
|
|
teamID: ptr.Uint(2),
|
|
shouldFailRunNew: true,
|
|
shouldFailRunObsCan: true,
|
|
shouldFailRunObsCannot: true,
|
|
},
|
|
{
|
|
name: "team observer, target set to own team",
|
|
user: &fleet.User{ID: 48, Teams: []fleet.UserTeam{{Team: fleet.Team{ID: 1}, Role: fleet.RoleObserver}}},
|
|
teamID: ptr.Uint(1),
|
|
shouldFailRunNew: true,
|
|
shouldFailRunObsCan: false,
|
|
shouldFailRunObsCannot: true,
|
|
},
|
|
}
|
|
for _, tt := range testCases {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
ctx := viewer.NewContext(ctx, viewer.Viewer{User: tt.user})
|
|
|
|
var tms []uint
|
|
// Testing RunNew is tricky, because RunNew authorization is done, then
|
|
// the query is created, and then the Run authorization is applied to
|
|
// that now-existing query, so we have to make sure that the Run does not
|
|
// cause a Forbidden error. To this end, the ds.NewQuery mock always sets
|
|
// the AuthorID to the context user, and if the user is member of a team,
|
|
// always set that team as a host target. This will prevent the Run
|
|
// action from failing, if RunNew did succeed.
|
|
if len(tt.user.Teams) > 0 {
|
|
tms = []uint{tt.user.Teams[0].ID}
|
|
}
|
|
_, err := svc.NewDistributedQueryCampaign(ctx, query1ObsCanRun.Query, nil, fleet.HostTargets{TeamIDs: tms})
|
|
checkAuthErr(t, tt.shouldFailRunNew, err)
|
|
|
|
if tt.teamID != nil {
|
|
tms = []uint{*tt.teamID}
|
|
}
|
|
_, err = svc.NewDistributedQueryCampaign(ctx, query1ObsCanRun.Query, ptr.Uint(query1ObsCanRun.ID), fleet.HostTargets{TeamIDs: tms})
|
|
checkAuthErr(t, tt.shouldFailRunObsCan, err)
|
|
|
|
_, err = svc.NewDistributedQueryCampaign(ctx, query2ObsCannotRun.Query, ptr.Uint(query2ObsCannotRun.ID), fleet.HostTargets{TeamIDs: tms})
|
|
checkAuthErr(t, tt.shouldFailRunObsCannot, err)
|
|
|
|
// tests with a team target cannot run the "ByNames" calls, as there's no way
|
|
// to pass a team target with this call.
|
|
if tt.teamID == nil {
|
|
_, err = svc.NewDistributedQueryCampaignByIdentifiers(ctx, query1ObsCanRun.Query, nil, nil, nil)
|
|
checkAuthErr(t, tt.shouldFailRunNew, err)
|
|
|
|
_, err = svc.NewDistributedQueryCampaignByIdentifiers(ctx, query1ObsCanRun.Query, ptr.Uint(query1ObsCanRun.ID), nil, nil)
|
|
checkAuthErr(t, tt.shouldFailRunObsCan, err)
|
|
|
|
_, err = svc.NewDistributedQueryCampaignByIdentifiers(ctx, query2ObsCannotRun.Query, ptr.Uint(query2ObsCannotRun.ID), nil, nil)
|
|
checkAuthErr(t, tt.shouldFailRunObsCannot, err)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestLiveQueryLabelValidation(t *testing.T) {
|
|
ds := new(mock.Store)
|
|
qr := pubsub.NewInmemQueryResults()
|
|
svc, ctx := newTestService(t, ds, qr, nopLiveQuery{})
|
|
|
|
user := &fleet.User{GlobalRole: ptr.String(fleet.RoleAdmin)}
|
|
query := &fleet.Query{
|
|
ID: 1,
|
|
Name: "q1",
|
|
Query: "SELECT 1",
|
|
ObserverCanRun: true,
|
|
}
|
|
ds.NewQueryFunc = func(ctx context.Context, query *fleet.Query, opts ...fleet.OptionalArg) (*fleet.Query, error) {
|
|
query.ID = 123
|
|
return query, nil
|
|
}
|
|
ds.AppConfigFunc = func(ctx context.Context) (*fleet.AppConfig, error) {
|
|
return &fleet.AppConfig{ServerSettings: fleet.ServerSettings{LiveQueryDisabled: false}}, nil
|
|
}
|
|
ds.NewDistributedQueryCampaignFunc = func(ctx context.Context, camp *fleet.DistributedQueryCampaign) (*fleet.DistributedQueryCampaign, error) {
|
|
return camp, nil
|
|
}
|
|
ds.NewDistributedQueryCampaignTargetFunc = func(ctx context.Context, target *fleet.DistributedQueryCampaignTarget) (*fleet.DistributedQueryCampaignTarget, error) {
|
|
return target, nil
|
|
}
|
|
ds.HostIDsInTargetsFunc = func(ctx context.Context, filters fleet.TeamFilter, targets fleet.HostTargets) ([]uint, error) {
|
|
return []uint{1}, nil
|
|
}
|
|
ds.HostIDsByIdentifierFunc = func(ctx context.Context, filter fleet.TeamFilter, identifiers []string) ([]uint, error) {
|
|
return nil, nil
|
|
}
|
|
ds.CountHostsInTargetsFunc = func(ctx context.Context, filters fleet.TeamFilter, targets fleet.HostTargets, now time.Time) (fleet.TargetMetrics, error) {
|
|
return fleet.TargetMetrics{}, nil
|
|
}
|
|
ds.QueryFunc = func(ctx context.Context, id uint) (*fleet.Query, error) {
|
|
return query, nil
|
|
}
|
|
|
|
ds.LabelIDsByNameFunc = func(ctx context.Context, names []string, filter fleet.TeamFilter) (map[string]uint, error) {
|
|
return map[string]uint{"label1": uint(1)}, nil
|
|
}
|
|
|
|
testCases := []struct {
|
|
name string
|
|
labels []string
|
|
expectedError string
|
|
}{
|
|
{
|
|
name: "no labels",
|
|
labels: []string{},
|
|
expectedError: "",
|
|
},
|
|
{
|
|
name: "invalid label",
|
|
labels: []string{"iamnotalabel"},
|
|
expectedError: "Invalid label name(s): iamnotalabel.",
|
|
},
|
|
{
|
|
name: "valid label",
|
|
labels: []string{"label1"},
|
|
expectedError: "",
|
|
},
|
|
}
|
|
|
|
for _, tt := range testCases {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
ctx := viewer.NewContext(ctx, viewer.Viewer{User: user})
|
|
_, err := svc.NewDistributedQueryCampaignByIdentifiers(ctx, query.Query, nil, nil, tt.labels)
|
|
|
|
if tt.expectedError == "" {
|
|
require.Nil(t, err)
|
|
} else {
|
|
require.NotNil(t, err)
|
|
require.Contains(t, err.Error(), tt.expectedError)
|
|
}
|
|
})
|
|
}
|
|
}
|