fleet/server/live_query/live_query_test.go

267 lines
7.7 KiB
Go
Raw Normal View History

package live_query
import (
"context"
"testing"
2024-08-26 16:32:57 +00:00
"time"
"github.com/fleetdm/fleet/v4/server/datastore/redis"
2021-06-26 04:46:51 +00:00
"github.com/fleetdm/fleet/v4/server/fleet"
redigo "github.com/gomodule/redigo/redis"
"github.com/stretchr/testify/assert"
2020-07-01 17:51:34 +00:00
"github.com/stretchr/testify/require"
)
var testFunctions = [...]func(*testing.T, fleet.LiveQueryStore){
2020-07-01 17:51:34 +00:00
testLiveQuery,
testLiveQueryNoTargets,
testLiveQueryStopQuery,
testLiveQueryExpiredQuery,
testLiveQueryOnlyExpired,
testLiveQueryCleanupInactive,
Fix lingering live queries keys in Redis (#33928) Resolves #33254 This can be reproduced locally by running the following "high load" test: Run 500 hosts using osquery-perf: ``` go run ./cmd/osquery-perf --enroll_secret ... \ --host_count 500 \ --server_url https://localhost:8080 \ --live_query_fail_prob 0.0 \ --live_query_no_results_prob 0.0 \ --orbit_prob 0.0 \ --http_message_signature_prob 0.0 ``` Run `stress_test_live_queries.sh`: ``` #!/bin/bash while true; do curl -v -k -X POST -H "Authorization: Bearer $TEST_TOKEN" https://localhost:8080/api/latest/fleet/queries/$SAVED_QUERY_ID/run -d '{"host_ids": [<500 comma-separated host ids>]}' done ``` Use "Redis Insight" or the like and you will start to see `livequery:{$CAMPAIGN_ID}` keys with `No limit` (which is the bug): <img width="1380" height="227" alt="Screenshot 2025-10-07 at 3 10 26 PM" src="https://github.com/user-attachments/assets/30434348-3217-40c4-8ebc-bab5ceb4daa9" /> - [X] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. ## Testing - [x] Added/updated automated tests - [x] QA'd all new/changed functionality manually <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - Bug Fixes - Prevent lingering Redis keys for live queries by ensuring keys are cleaned up and not recreated when completing/canceling non-existent queries. - Improves resource usage and avoids stale state in live query processing. - Tests - Added tests verifying proper retrieval/completion behavior and that no Redis key is created for non-existent live queries. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Co-authored-by: Ian Littman <iansltx@gmail.com>
2025-10-08 09:36:38 +00:00
testLiveQuerySetBitOnlyIfKeyExists,
2020-07-01 17:51:34 +00:00
}
func testLiveQuery(t *testing.T, store fleet.LiveQueryStore) {
2020-07-01 17:51:34 +00:00
queries, err := store.QueriesForHost(1)
assert.NoError(t, err)
assert.Len(t, queries, 0)
queries, err = store.QueriesForHost(3)
assert.NoError(t, err)
assert.Len(t, queries, 0)
assert.NoError(t, store.RunQuery("test", "select 1", []uint{1, 3}))
assert.NoError(t, store.RunQuery("test2", "select 2", []uint{3}))
assert.NoError(t, store.RunQuery("test3", "select 3", []uint{1}))
assert.NoError(t, store.RunQuery("test4", "select 4", []uint{4}))
queries, err = store.QueriesForHost(1)
assert.NoError(t, err)
assert.Equal(t,
map[string]string{
"test": "select 1",
"test3": "select 3",
},
queries,
)
queries, err = store.QueriesForHost(2)
assert.NoError(t, err)
assert.Len(t, queries, 0)
queries, err = store.QueriesForHost(3)
assert.NoError(t, err)
assert.Equal(t,
map[string]string{
"test": "select 1",
"test2": "select 2",
},
queries,
)
2020-07-01 17:51:34 +00:00
assert.NoError(t, store.QueryCompletedByHost("test", 1))
assert.NoError(t, store.QueryCompletedByHost("test2", 3))
queries, err = store.QueriesForHost(1)
assert.NoError(t, err)
assert.Equal(t,
map[string]string{
"test3": "select 3",
},
queries,
)
2020-07-01 17:51:34 +00:00
queries, err = store.QueriesForHost(2)
assert.NoError(t, err)
assert.Len(t, queries, 0)
queries, err = store.QueriesForHost(3)
assert.NoError(t, err)
assert.Equal(t,
map[string]string{
"test": "select 1",
},
queries,
)
}
2020-07-01 17:51:34 +00:00
func testLiveQueryNoTargets(t *testing.T, store fleet.LiveQueryStore) {
2020-07-01 17:51:34 +00:00
assert.Error(t, store.RunQuery("test", "select 1", []uint{}))
}
func testLiveQueryStopQuery(t *testing.T, store fleet.LiveQueryStore) {
2020-07-01 17:51:34 +00:00
require.NoError(t, store.RunQuery("test", "select 1", []uint{1, 3}))
require.NoError(t, store.RunQuery("test2", "select 2", []uint{1, 3}))
require.NoError(t, store.StopQuery("test"))
queries, err := store.QueriesForHost(1)
require.NoError(t, err)
assert.Len(t, queries, 1)
}
func testLiveQueryExpiredQuery(t *testing.T, store fleet.LiveQueryStore) {
oldModulo := cleanupExpiredQueriesModulo
cleanupExpiredQueriesModulo = 1 // run the cleanup each time
t.Cleanup(func() { cleanupExpiredQueriesModulo = oldModulo })
require.NoError(t, store.RunQuery("test", "select 1", []uint{1}))
// simulate a "test2" live query that has expired but is still in the set
pool := store.(*redisLiveQuery).pool
conn := redis.ConfigureDoer(pool, pool.Get())
defer conn.Close()
_, err := conn.Do("SADD", activeQueriesKey, "test2")
require.NoError(t, err)
queries, err := store.QueriesForHost(1)
require.NoError(t, err)
assert.Len(t, queries, 1)
assert.Equal(t, map[string]string{"test": "select 1"}, queries)
2024-08-29 16:03:45 +00:00
assert.Eventually(t, func() bool {
activeNames, err := redigo.Strings(conn.Do("SMEMBERS", activeQueriesKey))
require.NoError(t, err)
if len(activeNames) == 1 && activeNames[0] == "test" {
return true
}
return false
}, 5*time.Second, 100*time.Millisecond)
}
func testLiveQueryOnlyExpired(t *testing.T, store fleet.LiveQueryStore) {
oldModulo := cleanupExpiredQueriesModulo
cleanupExpiredQueriesModulo = 1 // run the cleanup each time
t.Cleanup(func() { cleanupExpiredQueriesModulo = oldModulo })
// simulate a "test" live query that has expired but is still in the set
pool := store.(*redisLiveQuery).pool
conn := redis.ConfigureDoer(pool, pool.Get())
defer conn.Close()
_, err := conn.Do("SADD", activeQueriesKey, "test")
require.NoError(t, err)
queries, err := store.QueriesForHost(1)
require.NoError(t, err)
assert.Len(t, queries, 0)
2024-08-26 16:32:57 +00:00
assert.Eventually(t, func() bool {
activeNames, err := redigo.Strings(conn.Do("SMEMBERS", activeQueriesKey))
require.NoError(t, err)
return len(activeNames) == 0
}, 5*time.Second, 100*time.Millisecond)
}
// testLiveQueryCleanupInactive starts five campaigns and checks that
// CleanupInactiveQueries removes exactly the campaigns whose IDs it is given,
// that passing nil removes nothing, and that StopQuery and cleanup together
// leave hosts with the expected remaining queries.
func testLiveQueryCleanupInactive(t *testing.T, store fleet.LiveQueryStore) {
	ctx := context.Background()

	// run a few live queries, making them active in Redis
	err := store.RunQuery("1", "SELECT 1", []uint{1, 2, 3})
	require.NoError(t, err)
	err = store.RunQuery("2", "SELECT 2", []uint{4})
	require.NoError(t, err)
	err = store.RunQuery("3", "SELECT 3", []uint{5, 6})
	require.NoError(t, err)
	err = store.RunQuery("4", "SELECT 4", []uint{1, 2, 5})
	require.NoError(t, err)
	err = store.RunQuery("5", "SELECT 5", []uint{2, 3, 7})
	require.NoError(t, err)

	activeNames, err := store.LoadActiveQueryNames()
	require.NoError(t, err)
	require.ElementsMatch(t, []string{"1", "2", "3", "4", "5"}, activeNames)

	// sanity-check that the queries are properly stored
	m, err := store.QueriesForHost(1)
	require.NoError(t, err)
	require.Equal(t, map[string]string{"1": "SELECT 1", "4": "SELECT 4"}, m)

	// simulate that only campaigns 2 and 4 are still active, cleanup the rest
	err = store.CleanupInactiveQueries(ctx, []uint{1, 3, 5})
	require.NoError(t, err)

	activeNames, err = store.LoadActiveQueryNames()
	require.NoError(t, err)
	require.ElementsMatch(t, []string{"2", "4"}, activeNames)

	m, err = store.QueriesForHost(1)
	require.NoError(t, err)
	require.Equal(t, map[string]string{"4": "SELECT 4"}, m)

	// explicitly mark campaign 4 as stopped
	err = store.StopQuery("4")
	require.NoError(t, err)

	// no more queries for host 1
	m, err = store.QueriesForHost(1)
	require.NoError(t, err)
	require.Empty(t, m)

	// only campaign 2 remains, for host 4
	m, err = store.QueriesForHost(4)
	require.NoError(t, err)
	require.Equal(t, map[string]string{"2": "SELECT 2"}, m)

	// simulate that there are no inactive campaigns to cleanup
	err = store.CleanupInactiveQueries(ctx, nil)
	require.NoError(t, err)

	activeNames, err = store.LoadActiveQueryNames()
	require.NoError(t, err)
	require.ElementsMatch(t, []string{"2"}, activeNames)

	// simulate that all campaigns are inactive, cleanup all
	err = store.CleanupInactiveQueries(ctx, []uint{1, 2, 3, 4, 5})
	require.NoError(t, err)

	activeNames, err = store.LoadActiveQueryNames()
	require.NoError(t, err)
	require.Empty(t, activeNames)

	m, err = store.QueriesForHost(4)
	require.NoError(t, err)
	require.Empty(t, m)
}
Fix lingering live queries keys in Redis (#33928) Resolves #33254 This can be reproduced locally by running the following "high load" test: Run 500 hosts using osquery-perf: ``` go run ./cmd/osquery-perf --enroll_secret ... \ --host_count 500 \ --server_url https://localhost:8080 \ --live_query_fail_prob 0.0 \ --live_query_no_results_prob 0.0 \ --orbit_prob 0.0 \ --http_message_signature_prob 0.0 ``` Run `stress_test_live_queries.sh`: ``` #!/bin/bash while true; do curl -v -k -X POST -H "Authorization: Bearer $TEST_TOKEN" https://localhost:8080/api/latest/fleet/queries/$SAVED_QUERY_ID/run -d '{"host_ids": [<500 comma-separated host ids>]}' done ``` Use "Redis Insight" or the like and you will start to see `livequery:{$CAMPAIGN_ID}` keys with `No limit` (which is the bug): <img width="1380" height="227" alt="Screenshot 2025-10-07 at 3 10 26 PM" src="https://github.com/user-attachments/assets/30434348-3217-40c4-8ebc-bab5ceb4daa9" /> - [X] Changes file added for user-visible changes in `changes/`, `orbit/changes/` or `ee/fleetd-chrome/changes`. ## Testing - [x] Added/updated automated tests - [x] QA'd all new/changed functionality manually <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - Bug Fixes - Prevent lingering Redis keys for live queries by ensuring keys are cleaned up and not recreated when completing/canceling non-existent queries. - Improves resource usage and avoids stale state in live query processing. - Tests - Added tests verifying proper retrieval/completion behavior and that no Redis key is created for non-existent live queries. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Co-authored-by: Ian Littman <iansltx@gmail.com>
2025-10-08 09:36:38 +00:00
// testLiveQuerySetBitOnlyIfKeyExists runs a campaign through completion for
// one host, then reports completion for a campaign name that was never
// started and checks directly in Redis that no key was created for it.
func testLiveQuerySetBitOnlyIfKeyExists(t *testing.T, store fleet.LiveQueryStore) {
	const hostID uint = 1

	// Start a campaign targeting the host.
	require.NoError(t, store.RunQuery("test", "SELECT 1;", []uint{hostID}))

	// The host must see the campaign's query.
	pending, err := store.QueriesForHost(hostID)
	require.NoError(t, err)
	require.Equal(t, map[string]string{"test": "SELECT 1;"}, pending)

	// The host reports the campaign as completed...
	require.NoError(t, store.QueryCompletedByHost("test", hostID))

	// ...after which the query is no longer handed out to that host.
	pending, err = store.QueriesForHost(hostID)
	require.NoError(t, err)
	require.Empty(t, pending)

	// A host may report a result for a campaign that does not exist (anymore);
	// this must succeed.
	require.NoError(t, store.QueryCompletedByHost("test-2", hostID))

	// Inspect Redis through a raw connection to confirm the completion above
	// did not create a key for the unknown campaign.
	redisPool := store.(*redisLiveQuery).pool
	rawConn := redis.ConfigureDoer(redisPool, redisPool.Get())
	t.Cleanup(func() {
		rawConn.Close()
	})

	reply, doErr := rawConn.Do("EXISTS", queryKeyPrefix+"{test-2}")
	found, err := redigo.Int(reply, doErr)
	require.NoError(t, err)
	require.Zero(t, found)
}