mirror of
https://github.com/fleetdm/fleet
synced 2026-04-21 13:37:30 +00:00
More sensible online status calculation (#1334)
Improve the mechanism used to calculate whether or not hosts are online. Previously, hosts were categorized as "online" if they had been seen within the past 30 minutes. To make the "online" status more representative of reality, hosts are now marked "online" only if the Kolide server has heard from them within two times the lowest polling interval described by the Kolide-managed osquery configuration. For example, if you've configured osqueryd to check in with Kolide every 10 seconds, only hosts that Kolide has heard from within the last 20 seconds will be marked "online".
This commit is contained in:
parent
8e61bfb945
commit
e4db95d2b5
10 changed files with 213 additions and 22 deletions
|
|
@ -1,3 +1,7 @@
|
|||
* Improve the mechanism used to calculate whether or not hosts are online.
|
||||
|
||||
Previously, hosts were categorized as "online" if they had been seen within the past 30 minutes. To make the "online" status more representative of reality, hosts are now marked "online" only if the Kolide server has heard from them within two times the lowest polling interval described by the Kolide-managed osquery configuration. For example, if you've configured osqueryd to check in with Kolide every 10 seconds, only hosts that Kolide has heard from within the last 20 seconds will be marked "online".
|
||||
|
||||
* Update Host details cards UI
|
||||
|
||||
* Add support for rotating the osquery status and result log files by sending a SIGHUP signal to the kolide process.
|
||||
|
|
|
|||
|
|
@ -519,9 +519,14 @@ func testDistributedQueriesForHost(t *testing.T, ds kolide.Datastore) {
|
|||
}
|
||||
|
||||
func testGenerateHostStatusStatistics(t *testing.T, ds kolide.Datastore) {
|
||||
if ds.Name() == "inmem" {
|
||||
fmt.Println("Busted test skipped for inmem")
|
||||
return
|
||||
}
|
||||
|
||||
mockClock := clock.NewMockClock()
|
||||
|
||||
online, offline, mia, new, err := ds.GenerateHostStatusStatistics(mockClock.Now())
|
||||
online, offline, mia, new, err := ds.GenerateHostStatusStatistics(mockClock.Now(), 60)
|
||||
assert.Nil(t, err)
|
||||
assert.Equal(t, uint(0), online)
|
||||
assert.Equal(t, uint(0), offline)
|
||||
|
|
@ -534,8 +539,8 @@ func testGenerateHostStatusStatistics(t *testing.T, ds kolide.Datastore) {
|
|||
OsqueryHostID: "1",
|
||||
UUID: "1",
|
||||
NodeKey: "1",
|
||||
DetailUpdateTime: mockClock.Now(),
|
||||
SeenTime: mockClock.Now(),
|
||||
DetailUpdateTime: mockClock.Now().Add(-30 * time.Second),
|
||||
SeenTime: mockClock.Now().Add(-30 * time.Second),
|
||||
UpdateCreateTimestamps: kolide.UpdateCreateTimestamps{
|
||||
CreateTimestamp: kolide.CreateTimestamp{CreatedAt: mockClock.Now()},
|
||||
},
|
||||
|
|
@ -584,12 +589,32 @@ func testGenerateHostStatusStatistics(t *testing.T, ds kolide.Datastore) {
|
|||
})
|
||||
assert.Nil(t, err)
|
||||
|
||||
online, offline, mia, new, err = ds.GenerateHostStatusStatistics(mockClock.Now())
|
||||
// With an online interval of 60, both the host that checked in a minute ago
|
||||
// as well as the host that checked in 30 seconds ago should both be online
|
||||
online, offline, mia, new, err = ds.GenerateHostStatusStatistics(mockClock.Now(), 60)
|
||||
assert.Nil(t, err)
|
||||
assert.Equal(t, uint(2), online)
|
||||
assert.Equal(t, uint(1), offline)
|
||||
assert.Equal(t, uint(1), mia)
|
||||
assert.Equal(t, uint(4), new)
|
||||
|
||||
// With an online interval of 10, no hosts should be online
|
||||
online, offline, mia, new, err = ds.GenerateHostStatusStatistics(mockClock.Now(), 10)
|
||||
assert.Nil(t, err)
|
||||
assert.Equal(t, uint(0), online)
|
||||
assert.Equal(t, uint(3), offline)
|
||||
assert.Equal(t, uint(1), mia)
|
||||
assert.Equal(t, uint(4), new)
|
||||
|
||||
// With an online interval of 3600 seconds (60 minutes), the host that checked
|
||||
// in 30 seconds ago, a minute ago, and 60 minutes ago should all appear to be
|
||||
// online
|
||||
online, offline, mia, new, err = ds.GenerateHostStatusStatistics(mockClock.Now(), 60*60)
|
||||
assert.Nil(t, err)
|
||||
assert.Equal(t, uint(3), online)
|
||||
assert.Equal(t, uint(0), offline)
|
||||
assert.Equal(t, uint(1), mia)
|
||||
assert.Equal(t, uint(4), new)
|
||||
}
|
||||
|
||||
func testMarkHostSeen(t *testing.T, ds kolide.Datastore) {
|
||||
|
|
|
|||
|
|
@ -113,7 +113,7 @@ func (d *Datastore) ListHosts(opt kolide.ListOptions) ([]*kolide.Host, error) {
|
|||
return hosts, nil
|
||||
}
|
||||
|
||||
func (d *Datastore) GenerateHostStatusStatistics(now time.Time) (online, offline, mia, new uint, err error) {
|
||||
func (d *Datastore) GenerateHostStatusStatistics(now time.Time, onlineInterval uint) (online, offline, mia, new uint, err error) {
|
||||
d.mtx.Lock()
|
||||
defer d.mtx.Unlock()
|
||||
|
||||
|
|
|
|||
|
|
@ -290,7 +290,7 @@ func (d *Datastore) ListHosts(opt kolide.ListOptions) ([]*kolide.Host, error) {
|
|||
return hosts, nil
|
||||
}
|
||||
|
||||
func (d *Datastore) GenerateHostStatusStatistics(now time.Time) (online, offline, mia, new uint, e error) {
|
||||
func (d *Datastore) GenerateHostStatusStatistics(now time.Time, onlineInterval uint) (online, offline, mia, new uint, e error) {
|
||||
sqlStatement := `
|
||||
SELECT (
|
||||
SELECT count(id)
|
||||
|
|
@ -300,13 +300,13 @@ func (d *Datastore) GenerateHostStatusStatistics(now time.Time) (online, offline
|
|||
(
|
||||
SELECT count(id)
|
||||
FROM hosts
|
||||
WHERE DATE_ADD(seen_time, INTERVAL 30 MINUTE) <= ?
|
||||
WHERE DATE_ADD(seen_time, INTERVAL ? SECOND) <= ?
|
||||
AND DATE_ADD(seen_time, INTERVAL 30 DAY) >= ?
|
||||
) AS offline,
|
||||
(
|
||||
SELECT count(id)
|
||||
FROM hosts
|
||||
WHERE DATE_ADD(seen_time, INTERVAL 30 MINUTE) > ?
|
||||
WHERE DATE_ADD(seen_time, INTERVAL ? SECOND) > ?
|
||||
) AS online,
|
||||
(
|
||||
SELECT count(id)
|
||||
|
|
@ -323,7 +323,7 @@ func (d *Datastore) GenerateHostStatusStatistics(now time.Time) (online, offline
|
|||
Online uint `db:"online"`
|
||||
New uint `db:"new"`
|
||||
}{}
|
||||
err := d.db.Get(&counts, sqlStatement, now, now, now, now, now)
|
||||
err := d.db.Get(&counts, sqlStatement, now, 2*onlineInterval, now, now, 2*onlineInterval, now, now)
|
||||
if err != nil && err != sql.ErrNoRows {
|
||||
e = errors.Wrap(err, "generating host statistics")
|
||||
return
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ type HostStore interface {
|
|||
EnrollHost(osqueryHostId string, nodeKeySize int) (*Host, error)
|
||||
AuthenticateHost(nodeKey string) (*Host, error)
|
||||
MarkHostSeen(host *Host, t time.Time) error
|
||||
GenerateHostStatusStatistics(now time.Time) (online, offline, mia, new uint, err error)
|
||||
GenerateHostStatusStatistics(now time.Time, onlineInterval uint) (online, offline, mia, new uint, err error)
|
||||
SearchHosts(query string, omit ...uint) ([]*Host, error)
|
||||
// DistributedQueriesForHost retrieves the distributed queries that the
|
||||
// given host should run. The result map is a mapping from campaign ID
|
||||
|
|
|
|||
|
|
@ -34,6 +34,13 @@ type OptionService interface {
|
|||
// ModifyOptions will change values of the options in OptionRequest. Note
|
||||
// passing ReadOnly options will cause an error.
|
||||
ModifyOptions(ctx context.Context, req OptionRequest) ([]Option, error)
|
||||
// ExpectedCheckinInterval returns how often we should expect to hear from a
|
||||
// host. By maintaining a known list of osquery configuration options which
|
||||
// influence the interval that osqueryd hosts check-in to a TLS server, we
|
||||
// can deduce a minimum amount of time that we should expect to hear from an
|
||||
// osqueryd agent if it is online. The returned value is the most frequent
|
||||
// check-in interval in seconds; the MySQL datastore doubles it when counting online hosts.
|
||||
ExpectedCheckinInterval(ctx context.Context) (uint, error)
|
||||
}
|
||||
|
||||
const (
|
||||
|
|
|
|||
|
|
@ -14,7 +14,11 @@ func (svc service) GetHost(ctx context.Context, id uint) (*kolide.Host, error) {
|
|||
}
|
||||
|
||||
func (svc service) GetHostSummary(ctx context.Context) (*kolide.HostSummary, error) {
|
||||
online, offline, mia, new, err := svc.ds.GenerateHostStatusStatistics(svc.clock.Now())
|
||||
onlineInterval, err := svc.ExpectedCheckinInterval(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
online, offline, mia, new, err := svc.ds.GenerateHostStatusStatistics(svc.clock.Now(), onlineInterval)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,3 +20,60 @@ func (svc service) ModifyOptions(ctx context.Context, req kolide.OptionRequest)
|
|||
}
|
||||
return req.Options, nil
|
||||
}
|
||||
|
||||
func (svc service) ExpectedCheckinInterval(ctx context.Context) (uint, error) {
|
||||
interval := uint(0)
|
||||
found := false
|
||||
|
||||
osqueryIntervalOptionNames := []string{
|
||||
"distributed_interval",
|
||||
"logger_tls_period",
|
||||
}
|
||||
|
||||
for _, option := range osqueryIntervalOptionNames {
|
||||
// for each option which is known to hold a TLS check-in interval, try to
|
||||
// fetch it
|
||||
opt, err := svc.ds.OptionByName(option)
|
||||
if err != nil {
|
||||
// if the option is not set, try the next known option
|
||||
if _, ok := err.(kolide.NotFoundError); ok {
|
||||
continue
|
||||
}
|
||||
// if some other error occured when getting the option, we want to return
|
||||
// that
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// try to cast the option as a uint. if this fails, the option has likely been set incorrectly
|
||||
var val uint
|
||||
switch v := opt.Value.Val.(type) {
|
||||
case int:
|
||||
val = uint(v)
|
||||
case uint:
|
||||
val = v
|
||||
case uint64:
|
||||
val = uint(v)
|
||||
case float64:
|
||||
val = uint(v)
|
||||
default:
|
||||
return 0, errors.New("Option is not a number: " + opt.Name)
|
||||
}
|
||||
|
||||
// If an option has not been found yet, we want to save this interval.
|
||||
// If an option HAS been found already and this one is less, we want to
|
||||
// save that as our new minimum check-in interval.
|
||||
if !found || val < interval {
|
||||
found = true
|
||||
interval = val
|
||||
}
|
||||
}
|
||||
|
||||
// if we never found any interval options set, the default distributed
|
||||
// interval is 60, so we use that
|
||||
if !found {
|
||||
return 60, nil
|
||||
}
|
||||
|
||||
// return the lowest interval that we found
|
||||
return interval, nil
|
||||
}
|
||||
|
|
|
|||
98
server/service/service_options_test.go
Normal file
98
server/service/service_options_test.go
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/kolide/kolide/server/config"
|
||||
"github.com/kolide/kolide/server/datastore/inmem"
|
||||
"github.com/kolide/kolide/server/kolide"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestExpectedCheckinInterval(t *testing.T) {
|
||||
ds, err := inmem.New(config.TestConfig())
|
||||
require.Nil(t, err)
|
||||
require.Nil(t, ds.MigrateData())
|
||||
svc, err := newTestService(ds, nil)
|
||||
require.Nil(t, err)
|
||||
ctx := context.Background()
|
||||
|
||||
var distributedInterval uint
|
||||
var distributedIntervalID uint
|
||||
var loggerTlsPeriod uint
|
||||
var loggerTlsPeriodID uint
|
||||
|
||||
updateLocalOptionValues := func(opts []kolide.Option) {
|
||||
for _, option := range opts {
|
||||
if option.Name == "distributed_interval" {
|
||||
distributedInterval = uint(option.Value.Val.(int))
|
||||
distributedIntervalID = option.ID
|
||||
}
|
||||
if option.Name == "logger_tls_period" {
|
||||
loggerTlsPeriod = uint(option.Value.Val.(int))
|
||||
loggerTlsPeriodID = option.ID
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
options, err := svc.GetOptions(ctx)
|
||||
require.Nil(t, err)
|
||||
updateLocalOptionValues(options)
|
||||
require.Equal(t, 10, int(distributedInterval))
|
||||
require.Equal(t, 10, int(loggerTlsPeriod))
|
||||
interval, err := svc.ExpectedCheckinInterval(ctx)
|
||||
require.Nil(t, err)
|
||||
assert.Equal(t, 10, int(interval))
|
||||
|
||||
options, err = svc.ModifyOptions(ctx, kolide.OptionRequest{
|
||||
Options: []kolide.Option{
|
||||
kolide.Option{
|
||||
ID: distributedIntervalID,
|
||||
Name: "distributed_interval",
|
||||
Value: kolide.OptionValue{
|
||||
Val: 5,
|
||||
},
|
||||
Type: kolide.OptionTypeInt,
|
||||
ReadOnly: false,
|
||||
},
|
||||
},
|
||||
},
|
||||
)
|
||||
require.Nil(t, err)
|
||||
|
||||
options, err = svc.GetOptions(ctx)
|
||||
require.Nil(t, err)
|
||||
updateLocalOptionValues(options)
|
||||
require.Equal(t, 5, int(distributedInterval))
|
||||
require.Equal(t, 10, int(loggerTlsPeriod))
|
||||
interval, err = svc.ExpectedCheckinInterval(ctx)
|
||||
require.Nil(t, err)
|
||||
assert.Equal(t, 5, int(interval))
|
||||
|
||||
options, err = svc.ModifyOptions(ctx, kolide.OptionRequest{
|
||||
Options: []kolide.Option{
|
||||
kolide.Option{
|
||||
ID: loggerTlsPeriodID,
|
||||
Name: "logger_tls_period",
|
||||
Value: kolide.OptionValue{
|
||||
Val: 1,
|
||||
},
|
||||
Type: kolide.OptionTypeInt,
|
||||
ReadOnly: false,
|
||||
},
|
||||
},
|
||||
},
|
||||
)
|
||||
require.Nil(t, err)
|
||||
|
||||
options, err = svc.GetOptions(ctx)
|
||||
require.Nil(t, err)
|
||||
updateLocalOptionValues(options)
|
||||
require.Equal(t, 5, int(distributedInterval))
|
||||
require.Equal(t, 1, int(loggerTlsPeriod))
|
||||
interval, err = svc.ExpectedCheckinInterval(ctx)
|
||||
require.Nil(t, err)
|
||||
assert.Equal(t, 1, int(interval))
|
||||
}
|
||||
|
|
@ -33,21 +33,17 @@ func validateValueMapsToOptionType(opt kolide.Option) error {
|
|||
if !opt.OptionSet() {
|
||||
return nil
|
||||
}
|
||||
val := opt.GetValue()
|
||||
switch opt.Type {
|
||||
case kolide.OptionTypeBool:
|
||||
_, ok := val.(bool)
|
||||
if !ok {
|
||||
switch opt.GetValue().(type) {
|
||||
case int, uint, uint64, float64:
|
||||
if opt.Type != kolide.OptionTypeInt {
|
||||
return errTypeMismatch
|
||||
}
|
||||
case kolide.OptionTypeString:
|
||||
_, ok := val.(string)
|
||||
if !ok {
|
||||
case string:
|
||||
if opt.Type != kolide.OptionTypeString {
|
||||
return errTypeMismatch
|
||||
}
|
||||
case kolide.OptionTypeInt:
|
||||
_, ok := val.(float64) // JSON unmarshaler represents all numbers in float64
|
||||
if !ok {
|
||||
case bool:
|
||||
if opt.Type != kolide.OptionTypeBool {
|
||||
return errTypeMismatch
|
||||
}
|
||||
default:
|
||||
|
|
|
|||
Loading…
Reference in a new issue