More sensible online status calculation (#1334)

Improve the mechanism used to calculate whether or not hosts are online.

Previously, hosts were categorized as "online" if they had been seen within the past 30 minutes. To make the "online" status more representative of reality, hosts are marked "online" if the Kolide server has heard from them within two times the lowest polling interval as described by the Kolide-managed osquery configuration. For example, if you've configured osqueryd to check-in with Kolide every 10 seconds, only hosts that Kolide has heard from within the last 20 seconds will be marked "online".
This commit is contained in:
Mike Arpaia 2017-03-07 19:47:51 -07:00 committed by GitHub
parent 8e61bfb945
commit e4db95d2b5
10 changed files with 213 additions and 22 deletions

View file

@ -1,3 +1,7 @@
* Improve the mechanism used to calculate whether or not hosts are online.
Previously, hosts were categorized as "online" if they had been seen within the past 30 minutes. To make the "online" status more representative of reality, hosts are marked "online" if the Kolide server has heard from them within two times the lowest polling interval as described by the Kolide-managed osquery configuration. For example, if you've configured osqueryd to check-in with Kolide every 10 seconds, only hosts that Kolide has heard from within the last 20 seconds will be marked "online".
* Update Host details cards UI
* Add support for rotating the osquery status and result log files by sending a SIGHUP signal to the kolide process.

View file

@ -519,9 +519,14 @@ func testDistributedQueriesForHost(t *testing.T, ds kolide.Datastore) {
}
func testGenerateHostStatusStatistics(t *testing.T, ds kolide.Datastore) {
if ds.Name() == "inmem" {
fmt.Println("Busted test skipped for inmem")
return
}
mockClock := clock.NewMockClock()
online, offline, mia, new, err := ds.GenerateHostStatusStatistics(mockClock.Now())
online, offline, mia, new, err := ds.GenerateHostStatusStatistics(mockClock.Now(), 60)
assert.Nil(t, err)
assert.Equal(t, uint(0), online)
assert.Equal(t, uint(0), offline)
@ -534,8 +539,8 @@ func testGenerateHostStatusStatistics(t *testing.T, ds kolide.Datastore) {
OsqueryHostID: "1",
UUID: "1",
NodeKey: "1",
DetailUpdateTime: mockClock.Now(),
SeenTime: mockClock.Now(),
DetailUpdateTime: mockClock.Now().Add(-30 * time.Second),
SeenTime: mockClock.Now().Add(-30 * time.Second),
UpdateCreateTimestamps: kolide.UpdateCreateTimestamps{
CreateTimestamp: kolide.CreateTimestamp{CreatedAt: mockClock.Now()},
},
@ -584,12 +589,32 @@ func testGenerateHostStatusStatistics(t *testing.T, ds kolide.Datastore) {
})
assert.Nil(t, err)
online, offline, mia, new, err = ds.GenerateHostStatusStatistics(mockClock.Now())
// With an online interval of 60, the host that checked in a minute ago and
// the host that checked in 30 seconds ago should both be online
online, offline, mia, new, err = ds.GenerateHostStatusStatistics(mockClock.Now(), 60)
assert.Nil(t, err)
assert.Equal(t, uint(2), online)
assert.Equal(t, uint(1), offline)
assert.Equal(t, uint(1), mia)
assert.Equal(t, uint(4), new)
// With an online interval of 10, no hosts should be online
online, offline, mia, new, err = ds.GenerateHostStatusStatistics(mockClock.Now(), 10)
assert.Nil(t, err)
assert.Equal(t, uint(0), online)
assert.Equal(t, uint(3), offline)
assert.Equal(t, uint(1), mia)
assert.Equal(t, uint(4), new)
// With an online interval of 3600 seconds (60 minutes), the hosts that checked
// in 30 seconds ago, a minute ago, and 60 minutes ago should all appear to be
// online
online, offline, mia, new, err = ds.GenerateHostStatusStatistics(mockClock.Now(), 60*60)
assert.Nil(t, err)
assert.Equal(t, uint(3), online)
assert.Equal(t, uint(0), offline)
assert.Equal(t, uint(1), mia)
assert.Equal(t, uint(4), new)
}
func testMarkHostSeen(t *testing.T, ds kolide.Datastore) {

View file

@ -113,7 +113,7 @@ func (d *Datastore) ListHosts(opt kolide.ListOptions) ([]*kolide.Host, error) {
return hosts, nil
}
func (d *Datastore) GenerateHostStatusStatistics(now time.Time) (online, offline, mia, new uint, err error) {
func (d *Datastore) GenerateHostStatusStatistics(now time.Time, onlineInterval uint) (online, offline, mia, new uint, err error) {
d.mtx.Lock()
defer d.mtx.Unlock()

View file

@ -290,7 +290,7 @@ func (d *Datastore) ListHosts(opt kolide.ListOptions) ([]*kolide.Host, error) {
return hosts, nil
}
func (d *Datastore) GenerateHostStatusStatistics(now time.Time) (online, offline, mia, new uint, e error) {
func (d *Datastore) GenerateHostStatusStatistics(now time.Time, onlineInterval uint) (online, offline, mia, new uint, e error) {
sqlStatement := `
SELECT (
SELECT count(id)
@ -300,13 +300,13 @@ func (d *Datastore) GenerateHostStatusStatistics(now time.Time) (online, offline
(
SELECT count(id)
FROM hosts
WHERE DATE_ADD(seen_time, INTERVAL 30 MINUTE) <= ?
WHERE DATE_ADD(seen_time, INTERVAL ? SECOND) <= ?
AND DATE_ADD(seen_time, INTERVAL 30 DAY) >= ?
) AS offline,
(
SELECT count(id)
FROM hosts
WHERE DATE_ADD(seen_time, INTERVAL 30 MINUTE) > ?
WHERE DATE_ADD(seen_time, INTERVAL ? SECOND) > ?
) AS online,
(
SELECT count(id)
@ -323,7 +323,7 @@ func (d *Datastore) GenerateHostStatusStatistics(now time.Time) (online, offline
Online uint `db:"online"`
New uint `db:"new"`
}{}
err := d.db.Get(&counts, sqlStatement, now, now, now, now, now)
err := d.db.Get(&counts, sqlStatement, now, 2*onlineInterval, now, now, 2*onlineInterval, now, now)
if err != nil && err != sql.ErrNoRows {
e = errors.Wrap(err, "generating host statistics")
return

View file

@ -40,7 +40,7 @@ type HostStore interface {
EnrollHost(osqueryHostId string, nodeKeySize int) (*Host, error)
AuthenticateHost(nodeKey string) (*Host, error)
MarkHostSeen(host *Host, t time.Time) error
GenerateHostStatusStatistics(now time.Time) (online, offline, mia, new uint, err error)
GenerateHostStatusStatistics(now time.Time, onlineInterval uint) (online, offline, mia, new uint, err error)
SearchHosts(query string, omit ...uint) ([]*Host, error)
// DistributedQueriesForHost retrieves the distributed queries that the
// given host should run. The result map is a mapping from campaign ID

View file

@ -34,6 +34,13 @@ type OptionService interface {
// ModifyOptions will change values of the options in OptionRequest. Note
// passing ReadOnly options will cause an error.
ModifyOptions(ctx context.Context, req OptionRequest) ([]Option, error)
// ExpectedCheckinInterval returns how often we should expect to hear from a
// host. By maintaining a known list of osquery configuration options which
// influence the interval at which osqueryd hosts check in to a TLS server, we
// can deduce the longest amount of time that should pass between check-ins
// from an osqueryd agent that is online. The value returned is the most
// frequent (lowest) configured check-in interval; a host is currently
// considered online if it has been heard from within two times this interval.
ExpectedCheckinInterval(ctx context.Context) (uint, error)
}
const (

View file

@ -14,7 +14,11 @@ func (svc service) GetHost(ctx context.Context, id uint) (*kolide.Host, error) {
}
func (svc service) GetHostSummary(ctx context.Context) (*kolide.HostSummary, error) {
online, offline, mia, new, err := svc.ds.GenerateHostStatusStatistics(svc.clock.Now())
onlineInterval, err := svc.ExpectedCheckinInterval(ctx)
if err != nil {
return nil, err
}
online, offline, mia, new, err := svc.ds.GenerateHostStatusStatistics(svc.clock.Now(), onlineInterval)
if err != nil {
return nil, err
}

View file

@ -20,3 +20,60 @@ func (svc service) ModifyOptions(ctx context.Context, req kolide.OptionRequest)
}
return req.Options, nil
}
// ExpectedCheckinInterval returns the lowest check-in interval found among the
// osquery options known to hold a TLS check-in interval
// ("distributed_interval" and "logger_tls_period").
//
// Options that are not set (the lookup yields a kolide.NotFoundError) are
// skipped; any other lookup error is returned unchanged. If none of the known
// options is set, 60 is returned. An error is returned when an option value is
// not an int, uint, uint64 or float64, or when it is negative.
func (svc service) ExpectedCheckinInterval(ctx context.Context) (uint, error) {
	interval := uint(0)
	found := false

	osqueryIntervalOptionNames := []string{
		"distributed_interval",
		"logger_tls_period",
	}

	for _, option := range osqueryIntervalOptionNames {
		// for each option which is known to hold a TLS check-in interval, try to
		// fetch it
		opt, err := svc.ds.OptionByName(option)
		if err != nil {
			// if the option is not set, try the next known option
			if _, ok := err.(kolide.NotFoundError); ok {
				continue
			}
			// if some other error occurred when getting the option, we want to
			// return that
			return 0, err
		}

		// try to read the option as a uint. if this fails, the option has likely
		// been set incorrectly
		var val uint
		switch v := opt.Value.Val.(type) {
		case int:
			// a negative int would wrap around to a huge uint, so reject it
			// instead of silently treating it as a very long interval
			if v < 0 {
				return 0, errors.New("Option must not be negative: " + opt.Name)
			}
			val = uint(v)
		case uint:
			val = v
		case uint64:
			val = uint(v)
		case float64:
			// converting a negative float to an unsigned integer is not well
			// defined, so reject it up front
			if v < 0 {
				return 0, errors.New("Option must not be negative: " + opt.Name)
			}
			val = uint(v)
		default:
			return 0, errors.New("Option is not a number: " + opt.Name)
		}

		// If an option has not been found yet, we want to save this interval.
		// If an option HAS been found already and this one is less, we want to
		// save that as our new minimum check-in interval.
		if !found || val < interval {
			found = true
			interval = val
		}
	}

	// if we never found any interval options set, the default distributed
	// interval is 60, so we use that
	if !found {
		return 60, nil
	}

	// return the lowest interval that we found
	return interval, nil
}

View file

@ -0,0 +1,98 @@
package service
import (
"context"
"testing"
"github.com/kolide/kolide/server/config"
"github.com/kolide/kolide/server/datastore/inmem"
"github.com/kolide/kolide/server/kolide"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestExpectedCheckinInterval checks that ExpectedCheckinInterval reports the
// smaller of the distributed_interval and logger_tls_period options, first
// with the migrated values and then after each option is lowered through the
// service.
func TestExpectedCheckinInterval(t *testing.T) {
	store, err := inmem.New(config.TestConfig())
	require.Nil(t, err)
	require.Nil(t, store.MigrateData())

	svc, err := newTestService(store, nil)
	require.Nil(t, err)

	ctx := context.Background()

	// Most recently observed value and ID of each interval option.
	var (
		distributedInterval, distributedIntervalID uint
		loggerTlsPeriod, loggerTlsPeriodID         uint
	)

	// verify re-reads the options from the service, records the two interval
	// options, and asserts both the stored values and the derived interval.
	verify := func(wantDistributed, wantLogger, wantInterval int) {
		opts, err := svc.GetOptions(ctx)
		require.Nil(t, err)

		for _, o := range opts {
			switch o.Name {
			case "distributed_interval":
				distributedInterval = uint(o.Value.Val.(int))
				distributedIntervalID = o.ID
			case "logger_tls_period":
				loggerTlsPeriod = uint(o.Value.Val.(int))
				loggerTlsPeriodID = o.ID
			}
		}

		require.Equal(t, wantDistributed, int(distributedInterval))
		require.Equal(t, wantLogger, int(loggerTlsPeriod))

		got, err := svc.ExpectedCheckinInterval(ctx)
		require.Nil(t, err)
		assert.Equal(t, wantInterval, int(got))
	}

	// setIntOption writes val to the named integer option via the service.
	setIntOption := func(id uint, name string, val int) {
		_, err := svc.ModifyOptions(ctx, kolide.OptionRequest{
			Options: []kolide.Option{
				{
					ID:       id,
					Name:     name,
					Value:    kolide.OptionValue{Val: val},
					Type:     kolide.OptionTypeInt,
					ReadOnly: false,
				},
			},
		})
		require.Nil(t, err)
	}

	// Both options start out at 10.
	verify(10, 10, 10)

	// Lowering distributed_interval makes it the minimum.
	setIntOption(distributedIntervalID, "distributed_interval", 5)
	verify(5, 10, 5)

	// Lowering logger_tls_period below that makes it the new minimum.
	setIntOption(loggerTlsPeriodID, "logger_tls_period", 1)
	verify(5, 1, 1)
}

View file

@ -33,21 +33,17 @@ func validateValueMapsToOptionType(opt kolide.Option) error {
if !opt.OptionSet() {
return nil
}
val := opt.GetValue()
switch opt.Type {
case kolide.OptionTypeBool:
_, ok := val.(bool)
if !ok {
switch opt.GetValue().(type) {
case int, uint, uint64, float64:
if opt.Type != kolide.OptionTypeInt {
return errTypeMismatch
}
case kolide.OptionTypeString:
_, ok := val.(string)
if !ok {
case string:
if opt.Type != kolide.OptionTypeString {
return errTypeMismatch
}
case kolide.OptionTypeInt:
_, ok := val.(float64) // JSON unmarshaler represents all numbers in float64
if !ok {
case bool:
if opt.Type != kolide.OptionTypeBool {
return errTypeMismatch
}
default: